def sanity_check(regions):
    """Cancel stale spot requests: open/failed requests with cancel-worthy
    status codes, and active requests older than 30 minutes whose instances
    are not running."""
    spot_requests = []
    for r in regions:
        conn = get_aws_connection(r)
        region_spot_requests = conn.get_all_spot_instance_requests()
        if region_spot_requests:
            spot_requests.extend(region_spot_requests)
    all_spot_instances = aws_get_all_instances(regions)
    instance_ids = [i.id for i in all_spot_instances]

    for req in spot_requests:
        if req.state in ["open", "failed"]:
            if req.status.code in CANCEL_STATUS_CODES:
                log.info("Cancelling request %s", req)
                retry_aws_request(req.add_tag, "moz-cancel-reason", req.status.code)
                req.cancel()
            elif req.status.code not in IGNORABLE_STATUS_CODES:
                log.error("Uknown status for request %s: %s", req,
                          req.status.code)
        # Cancel all active requests older than 30 mins without running instances
        elif req.state == "active" and \
                parse_aws_time(req.create_time) + 30 * 60 < time.time() and \
                req.instance_id not in instance_ids:
            log.info("Cancelling request %s: %s is not running", req,
                     req.instance_id)
            retry_aws_request(req.add_tag, "moz-cancel-reason", "no-running-instances")
            req.cancel()
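# ---------------------------------------------------------------------------
# sanity_check() above relies on a few helpers and constants that are not
# shown in this listing.  The sketch below is only an illustration of what
# they might look like: the names come from the code above, but the values
# and bodies here are assumptions, not the project's actual implementation.
import calendar
import logging
import time

log = logging.getLogger(__name__)

# Spot request status codes that mean the request will never be fulfilled and
# is safe to cancel (illustrative values only).
CANCEL_STATUS_CODES = ["capacity-oversubscribed", "price-too-low",
                       "capacity-not-available"]
# Status codes that are normal while a request is still being evaluated.
IGNORABLE_STATUS_CODES = ["pending-evaluation", "pending-fulfillment"]


def parse_aws_time(t):
    """Convert an AWS timestamp such as '2014-01-01T12:00:00.000Z' into
    epoch seconds so it can be compared against time.time()."""
    return calendar.timegm(time.strptime(t[:19], "%Y-%m-%dT%H:%M:%S"))


def retry_aws_request(fn, *args, **kwargs):
    """Call an AWS API method, retrying a few times on transient failures."""
    for attempt in range(3):
        try:
            return fn(*args, **kwargs)
        except Exception:
            log.warning("%s failed (attempt %i), retrying", fn, attempt + 1)
            time.sleep(2 ** attempt)
    # Tagging is best-effort; give up quietly after the retries.
    return None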
def aws_watch_pending(dburl, regions, builder_map, region_priorities,
                      spot_config, ondemand_config, dryrun,
                      latest_ami_percentage):
    """Find pending jobs in the db and start enough spot instances to cover
    them, falling back to resuming stopped on-demand instances for whatever
    is left over."""
    # First find pending jobs in the db
    pending = find_pending(dburl)

    if not pending:
        gr_log.add("pending", 0)
        log.debug("no pending jobs! all done!")
        return

    log.debug("processing %i pending jobs", len(pending))
    gr_log.add("pending", len(pending))

    # Mapping of (instance type, slaveset) to # of instances we want to
    # create
    # Map pending builder names to instance types
    pending_builder_map = map_builders(pending, builder_map)
    gr_log.add("aws_pending", sum(pending_builder_map.values()))
    if not pending_builder_map:
        log.debug("no pending jobs we can do anything about! all done!")
        return

    to_create_spot = pending_builder_map
    to_create_ondemand = defaultdict(int)

    # For each moz_instance_type, slaveset, find how many are currently
    # running, and scale our count accordingly
    all_instances = aws_get_all_instances(regions)
    cloudtools.graphite.generate_instance_stats(all_instances)

    # Reduce the requirements, pay attention to freshness and running instances
    to_delete = set()
    for (moz_instance_type, slaveset), count in to_create_spot.iteritems():
        running = filter_instances_by_slaveset(
            aws_get_running_instances(all_instances, moz_instance_type),
            slaveset)
        spot_running = filter_spot_instances(running)
        to_create_spot[moz_instance_type,
                       slaveset] = reduce_by_freshness(count, spot_running,
                                                       moz_instance_type,
                                                       slaveset)

        if to_create_spot[moz_instance_type, slaveset] == 0:
            log.debug("removing requirement for %s %s %s", "spot",
                      moz_instance_type, slaveset)
            to_delete.add((moz_instance_type, slaveset))

        # If slaveset is not None, and all our slaves are running, we should
        # remove it from the set of things to try and start instances for
        if slaveset and \
                slaveset.issubset(
                    set(i.tags.get('Name') for i in spot_running)):
            log.debug("removing %s %s since all the slaves are running",
                      moz_instance_type, slaveset)
            to_delete.add((moz_instance_type, slaveset))

    for moz_instance_type, slaveset in to_delete:
        del to_create_spot[moz_instance_type, slaveset]

    for (moz_instance_type, slaveset), count in to_create_spot.iteritems():
        log.debug("need %i spot %s for slaveset %s", count, moz_instance_type,
                  slaveset)
        # Cap by our global limits if applicable
        if spot_config and 'global' in spot_config.get('limits', {}):
            global_limit = spot_config['limits']['global'].get(
                moz_instance_type)
            # How many of this type of spot instance are running?
            n = len(
                filter_spot_instances(
                    aws_get_running_instances(all_instances,
                                              moz_instance_type)))
            log.debug("%i %s spot instances running globally", n,
                      moz_instance_type)
            if global_limit and n + count > global_limit:
                new_count = max(0, global_limit - n)
                log.debug(
                    "decreasing requested number of %s from %i to %i (%i out of %i running)",
                    moz_instance_type, count, new_count, n, global_limit)
                count = new_count
                if count <= 0:
                    continue

        started = request_spot_instances(
            all_instances,
            moz_instance_type=moz_instance_type,
            start_count=count,
            regions=regions,
            region_priorities=region_priorities,
            spot_config=spot_config,
            dryrun=dryrun,
            slaveset=slaveset,
            latest_ami_percentage=latest_ami_percentage)
        count -= started
        log.debug("%s - started %i spot instances for slaveset %s; need %i",
                  moz_instance_type, started, slaveset, count)
        gr_log.add("need.{moz_instance_type}.{jacuzzi_type}".format(
            moz_instance_type=moz_instance_type,
            jacuzzi_type=jacuzzi_suffix(slaveset)),
                   count,
                   collect=True)

        # Add leftover to ondemand
        to_create_ondemand[moz_instance_type, slaveset] += count

    for (moz_instance_type, slaveset), count in to_create_ondemand.iteritems():
        log.debug("need %i ondemand %s for slaveset %s", count,
                  moz_instance_type, slaveset)
        # Cap by our global limits if applicable
        if ondemand_config and 'global' in ondemand_config.get('limits', {}):
            global_limit = ondemand_config['limits']['global'].get(
                moz_instance_type)
            # How many of this type of ondemand instance are running?
            n = len(
                filter_ondemand_instances(
                    aws_get_running_instances(all_instances,
                                              moz_instance_type)))
            log.debug("%i %s ondemand instances running globally", n,
                      moz_instance_type)
            if global_limit and n + count > global_limit:
                new_count = max(0, global_limit - n)
                log.debug(
                    "decreasing requested number of %s from %i to %i (%i out of %i running)",
                    moz_instance_type, count, new_count, n, global_limit)
                count = new_count
                if count <= 0:
                    continue
        if count < 1:
            continue

        # Check for stopped instances in the given regions and start them if
        # there are any
        started = aws_resume_instances(all_instances, moz_instance_type, count,
                                       regions, region_priorities, dryrun,
                                       slaveset)
        count -= started
        log.debug("%s - started %i instances for slaveset %s; need %i",
                  moz_instance_type, started, slaveset, count)
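# ---------------------------------------------------------------------------
# The global-limit handling above repeats the same arithmetic for spot and
# ondemand instances: clamp the requested count so that running + requested
# never exceeds the configured ceiling.  A minimal standalone sketch of that
# rule (cap_by_global_limit is a hypothetical helper, not part of the code
# above):
def cap_by_global_limit(count, running, global_limit):
    """Return how many instances may still be requested.

    count        -- how many we would like to start
    running      -- how many of this type are already running globally
    global_limit -- configured ceiling, or None for "no limit"
    """
    if not global_limit:
        return count
    return min(count, max(0, global_limit - running))


# 40 requested with 95 of 100 already running leaves room for only 5.
assert cap_by_global_limit(40, 95, 100) == 5
# With no limit configured the request passes through untouched.
assert cap_by_global_limit(40, 95, None) == 40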
def aws_watch_pending(dburl, regions, builder_map, region_priorities,
                      spot_config, ondemand_config, dryrun, latest_ami_percentage):
    # First find pending jobs in the db
    pending = find_pending(dburl)

    if not pending:
        gr_log.add("pending", 0)
        log.debug("no pending jobs! all done!")
        return

    log.debug("processing %i pending jobs", len(pending))
    gr_log.add("pending", len(pending))

    # Mapping of instance types to # of instances we want to
    # create
    # Map pending builder names to instance types
    pending_builder_map = map_builders(pending, builder_map)
    gr_log.add("aws_pending", sum(pending_builder_map.values()))
    if not pending_builder_map:
        log.debug("no pending jobs we can do anything about! all done!")
        return

    to_create_spot = pending_builder_map
    to_create_ondemand = defaultdict(int)

    # For each moz_instance_type find how many are currently
    # running, and scale our count accordingly
    all_instances = aws_get_all_instances(regions)
    cloudtools.graphite.generate_instance_stats(all_instances)

    # Reduce the requirements, pay attention to freshness and running instances
    to_delete = set()
    for moz_instance_type, count in to_create_spot.iteritems():
        running = aws_get_running_instances(all_instances, moz_instance_type)
        spot_running = filter_spot_instances(running)
        to_create_spot[moz_instance_type] = reduce_by_freshness(
            count, spot_running, moz_instance_type)

        if to_create_spot[moz_instance_type] == 0:
            log.debug("removing requirement for %s %s", "spot",
                      moz_instance_type)
            to_delete.add(moz_instance_type)

    for moz_instance_type in to_delete:
        del to_create_spot[moz_instance_type]

    for moz_instance_type, count in to_create_spot.iteritems():
        log.debug("need %i spot %s", count, moz_instance_type)
        # Cap by our global limits if applicable
        if spot_config and 'global' in spot_config.get('limits', {}):
            global_limit = spot_config['limits']['global'].get(moz_instance_type)
            # How many of this type of spot instance are running?
            n = len(filter_spot_instances(aws_get_running_instances(all_instances, moz_instance_type)))
            log.debug("%i %s spot instances running globally", n, moz_instance_type)
            if global_limit and n + count > global_limit:
                new_count = max(0, global_limit - n)
                log.debug("decreasing requested number of %s from %i to %i (%i out of %i running)", moz_instance_type, count, new_count, n, global_limit)
                count = new_count
                if count <= 0:
                    continue

        started = request_spot_instances(
            all_instances,
            moz_instance_type=moz_instance_type, start_count=count,
            regions=regions, region_priorities=region_priorities,
            spot_config=spot_config, dryrun=dryrun,
            latest_ami_percentage=latest_ami_percentage)
        count -= started
        log.debug("%s - started %i spot instances; need %i",
                  moz_instance_type, started, count)

        # Add leftover to ondemand
        to_create_ondemand[moz_instance_type] += count

    for moz_instance_type, count in to_create_ondemand.iteritems():
        log.debug("need %i ondemand %s", count,
                  moz_instance_type)
        # Cap by our global limits if applicable
        if ondemand_config and 'global' in ondemand_config.get('limits', {}):
            global_limit = ondemand_config['limits']['global'].get(moz_instance_type)
            # How many of this type of ondemand instance are running?
            n = len(filter_ondemand_instances(aws_get_running_instances(all_instances, moz_instance_type)))
            log.debug("%i %s ondemand instances running globally", n, moz_instance_type)
            if global_limit and n + count > global_limit:
                new_count = max(0, global_limit - n)
                log.debug("decreasing requested number of %s from %i to %i (%i out of %i running)", moz_instance_type, count, new_count, n, global_limit)
                count = new_count
                if count <= 0:
                    continue
        if count < 1:
            continue

        # Check for stopped instances in the given regions and start them if
        # there are any
        started = aws_resume_instances(all_instances, moz_instance_type, count,
                                       regions, region_priorities,
                                       dryrun)
        count -= started
        log.debug("%s - started %i instances; need %i",
                  moz_instance_type, started, count)
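# ---------------------------------------------------------------------------
# reduce_by_freshness() is referenced above but not shown in this listing.
# The sketch below is only an illustrative guess at its contract: instances
# that were launched very recently ("fresh") are assumed to be about to pick
# up pending jobs, so they are subtracted from the number of new instances we
# still need.  FRESHNESS_WINDOW and the body are assumptions, not the real
# implementation.
import time

FRESHNESS_WINDOW = 10 * 60  # treat instances younger than 10 minutes as fresh


def reduce_by_freshness(count, running_instances, moz_instance_type):
    """Return how many instances still need to be started, assuming each
    freshly launched instance will absorb one pending job."""
    # moz_instance_type is kept to match the call above; this sketch does
    # not use it.
    now = time.time()
    fresh = [i for i in running_instances
             if now - parse_aws_time(i.launch_time) < FRESHNESS_WINDOW]
    return max(0, count - len(fresh))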