Example #1
def _retrieve_max_cluster_size(sqs_config, asg_name, fallback):
    try:
        _, _, max_size = get_asg_settings(sqs_config.region,
                                          sqs_config.proxy_config, asg_name)
        return max_size
    except Exception:
        return fallback
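
The get_asg_settings helper used above is not defined in these listings. Judging from its call sites, it takes (region, proxy_config, asg_name) and returns the (min, desired, max) sizes of the Auto Scaling group (some examples also pass a logger as a fourth argument). A minimal sketch under that assumption, using boto3's describe_auto_scaling_groups; proxy_config is assumed to be a botocore Config object, matching how it is passed to boto3.client elsewhere in these examples:

import boto3

def get_asg_settings(region, proxy_config, asg_name):
    # Sketch only: fetch the ASG and return the same (min, desired, max)
    # tuple that the callers above unpack.
    asg_client = boto3.client("autoscaling", region_name=region, config=proxy_config)
    asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])["AutoScalingGroups"][0]
    return asg["MinSize"], asg["DesiredCapacity"], asg["MaxSize"]
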
Example #2
def _poll_scheduler_status(config, asg_name, scheduler_module,
                           instance_properties):
    """
    Verify scheduler status and ask the ASG for new nodes, if required.

    :param config: JobwatcherConfig object
    :param asg_name: ASG name
    :param scheduler_module: scheduler module
    :param instance_properties: instance properties
    """
    while True:
        # Get number of nodes requested
        pending = scheduler_module.get_required_nodes(instance_properties)

        if pending < 0:
            log.critical(
                "Error detecting number of required nodes. The cluster will not scale up."
            )

        elif pending == 0:
            log.info("There are no pending jobs. Noop.")

        else:
            # Get current number of nodes
            running = scheduler_module.get_busy_nodes(instance_properties)
            log.info("%d nodes requested, %d nodes running", pending, running)

            # get current limits
            _, current_desired, max_size = get_asg_settings(
                config.region, config.proxy_config, asg_name, log)

            # Check to make sure requested number of instances is within ASG limits
            required = running + pending
            if required <= current_desired:
                log.info("%d nodes required, %d nodes in asg. Noop" %
                         (required, current_desired))
            else:
                if required > max_size:
                    log.info(
                        "The number of required nodes %d is greater than max %d. Requesting max %d."
                        % (required, max_size, max_size))
                else:
                    log.info(
                        "Setting desired to %d nodes, requesting %d more nodes from asg."
                        % (required, required - current_desired))
                requested = min(required, max_size)

                # update ASG
                asg_client = boto3.client('autoscaling',
                                          region_name=config.region,
                                          config=config.proxy_config)
                asg_client.update_auto_scaling_group(
                    AutoScalingGroupName=asg_name, DesiredCapacity=requested)

        time.sleep(60)
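
The scale-up decision above reduces to a small calculation: the cluster needs running + pending nodes, it is a no-op if the ASG's desired capacity already covers that, and otherwise the new desired capacity is capped at the ASG maximum. A hypothetical helper (not part of the original code) that isolates that logic:

def _compute_target_capacity(pending, running, current_desired, max_size):
    # Mirror of the decision in the loop above: scale only when more nodes
    # are needed, and never ask for more than the ASG maximum.
    required = running + pending
    if required <= current_desired:
        return None  # current desired capacity already covers the demand
    return min(required, max_size)

With such a helper, the branch above would set DesiredCapacity to _compute_target_capacity(pending, running, current_desired, max_size) whenever it returns a value.
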
Example #3
def _poll_scheduler_status(config, asg_name, scheduler_module, instance_properties):
    """
    Verify scheduler status and ask the ASG for new nodes, if required.

    :param config: JobwatcherConfig object
    :param asg_name: ASG name
    :param scheduler_module: scheduler module
    :param instance_properties: instance properties
    """
    while True:
        # Get number of nodes requested
        pending = scheduler_module.get_required_nodes(instance_properties)

        if pending < 0:
            log.critical("Error detecting number of required nodes. The cluster will not scale up.")

        elif pending == 0:
            log.info("There are no pending jobs. Noop.")

        else:
            # Get current number of nodes
            running = scheduler_module.get_busy_nodes(instance_properties)
            log.info("%d nodes requested, %d nodes running", pending, running)

            # get current limits
            _, current_desired, max_size = get_asg_settings(config.region, config.proxy_config, asg_name, log)

            # Check to make sure requested number of instances is within ASG limits
            required = running + pending
            if required <= current_desired:
                log.info("%d nodes required, %d nodes in asg. Noop" % (required, current_desired))
            else:
                if required > max_size:
                    log.info(
                        "The number of required nodes %d is greater than max %d. Requesting max %d."
                        % (required, max_size, max_size)
                    )
                else:
                    log.info(
                        "Setting desired to %d nodes, requesting %d more nodes from asg."
                        % (required, required - current_desired)
                    )
                requested = min(required, max_size)

                # update ASG
                asg_client = boto3.client('autoscaling', region_name=config.region, config=config.proxy_config)
                asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, DesiredCapacity=requested)

        time.sleep(60)
Example #4
def _poll_scheduler_status(config, asg_name, scheduler_module):
    """
    Verify scheduler status and ask the ASG for new nodes, if required.

    :param config: JobwatcherConfig object
    :param asg_name: ASG name
    :param scheduler_module: scheduler module
    """
    instance_type = None
    instance_properties = None
    update_instance_properties_timer = 0
    while True:
        # Get instance properties
        if not instance_properties or update_instance_properties_timer >= UPDATE_INSTANCE_PROPERTIES_INTERVAL:
            logging.info("Refreshing compute instance properties")
            update_instance_properties_timer = 0
            new_instance_type = get_compute_instance_type(
                config.region,
                config.proxy_config,
                config.stack_name,
                fallback=instance_type)
            if new_instance_type != instance_type:
                instance_type = new_instance_type
                instance_properties = get_instance_properties(
                    config.region, config.proxy_config, instance_type)
        update_instance_properties_timer += LOOP_TIME

        # get current limits
        _, current_desired, max_size = get_asg_settings(
            config.region, config.proxy_config, asg_name)

        # Get number of nodes requested
        pending = scheduler_module.get_required_nodes(instance_properties,
                                                      max_size)

        if pending < 0:
            log.critical(
                "Error detecting number of required nodes. The cluster will not scale up."
            )

        elif pending == 0:
            log.info(
                "There are no pending jobs or the requirements on pending jobs cannot be satisfied. Noop."
            )

        else:
            # Get current number of nodes
            running = scheduler_module.get_busy_nodes()
            log.info("%d nodes requested, %d nodes busy or unavailable",
                     pending, running)

            # Check to make sure requested number of instances is within ASG limits
            required = running + pending
            if required <= current_desired:
                log.info("%d nodes required, %d nodes in asg. Noop" %
                         (required, current_desired))
            else:
                if required > max_size:
                    log.info(
                        "The number of required nodes %d is greater than max %d. Requesting max %d."
                        % (required, max_size, max_size))
                else:
                    log.info(
                        "Setting desired to %d nodes, requesting %d more nodes from asg."
                        % (required, required - current_desired))
                requested = min(required, max_size)

                # update ASG
                asg_client = boto3.client("autoscaling",
                                          region_name=config.region,
                                          config=config.proxy_config)
                asg_client.update_auto_scaling_group(
                    AutoScalingGroupName=asg_name, DesiredCapacity=requested)

        time.sleep(LOOP_TIME)
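
Example #4 differs from the earlier versions mainly in how it refreshes the compute instance properties: the lookup runs on the first iteration and then only once the accumulated timer reaches UPDATE_INSTANCE_PROPERTIES_INTERVAL, with fallback=instance_type keeping the last known type if the lookup fails. A hypothetical refactoring of that refresh step into a standalone helper, with names and behavior taken directly from the code above:

def _maybe_refresh_instance_properties(config, instance_type, instance_properties, timer):
    # Refresh when there is no cached value yet or the refresh interval has
    # elapsed, exactly as the inline block in the loop above does.
    if not instance_properties or timer >= UPDATE_INSTANCE_PROPERTIES_INTERVAL:
        logging.info("Refreshing compute instance properties")
        timer = 0
        new_instance_type = get_compute_instance_type(
            config.region, config.proxy_config, config.stack_name, fallback=instance_type
        )
        if new_instance_type != instance_type:
            instance_type = new_instance_type
            instance_properties = get_instance_properties(config.region, config.proxy_config, instance_type)
    return instance_type, instance_properties, timer + LOOP_TIME
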
Example #5
def _poll_instance_status(config, scheduler_module, asg_name, hostname,
                          instance_id, instance_type):
    """
    Verify instance/scheduler status and self-terminate the instance.

    The instance will be terminated if it is not required and has exceeded the configured scaledown_idletime.
    :param config: NodewatcherConfig object
    :param scheduler_module: scheduler module
    :param asg_name: ASG name
    :param hostname: current hostname
    :param instance_id: current instance id
    :param instance_type: current instance type
    """
    _wait_for_stack_ready(config.stack_name, config.region,
                          config.proxy_config)
    _terminate_if_down(scheduler_module, config, asg_name, instance_id,
                       INITIAL_TERMINATE_TIMEOUT)

    idletime = _init_idletime()
    instance_properties = get_instance_properties(config.region,
                                                  config.proxy_config,
                                                  instance_type)
    start_time = None
    while True:
        sleep_remaining_loop_time(LOOP_TIME, start_time)
        start_time = datetime.now()

        max_cluster_size = _refresh_cluster_properties(config.region,
                                                       config.proxy_config,
                                                       asg_name)

        _store_idletime(idletime)
        _terminate_if_down(scheduler_module, config, asg_name, instance_id,
                           TERMINATE_TIMEOUT)

        has_jobs = _has_jobs(scheduler_module, hostname)
        if has_jobs:
            log.info("Instance has active jobs.")
            idletime = 0
        else:
            has_pending_jobs, error = scheduler_module.has_pending_jobs(
                instance_properties, max_cluster_size)
            if error:
                # In case of failure _terminate_if_down will take care of removing the node
                log.warning(
                    "Encountered an error while polling queue for pending jobs. Considering node as busy"
                )
                continue
            elif has_pending_jobs:
                log.info("Queue has pending jobs. Not terminating instance")
                idletime = 0
                continue

            try:
                min_size, desired_capacity, max_size = get_asg_settings(
                    config.region, config.proxy_config, asg_name)
            except Exception as e:
                logging.error(
                    "Failed when retrieving ASG settings with exception %s", e)
                continue

            if desired_capacity <= min_size:
                log.info("Not terminating due to min cluster size reached")
                idletime = 0
            else:
                idletime += 1
                log.info("Instance had no job for the past %s minute(s)",
                         idletime)

                if idletime >= config.scaledown_idletime:
                    _lock_and_terminate(config.region, config.proxy_config,
                                        scheduler_module, hostname,
                                        instance_id)
                    # _lock_and_terminate exits if termination is successful
                    # set idletime to 0 if termination is aborted
                    idletime = 0
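
sleep_remaining_loop_time is not shown in this example. Its call site at the top of the loop suggests it sleeps only for whatever is left of LOOP_TIME since the previous iteration started, so the loop period stays roughly constant regardless of how long the polling work took. A minimal sketch under that assumption:

import time
from datetime import datetime

def sleep_remaining_loop_time(total_loop_time, loop_start_time=None):
    # Assumed behavior: skip sleeping on the first iteration (no start time yet)
    # or when the loop body already took longer than the full period.
    if loop_start_time is None:
        return
    elapsed = (datetime.now() - loop_start_time).total_seconds()
    if elapsed < total_loop_time:
        time.sleep(total_loop_time - elapsed)
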
def _poll_instance_status(config, scheduler_module, asg_name, hostname,
                          instance_id, instance_type):
    """
    Verify instance/scheduler status and self-terminate the instance.

    The instance will be terminated if it is not required and has exceeded the configured scaledown_idletime.
    :param config: NodewatcherConfig object
    :param scheduler_module: scheduler module
    :param asg_name: ASG name
    :param hostname: current hostname
    :param instance_id: current instance id
    :param instance_type: current instance type
    """
    _wait_for_stack_ready(config.stack_name, config.region,
                          config.proxy_config)
    _terminate_if_down(scheduler_module, config, asg_name, instance_id,
                       INITIAL_TERMINATE_TIMEOUT)

    idletime = _init_idletime()
    instance_properties = get_instance_properties(config.region,
                                                  config.proxy_config,
                                                  instance_type)
    while True:
        time.sleep(60)
        _store_idletime(idletime)
        _terminate_if_down(scheduler_module, config, asg_name, instance_id,
                           TERMINATE_TIMEOUT)

        has_jobs = _has_jobs(scheduler_module, hostname)
        if has_jobs:
            log.info("Instance has active jobs.")
            idletime = 0
        else:
            asg_conn = boto3.client("autoscaling",
                                    region_name=config.region,
                                    config=config.proxy_config)
            if _maintain_size(asg_name, asg_conn):
                log.info("Not terminating due to min cluster size reached")
                idletime = 0
            else:
                _, _, max_size = get_asg_settings(config.region,
                                                  config.proxy_config,
                                                  asg_name)
                has_pending_jobs, error = scheduler_module.hasPendingJobs(
                    instance_properties, max_size)
                if error:
                    log.warning(
                        "Encountered an error while polling queue for pending jobs. Skipping pending jobs check"
                    )
                elif has_pending_jobs:
                    log.info(
                        "Queue has pending jobs. Not terminating instance")
                    idletime = 0
                    continue

                idletime += 1
                log.info("Instance had no job for the past %s minute(s)",
                         idletime)

                if idletime >= config.scaledown_idletime:
                    _lock_host(scheduler_module, hostname)
                    has_jobs = _has_jobs(scheduler_module, hostname)
                    if has_jobs:
                        log.info("Instance has active jobs.")
                        idletime = 0
                        _lock_host(scheduler_module, hostname, unlock=True)
                        continue

                    if _maintain_size(asg_name, asg_conn):
                        log.info(
                            "Not terminating due to min cluster size reached")
                        idletime = 0
                    else:
                        _self_terminate(asg_conn, instance_id)

                    _lock_host(scheduler_module, hostname, unlock=True)
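
_maintain_size and _self_terminate are also not defined in this listing. From their usage, _maintain_size presumably reports whether terminating a node would shrink the group below its minimum size, and _self_terminate removes the current instance from the ASG while decrementing the desired capacity. A hedged sketch of both, using the standard boto3 Auto Scaling API; the module-level log is the same logger used throughout these examples:

def _maintain_size(asg_name, asg_conn):
    # Sketch: keep the current size if the desired capacity is already at
    # (or below) the configured minimum.
    asg = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])["AutoScalingGroups"][0]
    return asg["DesiredCapacity"] <= asg["MinSize"]

def _self_terminate(asg_conn, instance_id):
    # Sketch: terminate this instance and shrink the desired capacity so the
    # ASG does not immediately launch a replacement.
    log.info("Self terminating instance %s", instance_id)
    asg_conn.terminate_instance_in_auto_scaling_group(
        InstanceId=instance_id, ShouldDecrementDesiredCapacity=True
    )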