Example #1
def _terminate_if_down(scheduler_module, config, asg_name, instance_id,
                       max_wait):
    """Check that node is correctly attached to scheduler otherwise terminate the instance."""
    asg_client = boto3.client("autoscaling",
                              region_name=config.region,
                              config=config.proxy_config)

    @retry(wait_fixed=seconds(10),
           retry_on_result=lambda result: result is True,
           stop_max_delay=max_wait)
    def _poll_wait_for_node_ready():
        is_down = scheduler_module.is_node_down()
        if is_down:
            log.warning("Node reported as down")
        return is_down

    try:
        _poll_wait_for_node_ready()
    except RetryError:
        log.error(
            "Node is marked as down by scheduler or not attached correctly. Terminating..."
        )
        _dump_logs(instance_id)
        # jobwatcher already has the logic to request a new host in case of down nodes,
        # which is done in order to speed up cluster recovery.
        _self_terminate(
            asg_client,
            instance_id,
            decrement_desired=not _maintain_size(asg_name, asg_client))
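
The retry loop above appears to use the retrying library: retry_on_result keeps polling while the predicate holds, and a RetryError is raised once stop_max_delay expires. A minimal, self-contained sketch of the same pattern (the seconds helper is assumed to convert to the milliseconds that retrying expects):

from retrying import RetryError, retry

def seconds(n):
    return n * 1000  # retrying expresses delays in milliseconds

@retry(wait_fixed=seconds(1),
       retry_on_result=lambda result: result is True,
       stop_max_delay=seconds(5))
def _poll_condition():
    return True  # stand-in for scheduler_module.is_node_down()

try:
    _poll_condition()
except RetryError:
    print("condition never cleared before the deadline")
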
Example #2
    def run_remote_command(self,
                           command,
                           timeout=seconds(5),
                           log_error=True,
                           fail_on_error=True):
        """
        Execute remote command on the configured host.

        :param command: command to execute.
        :param log_error: log errors.
        :return: result of the execution.
        """
        if isinstance(command, list):
            command = " ".join(command)
        logging.info("Executing remote command command on {0}: {1}".format(
            self.__user_at_hostname, command))
        result = None
        try:
            stdin, stdout, stderr = self.__ssh_client.exec_command(
                command, get_pty=True)
            self._wait_for_command_execution(timeout, stdout)
            result = RemoteCommandResult(
                return_code=stdout.channel.recv_exit_status(),
                stdout="\n".join(stdout.read().decode().splitlines()),
                stderr="\n".join(stderr.read().decode().splitlines()),
            )
            if result.return_code != 0 and fail_on_error:
                raise RemoteCommandExecutionError(result)
            return result
        except Exception:
            if log_error and result:
                logging.error(
                    "Command {0} failed with error:\n{1}\nand output:\n{2}".
                    format(command, result.stderr, result.stdout))
            raise
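
exec_command(..., get_pty=True) returning an (stdin, stdout, stderr) triple suggests that __ssh_client is a paramiko SSHClient. A minimal sketch of the underlying call pattern, with a placeholder host and key path:

import paramiko

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect("host.example.com", username="ec2-user", key_filename="/path/to/key.pem")
stdin, stdout, stderr = client.exec_command("hostname", get_pty=True)
exit_code = stdout.channel.recv_exit_status()  # blocks until the remote command exits
print(exit_code, stdout.read().decode())
client.close()

Note that with get_pty=True the server typically merges stderr into stdout, which is why the stderr captured by the method above can come back empty.
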
Example #3
        messages = _retrieve_all_sqs_messages(queue)
        update_events = _parse_sqs_messages(messages, table)
        _process_sqs_messages(
            update_events,
            scheduler_module,
            sqs_config,
            table,
            queue,
            max_cluster_size,
            instance_properties,
            force_cluster_update,
        )
        time.sleep(LOOP_TIME)


@retry(wait_fixed=seconds(LOOP_TIME))
def main():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(module)s:%(funcName)s] %(message)s"
    )
    log.info("sqswatcher startup")

    try:
        config = _get_config()
        queue = _get_sqs_queue(config.region, config.sqsqueue,
                               config.proxy_config)
        table = _get_ddb_table(config.region, config.table_name,
                               config.proxy_config)
        asg_name = get_asg_name(config.stack_name, config.region,
                                config.proxy_config)
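
_get_sqs_queue and _get_ddb_table are not shown in the snippet; with boto3 they are plausibly thin wrappers along these lines (the bodies below are assumptions, not the project's actual code):

import boto3

def _get_sqs_queue(region, queue_name, proxy_config):
    # Assumed wrapper: look the queue up by name through the SQS resource API.
    sqs = boto3.resource("sqs", region_name=region, config=proxy_config)
    return sqs.get_queue_by_name(QueueName=queue_name)

def _get_ddb_table(region, table_name, proxy_config):
    # Assumed wrapper: a DynamoDB Table handle is created lazily, no API call here.
    ddb = boto3.resource("dynamodb", region_name=region, config=proxy_config)
    return ddb.Table(table_name)
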
Example #4
        _poll_wait_for_node_ready()
    except RetryError:
        log.error(
            "Node is marked as down by scheduler or not attached correctly. Terminating..."
        )
        _dump_logs(instance_id)
        # jobwatcher already has the logic to request a new host in case of down nodes,
        # which is done in order to speed up cluster recovery.
        _self_terminate(
            asg_client,
            instance_id,
            decrement_desired=not _maintain_size(asg_name, asg_client))


@retry(
    wait_exponential_multiplier=seconds(1),
    wait_exponential_max=seconds(10),
    retry_on_result=lambda result: result is False,
    stop_max_delay=minutes(10),
)
def _wait_for_stack_ready(stack_name, region, proxy_config):
    """
    Check whether the stack is in one of the *_COMPLETE states.

    :param stack_name: stack to query for
    :param region: AWS region
    :param proxy_config: proxy configuration
    :return: True if the stack is in a *_COMPLETE state
    """
    log.info("Waiting for stack %s to be ready", stack_name)
    cfn_client = boto3.client("cloudformation",
Example #5
            "Values": [stack_name]
        }])
        asg_name = response.get("Tags")[0].get("ResourceId")
        log.info("ASG %s found for the stack %s", asg_name, stack_name)
        return asg_name
    except IndexError:
        log.warning("Unable to get ASG for stack %s", stack_name)
        raise
    except Exception as e:
        raise CriticalError(
            "Unable to get ASG for stack {0}. Failed with exception: {1}".
            format(stack_name, e))


@retry(stop_max_attempt_number=5,
       wait_exponential_multiplier=seconds(0.5),
       wait_exponential_max=seconds(10))
def get_asg_settings(region, proxy_config, asg_name):
    try:
        asg_client = boto3.client("autoscaling",
                                  region_name=region,
                                  config=proxy_config)
        asg = asg_client.describe_auto_scaling_groups(
            AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0]
        min_size = asg.get("MinSize")
        desired_capacity = asg.get("DesiredCapacity")
        max_size = asg.get("MaxSize")

        log.info("ASG min/desired/max: %d/%d/%d" %
                 (min_size, desired_capacity, max_size))
        return min_size, desired_capacity, max_size
    except Exception as e:
        # Mirrors the CriticalError handling used by get_asg_name above.
        raise CriticalError(
            "Unable to get ASG settings. Failed with exception: {0}".format(e))


@retry(wait_fixed=seconds(10),
       retry_on_result=lambda result: result is False,
       stop_max_delay=minutes(10))
def _wait_for_stack_ready(stack_name, region, proxy_config):
    """
    Check whether the stack is in one of the *_COMPLETE states.

    :param stack_name: stack to query for
    :param region: AWS region
    :param proxy_config: proxy configuration
    :return: True if the stack is in a *_COMPLETE state
    """
    log.info("Waiting for stack %s to be ready", stack_name)
    cfn_client = boto3.client("cloudformation",
                              region_name=region,
                              config=proxy_config)
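
Example #5 opens mid-call: the "Values": [stack_name] fragment is the tail of an autoscaling describe_tags filter whose head was truncated. A plausible reconstruction of that call (the "Name" filter key is an assumption, not visible in the snippet):

        response = asg_client.describe_tags(Filters=[{
            "Name": "value",  # assumed filter key
            "Values": [stack_name]
        }])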