Example #1
def converge(desired_state, servers_with_cheese, load_balancer_contents, now,
             timeout=3600):
    """
    Create a :obj:`Convergence` that indicates how to transition from the state
    provided by the given parameters to the :obj:`DesiredGroupState` described
    by ``desired_state``.

    :param DesiredGroupState desired_state: The desired group state.
    :param list servers_with_cheese: a list of :obj:`NovaServer` instances.
        This must only contain servers that are being managed for the specified
        group.
    :param dict load_balancer_contents: a dictionary mapping load balancer IDs
        to lists of 2-tuples of (IP address, loadbalancer node ID).
    :param float now: number of seconds since the POSIX epoch indicating the
        time at which the convergence was requested.
    :param float timeout: Number of seconds after which we will delete a server
        in BUILD.

    :rtype: :obj:`Convergence`
    """
    newest_to_oldest = sorted(servers_with_cheese, key=lambda s: -s.created)
    servers_in_error, servers_in_active, servers_in_build = partition_groups(
        lambda s: s.state, newest_to_oldest, [ERROR, ACTIVE, BUILD])

    building_too_long, waiting_for_build = partition_bool(
        lambda server: now - server.created >= timeout,
        servers_in_build)

    create_server = CreateServer(launch_config=desired_state.launch_config)

    # delete any servers that have been building for too long
    delete_timeout_steps = [DeleteServer(server_id=server.id)
                            for server in building_too_long]

    # create servers
    create_steps = [create_server] * (desired_state.desired
                                      - (len(servers_in_active)
                                         + len(waiting_for_build)))

    # delete over capacity, starting with building, then active,
    # preferring older
    servers_to_delete = (servers_in_active + waiting_for_build)[desired_state.desired:]
    delete_steps = [DeleteServer(server_id=server.id)
                    for server in servers_to_delete]

    # delete all servers in error.
    delete_error_steps = [DeleteServer(server_id=server.id)
                          for server in servers_in_error]

    return Convergence(
        steps=pbag(create_steps
                   + delete_steps
                   + delete_error_steps
                   + delete_timeout_steps
                   ))
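
The partitioning helpers used above are defined elsewhere in the otter codebase and are not shown here. Below is a minimal sketch of what they appear to do, inferred only from the call sites in this example; the real implementations may differ.

def partition_groups(key_fn, seq, keys):
    """Split ``seq`` into one list per value in ``keys``, in that order."""
    buckets = {k: [] for k in keys}
    for item in seq:
        k = key_fn(item)
        if k in buckets:
            buckets[k].append(item)
    return tuple(buckets[k] for k in keys)


def partition_bool(pred, seq):
    """Split ``seq`` into (items satisfying ``pred``, items that do not)."""
    satisfying, not_satisfying = [], []
    for item in seq:
        (satisfying if pred(item) else not_satisfying).append(item)
    return satisfying, not_satisfying
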
Example #2
def _remove_from_lb_with_draining(timeout, nodes, now):
    """
    Produce a series of steps that will eventually remove all the given nodes.

    For any particular node in ``nodes``, one of three things happens:

    1. If the timeout is greater than zero, and the node is ``ENABLED``, the
       node will be changed to ``DRAINING``.

    2. If the node is ``DRAINING``, and the timeout (greater than zero) has
       already expired or there are no more active connections, the node will
       be removed from the load balancer.  If the timeout (greater than zero)
       has not expired and active connections != 0, then nothing is done to the
       node.

    3. If the node is in any state other than ``DRAINING`` or ``ENABLED``, or
       if the timeout is zero, it will be removed from the load balancer.

    :param float timeout: number of seconds a node should remain in
        ``DRAINING`` before being removed
    :param list nodes: `list` of :obj:`CLBNode` that should be
        drained, then removed
    :param float now: number of seconds since the POSIX epoch indicating the
        time at which the convergence was requested.

    :rtype: `list` of :class:`IStep`
    """
    to_drain = ()
    in_drain = ()

    # only put nodes into draining if a timeout is specified
    if timeout > 0:
        draining, to_drain = partition_bool(
            lambda node: node.currently_draining(),
            [node for node in nodes
             if IDrainable.providedBy(node) and node.is_active()])

        # Nothing should be done to these, because the timeout has not expired
        # and the nodes are still active
        in_drain = [node for node in draining
                    if not node.is_done_draining(now, timeout)]

    removes = [remove_node_from_lb(node=node)
               for node in (set(nodes) - set(to_drain) - set(in_drain))]

    changes = [drain_lb_node(node=node) for node in to_drain]

    retry = (
        [ConvergeLater(reasons=[ErrorReason.String('draining servers')])]
        if in_drain else [])

    return removes + changes + retry
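
For readability, the three cases in the docstring can be restated as a single per-node decision. The following is an illustration only, not otter code; ``drainable`` stands in for the original's ``IDrainable.providedBy(node) and node.is_active()`` check.

def _action_for_node(node, drainable, timeout, now):
    """Return 'drain', 'wait', or 'remove' for one load-balancer node."""
    draining_allowed = timeout > 0 and drainable
    if draining_allowed and not node.currently_draining():
        return 'drain'   # case 1: ENABLED node is put into DRAINING
    if draining_allowed and not node.is_done_draining(now, timeout):
        return 'wait'    # case 2: still draining with active connections
    return 'remove'      # case 2 (expired/drained), case 3, or timeout == 0
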
Example #3
def unchanged_divergent_groups(clock, current, timeout, group_metrics):
    """
    Return the updated tracking dict along with the list of ``GroupMetrics``
    that have been divergent and unchanged for ``timeout`` seconds

    :param IReactorTime clock: Twisted reactor clock used to get the current
        time
    :param dict current: Currently tracked divergent groups
    :param float timeout: Timeout in seconds
    :param list group_metrics: List of group metrics

    :return: (updated current, List of (group, divergent_time) tuples)
    """
    converged, diverged = partition_bool(
        lambda gm: gm.actual + gm.pending == gm.desired, group_metrics)
    # stop tracking all converged and deleted groups
    deleted = set(current.keys()) - metrics_set(group_metrics)
    updated = current.copy()
    for g in metrics_set(converged) | deleted:
        updated.pop(g, None)
    # Start tracking divergent groups depending on whether they've changed
    now = clock.seconds()
    to_log, new = [], {}
    for gm in diverged:
        pair = (gm.tenant_id, gm.group_id)
        if pair in updated:
            last_time, values = updated[pair]
            if values != hash((gm.desired, gm.actual, gm.pending)):
                del updated[pair]
                continue
            time_diff = now - last_time
            if time_diff > timeout and time_diff % timeout <= 60:
                # log on intervals of timeout. For example, if timeout is 1 hr
                # then log every hour it remains diverged
                to_log.append((gm, time_diff))
        else:
            new[pair] = now, hash((gm.desired, gm.actual, gm.pending))
    return merge(updated, new), to_log
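
``metrics_set`` and ``merge`` are helpers from the surrounding codebase. Judging only from how they are called here, ``metrics_set`` builds the set of (tenant_id, group_id) pairs for a list of metrics, and ``merge`` combines two dicts with later keys winning (``toolz.merge`` behaves this way). An assumed sketch, not otter's actual code:

def metrics_set(metrics):
    """Set of (tenant_id, group_id) pairs for the given group metrics."""
    return {(gm.tenant_id, gm.group_id) for gm in metrics}


def merge(*dicts):
    """Merge dicts left to right, later keys winning (like ``toolz.merge``)."""
    result = {}
    for d in dicts:
        result.update(d)
    return result
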
Example #4
def converge_launch_server(desired_state, servers_with_cheese,
                           load_balancer_nodes, load_balancers,
                           now, timeout=3600):
    """
    Create steps that indicate how to transition from the state provided
    by the given parameters to the :obj:`DesiredServerGroupState` described by
    ``desired_state``.

    :param DesiredServerGroupState desired_state: The desired group state.
    :param set servers_with_cheese: a set of :obj:`NovaServer` instances.
        This must only contain servers that are being managed for the specified
        group.
    :param set load_balancer_nodes: a set of :obj:`ILBNode` providers. This
        must contain all the load balancer mappings for all the load balancers
        (of all types) on the tenant.
    :param dict load_balancers: collection of load balancer objects keyed by
        load balancer ID. The objects are opaque and are not used by the
        planner directly; they are intended to carry extra information for
        specific load balancer providers.
    :param float now: number of seconds since the POSIX epoch indicating the
        time at which the convergence was requested.
    :param float timeout: Number of seconds after which we will delete a server
        in BUILD.
    :rtype: :obj:`pbag` of `IStep`

    """
    newest_to_oldest = sorted(servers_with_cheese, key=lambda s: -s.created)

    servers = defaultdict(lambda: [], groupby(get_destiny, newest_to_oldest))
    servers_in_active = servers[Destiny.CONSIDER_AVAILABLE]

    building_too_long, waiting_for_build = partition_bool(
        lambda server: now - server.created >= timeout,
        servers[Destiny.WAIT_WITH_TIMEOUT])

    create_server = CreateServer(server_config=desired_state.server_config)

    # delete any servers that have been building for too long
    delete_timeout_steps = [DeleteServer(server_id=server.id)
                            for server in building_too_long]

    # create servers
    create_steps = [create_server] * (
        desired_state.capacity - (
            len(servers_in_active) +
            len(waiting_for_build) +
            len(servers[Destiny.WAIT]) +
            len(servers[Destiny.AVOID_REPLACING])))

    # Scale down over capacity, starting with building, then WAIT, then
    # AVOID_REPLACING, then active, preferring older.  Also, finish
    # draining/deleting servers already in draining state
    servers_in_preferred_order = (
        servers_in_active +
        servers[Destiny.AVOID_REPLACING] +
        servers[Destiny.WAIT] +
        waiting_for_build)
    servers_to_delete = servers_in_preferred_order[desired_state.capacity:]

    def drain_and_delete_a_server(server):
        return _drain_and_delete(
            server,
            desired_state.draining_timeout,
            [node for node in load_balancer_nodes if node.matches(server)],
            now)

    try:
        scale_down_steps = list(
                mapcat(drain_and_delete_a_server,
                       servers_to_delete + servers[Destiny.DRAIN]))
    except DrainingUnavailable as de:
        return pbag([fail_convergence(de)])

    # delete all servers in error - draining does not need to be
    # handled because servers in error presumably are not serving
    # traffic anyway
    delete_error_steps = [DeleteServer(server_id=server.id)
                          for server in servers[Destiny.DELETE]]

    # clean up all the load balancers from deleted and errored servers
    cleanup_errored_and_deleted_steps = [
        remove_node_from_lb(lb_node)
        for server in servers[Destiny.DELETE] + servers[Destiny.CLEANUP]
        for lb_node in load_balancer_nodes if lb_node.matches(server)]

    # converge all the servers that remain to their desired load balancer state
    still_active_servers = filter(lambda s: s not in servers_to_delete,
                                  servers_in_active)
    try:
        lb_converge_steps = [
            step
            for server in still_active_servers
            for step in _converge_lb_state(
                server,
                [node for node in load_balancer_nodes if node.matches(server)],
                load_balancers,
                now,
                # Temporarily using build timeout as node offline timeout.
                # See https://github.com/rackerlabs/otter/issues/1905
                timeout)
            ]
    except DrainingUnavailable as de:
        return pbag([fail_convergence(de)])

    # Converge again if we expect state transitions on any servers
    converge_later = []
    if any((s not in servers_to_delete
            for s in waiting_for_build)):
        converge_later = [
            ConvergeLater(reasons=[ErrorReason.String('waiting for servers')])]

    unavail_fmt = ('Waiting for server {server_id} to transition to ACTIVE '
                   'from {status}')
    reasons = [ErrorReason.UserMessage(unavail_fmt.format(server_id=s.id,
                                                          status=s.state.name))
               for s in servers[Destiny.WAIT] if s not in servers_to_delete]
    if reasons:
        converge_later.append(ConvergeLater(limited=True, reasons=reasons))

    return pbag(create_steps +
                scale_down_steps +
                delete_error_steps +
                cleanup_errored_and_deleted_steps +
                delete_timeout_steps +
                lb_converge_steps +
                converge_later)
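
The create-count arithmetic above can be illustrated on its own: servers that have been building longer than the timeout are excluded from the current-capacity count, so they are both deleted and replaced. A standalone illustration (not otter code, using hypothetical names):

def servers_to_create(capacity, active, waiting_for_build, waiting,
                      avoid_replacing):
    """How many CreateServer steps the plan above would emit."""
    return capacity - (active + waiting_for_build + waiting + avoid_replacing)

# Desired capacity 5, with 3 ACTIVE servers and 1 still building within the
# timeout: one more server is created. A server building past the timeout
# would not be counted here, so it would be deleted and replaced.
assert servers_to_create(5, active=3, waiting_for_build=1, waiting=0,
                         avoid_replacing=0) == 1

# A negative result produces no create steps, since multiplying a list by a
# negative number yields an empty list.
assert servers_to_create(2, active=3, waiting_for_build=1, waiting=0,
                         avoid_replacing=0) == -2

The resulting steps are collected into a ``pbag`` (pyrsistent's persistent bag), so the ordering of the concatenated step lists is not significant.
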