Example #1
    def verify(_):
        def check_status():
            check_d = treq.head(
                append_segments(server_endpoint, 'servers', server_id),
                headers=headers(auth_token))
            check_d.addCallback(check_success, [404])
            return check_d

        start_time = clock.seconds()

        # this is treating all errors as transient, so the only error that can
        # occur is a CancelledError from timing out
        verify_d = retry_and_timeout(check_status, timeout,
                                     next_interval=repeating_interval(interval),
                                     clock=clock)

        def on_success(_):
            time_delete = clock.seconds() - start_time
            del_log.msg('Server deleted successfully: {time_delete} seconds.',
                        time_delete=time_delete)

        verify_d.addCallback(on_success)

        def on_timeout(_):
            time_delete = clock.seconds() - start_time
            del_log.err(None, timeout=timeout, time_delete=time_delete,
                        why=('Server {instance_id} failed to be deleted within '
                             'a {timeout} second timeout (it has been '
                             '{time_delete} seconds).'))

        verify_d.addErrback(on_timeout)
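
Throughout these examples, retry_and_timeout combines polling with a hard deadline. As a rough, hedged sketch, such a helper could be composed from retry plus a deferred timeout; the timeout_deferred helper and the exact signature below are assumptions, not confirmed API.

def retry_and_timeout(do_step, timeout, can_retry=None, next_interval=None,
                      clock=None, deferred_description=None):
    # Keep calling do_step according to can_retry/next_interval, and cancel
    # the resulting Deferred if it has not fired within `timeout` seconds.
    d = retry(do_step, can_retry=can_retry, next_interval=next_interval,
              clock=clock)
    timeout_deferred(d, timeout, clock, deferred_description)  # assumed helper
    return d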
Example #2
    def wait_for_stack_list(self, expected_states, timeout=180, period=10):
        def check(content):
            states = pbag([s['stack_status'] for s in content['stacks']])
            if not (states == expected_states):
                msg("Waiting for group {} to reach desired group state.\n"
                    "{} (actual) {} (expected)".format(self.group.group_id,
                                                       states,
                                                       expected_states))
                raise TransientRetryError(
                    "Group states of {} did not match expected {})".format(
                        states, expected_states))

            msg("Success: desired group state reached:\n{}".format(
                expected_states))
            return self.rcs

        def poll():
            return self.get_stack_list().addCallback(check)

        expected_states = pbag(expected_states)

        return retry_and_timeout(
            poll,
            timeout,
            can_retry=terminal_errors_except(TransientRetryError),
            next_interval=repeating_interval(period),
            clock=reactor,
            deferred_description=(
                "Waiting for group {} to reach state {}".format(
                    self.group.group_id, str(expected_states))))
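
TransientRetryError is used here purely as a marker: raising it from check tells the retry machinery that the desired condition is not met yet and polling should continue. A minimal sketch of such a marker exception (an assumption; the real class may carry more behaviour):

class TransientRetryError(Exception):
    """Marker exception: the desired state has not been reached, poll again."""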
Example #3
    def wait_for_state(self, rcs, matcher, timeout=600, period=10, clock=None):
        """
        Wait for the state of the scaling group to match the provided matcher.

        :param rcs: a :class:`otter.integration.lib.resources.TestResources`
            instance
        :param matcher: A :mod:`testtools.matchers` matcher, as documented at
            http://testtools.readthedocs.org/en/latest/api.html.
        :param timeout: The amount of time to wait until this step is
            considered failed.
        :param period: How long to wait before polling again.
        :param clock: a :class:`twisted.internet.interfaces.IReactorTime`
            provider

        :return: None, if the state is reached
        :raises: :class:`TimedOutError` if the state is never reached within
            the requisite amount of time.

        Example usage:

        ```
        matcher = MatchesAll(
            IncludesServers(included_server_ids),
            ExcludesServers(exclude_server_ids),
            ContainsDict({
                'pending': Equals(0),
                'desired': Equals(5),
                'status': Equals('ACTIVE')
            })
        )

        ..wait_for_state(rcs, matcher, timeout=60)
        ```
        """
        def check(result):
            response, group_state = result
            mismatch = matcher.match(group_state['group'])
            if mismatch:
                msg("Waiting for group {} to reach desired group state.\n"
                    "Mismatch: {}"
                    .format(self.group_id, mismatch.describe()))
                raise TransientRetryError(mismatch.describe())
            msg("Success: desired group state reached:\n{}\nmatches:\n{}"
                .format(group_state['group'], matcher))
            return rcs

        def poll():
            return self.get_scaling_group_state(rcs, [200]).addCallback(check)

        return retry_and_timeout(
            poll, timeout,
            can_retry=terminal_errors_except(TransientRetryError),
            next_interval=repeating_interval(period),
            clock=clock or reactor,
            deferred_description=(
                "Waiting for group {} to reach state {}"
                .format(self.group_id, str(matcher)))
        )
Example #4
    def wait_for_stack_list(self, expected_states, timeout=180, period=10):
        def check(content):
            states = pbag([s['stack_status'] for s in content['stacks']])
            if not (states == expected_states):
                msg("Waiting for group {} to reach desired group state.\n"
                    "{} (actual) {} (expected)"
                    .format(self.group.group_id, states, expected_states))
                raise TransientRetryError(
                    "Group states of {} did not match expected {})"
                    .format(states, expected_states))

            msg("Success: desired group state reached:\n{}"
                .format(expected_states))
            return self.rcs

        def poll():
            return self.get_stack_list().addCallback(check)

        expected_states = pbag(expected_states)

        return retry_and_timeout(
            poll, timeout,
            can_retry=terminal_errors_except(TransientRetryError),
            next_interval=repeating_interval(period),
            clock=reactor,
            deferred_description=(
                "Waiting for group {} to reach state {}".format(
                    self.group.group_id, str(expected_states))))
Example #5
    def verify(_):
        def check_status():
            check_d = treq.head(
                append_segments(server_endpoint, 'servers', server_id),
                headers=headers(auth_token))
            check_d.addCallback(check_success, [404])
            return check_d

        start_time = clock.seconds()

        timeout_description = (
            "Waiting for Nova to actually delete server {0}".format(server_id))

        verify_d = retry_and_timeout(check_status, timeout,
                                     next_interval=repeating_interval(interval),
                                     clock=clock,
                                     deferred_description=timeout_description)

        def on_success(_):
            time_delete = clock.seconds() - start_time
            del_log.msg('Server deleted successfully: {time_delete} seconds.',
                        time_delete=time_delete)

        verify_d.addCallback(on_success)
        verify_d.addErrback(del_log.err)
Example #6
    def wait_for_state(self, rcs, matcher, timeout=600, period=10, clock=None):
        """
        Wait for the state of the scaling group to match the provided matcher.

        :param rcs: a :class:`otter.integration.lib.resources.TestResources`
            instance
        :param matcher: A :mod:`testtools.matchers` matcher, as documented at
            http://testtools.readthedocs.org/en/latest/api.html.
        :param timeout: The amount of time to wait until this step is
            considered failed.
        :param period: How long to wait before polling again.
        :param clock: a :class:`twisted.internet.interfaces.IReactorTime`
            provider

        :return: None, if the state is reached
        :raises: :class:`TimedOutError` if the state is never reached within
            the requisite amount of time.

        Example usage:

        ```
        matcher = MatchesAll(
            IncludesServers(included_server_ids),
            ExcludesServers(exclude_server_ids),
            ContainsDict({
                'pending': Equals(0),
                'desired': Equals(5),
                'status': Equals('ACTIVE')
            })
        )

        ..wait_for_state(rcs, matcher, timeout=60)
        ```
        """
        def check(result):
            response, group_state = result
            mismatch = matcher.match(group_state['group'])
            if mismatch:
                msg("Waiting for group {} to reach desired group state.\n"
                    "Mismatch: {}".format(self.group_id, mismatch.describe()))
                raise TransientRetryError(mismatch.describe())
            msg("Success: desired group state reached:\n{}\nmatches:\n{}".
                format(group_state['group'], matcher))
            return rcs

        def poll():
            return self.get_scaling_group_state(rcs, [200]).addCallback(check)

        return retry_and_timeout(
            poll,
            timeout,
            can_retry=terminal_errors_except(TransientRetryError),
            next_interval=repeating_interval(period),
            clock=clock or reactor,
            deferred_description=(
                "Waiting for group {} to reach state {}".format(
                    self.group_id, str(matcher))))
Example #7
 def retrier(*args, **kwargs):
     return retry_and_timeout(
         partial(f, *args, **kwargs),
         timeout,
         can_retry=terminal_errors_except(TransientRetryError),
         next_interval=repeating_interval(period),
         clock=clock,
         deferred_description=reason)
Example #8
 def test_repeating_interval_always_returns_interval(self):
     """
     ``repeating_interval`` returns the same interval no matter what the
     failure
     """
     next_interval = repeating_interval(3)
     for exception in (DummyException(), NotImplementedError()):
         self.assertEqual(next_interval(Failure(exception)), 3)
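
Based on the behaviour this test asserts, repeating_interval can be read as a trivial interval factory. A minimal sketch, not necessarily the real implementation:

def repeating_interval(interval):
    def next_interval(failure):
        # The failure is ignored; every retry waits the same fixed interval.
        return interval
    return next_interval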
Example #9
 def retrier(*args, **kwargs):
     return retry_and_timeout(
         partial(f, *args, **kwargs), timeout,
         can_retry=terminal_errors_except(TransientRetryError),
         next_interval=repeating_interval(period),
         clock=clock,
         deferred_description=reason
     )
Example #10
 def test_repeating_interval_always_returns_interval(self):
     """
     ``repeating_interval`` returns the same interval no matter what the
     failure
     """
     next_interval = repeating_interval(3)
     for exception in (DummyException(), NotImplementedError()):
         self.assertEqual(next_interval(Failure(exception)), 3)
Example #11
 def authenticate_tenant(self, tenant_id, log=None):
     """
     see :meth:`IAuthenticator.authenticate_tenant`
     """
     return retry(
         partial(self._authenticator.authenticate_tenant, tenant_id, log=log),
         can_retry=retry_times(self._max_retries),
         next_interval=repeating_interval(self._retry_interval),
         clock=self._reactor)
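
retry_times(self._max_retries) supplies the can_retry predicate above. A hedged sketch of such a predicate factory (the real otter helper may track its state differently):

def retry_times(max_retries):
    remaining = [max_retries]
    def can_retry(failure):
        # Allow up to max_retries retries, regardless of the failure type.
        remaining[0] -= 1
        return remaining[0] >= 0
    return can_retry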
Example #12
 def authenticate_tenant(self, tenant_id, log=None):
     """
     see :meth:`IAuthenticator.authenticate_tenant`
     """
     return retry(partial(self._authenticator.authenticate_tenant,
                          tenant_id,
                          log=log),
                  can_retry=retry_times(self._max_retries),
                  next_interval=repeating_interval(self._retry_interval),
                  clock=self._reactor)
Example #13
def verified_delete(log,
                    server_endpoint,
                    auth_token,
                    server_id,
                    interval=10,
                    timeout=3660,
                    clock=None):
    """
    Attempt to delete a server from the server endpoint, and ensure that it is
    deleted by trying again until deleting/getting the server results in a 404
    or until ``OS-EXT-STS:task_state`` in server details is 'deleting',
    indicating that Nova has acknowledged that the server is to be deleted
    as soon as possible.

    Time out attempting to verify deletes after a period of time and log an
    error.

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Deletion interval in seconds - how long until
        verifying a delete is retried. Default: 10.
    :param int timeout: Seconds after which the deletion will be logged as a
        failure, if Nova fails to return a 404.  Default is 3660, because if
        the server is building, the delete will not happen until immediately
        after it has finished building.

    :return: Deferred that fires when the expected status has been seen.
    """
    serv_log = log.bind(server_id=server_id)
    serv_log.msg('Deleting server')

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    timeout_description = (
        "Waiting for Nova to actually delete server {0} (or acknowledge delete)"
        .format(server_id))

    d = retry_and_timeout(
        partial(delete_and_verify, serv_log, server_endpoint, auth_token, server_id),
        timeout,
        next_interval=repeating_interval(interval),
        clock=clock,
        deferred_description=timeout_description)

    d.addCallback(log_with_time, clock, serv_log, clock.seconds(),
                  ('Server deleted successfully (or acknowledged by Nova as '
                   'to-be-deleted) : {time_delete} seconds.'), 'time_delete')
    d.addErrback(serv_log.err)
    return d
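
log_with_time is used above as a callback with extra positional arguments. Judging only from that call site, it plausibly logs how long the operation took and passes the result through; a sketch under that assumption:

def log_with_time(result, clock, log, start_time, msg_template, time_kwarg):
    # Log the elapsed time under the given keyword, then return the result
    # unchanged so the callback chain is not disturbed.
    log.msg(msg_template, **{time_kwarg: clock.seconds() - start_time})
    return result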
Example #14
def add_to_load_balancer(log, endpoint, auth_token, lb_config, ip_address, undo, clock=None):
    """
    Add an IP address to a load balancer based on the lb_config.

    TODO: Handle load balancer node metadata.

    :param log: A bound logger
    :param str endpoint: Load balancer endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict lb_config: An lb_config dictionary.
    :param str ip_address: The IP Address of the node to add to the load
        balancer.
    :param IUndoStack undo: An IUndoStack to push any reversible operations onto.

    :return: Deferred that fires with the Add Node to load balancer response
        as a dict.
    """
    lb_id = lb_config['loadBalancerId']
    port = lb_config['port']
    path = append_segments(endpoint, 'loadbalancers', str(lb_id), 'nodes')
    lb_log = log.bind(loadbalancer_id=lb_id)

    def add():
        d = treq.post(path, headers=headers(auth_token),
                      data=json.dumps({"nodes": [{"address": ip_address,
                                                  "port": port,
                                                  "condition": "ENABLED",
                                                  "type": "PRIMARY"}]}),
                      log=lb_log)
        d.addCallback(check_success, [200, 202])
        d.addErrback(log_lb_unexpected_errors, path, lb_log, 'add_node')
        return d

    d = retry(
        add,
        can_retry=retry_times(config_value('worker.lb_max_retries') or LB_MAX_RETRIES),
        next_interval=repeating_interval(
            config_value('worker.lb_retry_interval') or LB_RETRY_INTERVAL),
        clock=clock)

    def when_done(result):
        lb_log.msg('Added to load balancer')
        undo.push(remove_from_load_balancer,
                  lb_log,
                  endpoint,
                  auth_token,
                  lb_id,
                  result['nodes'][0]['id'])
        return result

    return d.addCallback(treq.json_content).addCallback(when_done)
Example #15
def wait_for_servers(rcs,
                     pool,
                     matcher,
                     group=None,
                     timeout=600,
                     period=10,
                     clock=None,
                     _treq=treq):
    """
    Wait until Nova reaches a particular state (as described by the given
    matcher) - if a group is provided, then match only the servers for the
    given group.

    :param rcs: an instance of
        :class:`otter.integration.lib.resources.TestResources`
    :param pool: a :class:`twisted.web.client.HTTPConnectionPool`
    :param matcher: a :mod:`testtools.matcher` matcher that describes the
        desired state of the servers belonging to the autoscaling group.
    :param group: a :class:`otter.integration.lib.autoscale.ScalingGroup` that
        specifies which autoscaling group's servers we are looking at.  This
        group should already exist, and have a `group_id` attribute.  If not
        provided, the matcher will apply to all servers.
    """
    message = "Waiting for {0} Nova servers".format(
        "all" if group is None else "group {0} 's".format(group.group_id))

    @inlineCallbacks
    def do_work():
        servers = yield list_servers(rcs, pool, _treq=_treq)
        servers = servers['servers']
        if group is not None:
            servers = [
                server for server in servers
                if (group.group_id == server['metadata'].get(
                    "rax:autoscale:group:id", None))
            ]
        mismatch = matcher.match(servers)
        if mismatch:
            msg("{0}.\nMismatch: {1}".format(message, mismatch.describe()))
            raise TransientRetryError(mismatch.describe())
        returnValue(servers)

    return retry_and_timeout(
        do_work,
        timeout,
        can_retry=terminal_errors_except(TransientRetryError),
        next_interval=repeating_interval(period),
        clock=clock or reactor,
        deferred_description=("{0} to reach state {1}".format(
            message, str(matcher))))
Example #16
 def test_retry_sequence_fails_if_mismatch_sequence(self):
     """
     Fail if the wrong number of performers are given.
     """
     r = Retry(
         effect=Effect(1),
         should_retry=ShouldDelayAndRetry(
             can_retry=retry_times(5),
             next_interval=repeating_interval(10)))
     seq = [
         retry_sequence(r, [lambda _: raise_(Exception()),
                            lambda _: raise_(Exception())])
     ]
     self.assertRaises(AssertionError,
                       perform_sequence, seq, Effect(r))
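
raise_ in these test sequences is assumed to be a tiny helper that makes it possible to raise an exception from inside a lambda:

def raise_(exception):
    raise exception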
Example #17
 def test_retry_sequence_retries_without_delays(self):
     """
     Perform the wrapped effect with the performers given,
     without any delay even if the original intent had a delay.
     """
     r = Retry(
         effect=Effect(1),
         should_retry=ShouldDelayAndRetry(
             can_retry=retry_times(5),
             next_interval=repeating_interval(10)))
     seq = [
         retry_sequence(r, [lambda _: raise_(Exception()),
                            lambda _: raise_(Exception()),
                            lambda _: "yay done"])
     ]
     self.assertEqual(perform_sequence(seq, Effect(r)), "yay done")
Example #18
    def test_do_not_have_to_expect_an_exact_can_retry(self):
        """
        The expected retry intent does not actually have to specify the
        exact ``can_retry`` function, since it might just be a lambda,
        which is hard to compare or hash.
        """
        expected = Retry(effect=Effect(1), should_retry=ANY)
        actual = Retry(effect=Effect(1), should_retry=ShouldDelayAndRetry(
            can_retry=lambda _: False,
            next_interval=repeating_interval(10)))

        seq = [
            retry_sequence(expected, [lambda _: raise_(Exception())])
        ]
        self.assertRaises(Exception,
                          perform_sequence, seq, Effect(actual))
Example #19
def wait_for_servers(rcs, pool, matcher, group=None, timeout=600, period=10,
                     clock=None, _treq=treq):
    """
    Wait until Nova reaches a particular state (as described by the given
    matcher) - if a group is provided, then match only the servers for the
    given group.

    :param rcs: an instance of
        :class:`otter.integration.lib.resources.TestResources`
    :param pool: a :class:`twisted.web.client.HTTPConnectionPool`
    :param matcher: a :mod:`testtools.matcher` matcher that describes the
        desired state of the servers belonging to the autoscaling group.
    :param group: a :class:`otter.integration.lib.autoscale.ScalingGroup` that
        specifies which autoscaling group's servers we are looking at.  This
        group should already exist, and have a `group_id` attribute.  If not
        provided, the matcher will apply to all servers.
    """
    message = "Waiting for {0} Nova servers".format(
        "all" if group is None else "group {0} 's".format(group.group_id))

    @inlineCallbacks
    def do_work():
        servers = yield list_servers(rcs, pool, _treq=_treq)
        servers = servers['servers']
        if group is not None:
            servers = [
                server for server in servers
                if (group.group_id ==
                    server['metadata'].get("rax:autoscale:group:id", None))
            ]
        mismatch = matcher.match(servers)
        if mismatch:
            msg("{0}.\nMismatch: {1}".format(message, mismatch.describe()))
            raise TransientRetryError(mismatch.describe())
        returnValue(servers)

    return retry_and_timeout(
        do_work, timeout,
        can_retry=terminal_errors_except(TransientRetryError),
        next_interval=repeating_interval(period),
        clock=clock or reactor,
        deferred_description=(
            "{0} to reach state {1}".format(message, str(matcher)))
    )
Example #20
    def test_fallback(self):
        """
        Accept a ``fallback`` dispatcher that will be used if a performer
        returns an effect for an intent that is not covered by the base
        dispatcher.
        """
        def dispatch_2(intent):
            if intent == 2:
                return sync_performer(lambda d, i: "yay done")

        r = Retry(
            effect=Effect(1),
            should_retry=ShouldDelayAndRetry(
                can_retry=retry_times(5),
                next_interval=repeating_interval(10)))

        seq = [
            retry_sequence(r, [lambda _: Effect(2)],
                           fallback_dispatcher=ComposedDispatcher(
                               [dispatch_2, base_dispatcher]))
        ]
        self.assertEqual(perform_sequence(seq, Effect(r)), "yay done")
Example #21
    def delete(self, rcs):
        """
        Delete the server.

        :param rcs: an instance of
            :class:`otter.integration.lib.resources.TestResources`
        """
        def try_delete():
            d = self.treq.delete(
                "{}/servers/{}".format(rcs.endpoints["nova"], self.id),
                headers=headers(str(rcs.token)),
                pool=self.pool)
            d.addCallback(check_success, [404], _treq=self.treq)
            d.addCallback(self.treq.content)
            return d

        return retry_and_timeout(
            try_delete, 120,
            can_retry=terminal_errors_except(APIError),
            next_interval=repeating_interval(5),
            clock=self.clock,
            deferred_description=(
                "Waiting for server {} to get deleted".format(self.id)))
Example #22
    def verify(_):
        def check_status():
            check_d = treq.head(append_segments(server_endpoint, 'servers',
                                                server_id),
                                headers=headers(auth_token))
            check_d.addCallback(check_success, [404])
            return check_d

        start_time = clock.seconds()

        # this is treating all errors as transient, so the only error that can
        # occur is a CancelledError from timing out
        verify_d = retry_and_timeout(
            check_status,
            timeout,
            next_interval=repeating_interval(interval),
            clock=clock)

        def on_success(_):
            time_delete = clock.seconds() - start_time
            del_log.msg('Server deleted successfully: {time_delete} seconds.',
                        time_delete=time_delete)

        verify_d.addCallback(on_success)

        def on_timeout(_):
            time_delete = clock.seconds() - start_time
            del_log.err(
                None,
                timeout=timeout,
                time_delete=time_delete,
                why=('Server {instance_id} failed to be deleted within '
                     'a {timeout} second timeout (it has been '
                     '{time_delete} seconds).'))

        verify_d.addErrback(on_timeout)
Example #23
    def delete(self, rcs):
        """
        Delete the server.

        :param rcs: an instance of
            :class:`otter.integration.lib.resources.TestResources`
        """
        def try_delete():
            d = self.treq.delete("{}/servers/{}".format(
                rcs.endpoints["nova"], self.id),
                                 headers=headers(str(rcs.token)),
                                 pool=self.pool)
            d.addCallback(check_success, [404], _treq=self.treq)
            d.addCallback(self.treq.content)
            return d

        return retry_and_timeout(
            try_delete,
            120,
            can_retry=terminal_errors_except(APIError),
            next_interval=repeating_interval(5),
            clock=self.clock,
            deferred_description=(
                "Waiting for server {} to get deleted".format(self.id)))
Example #24
def create_server(server_endpoint,
                  auth_token,
                  server_config,
                  log=None,
                  clock=None,
                  retries=3,
                  create_failure_delay=5,
                  _treq=None):
    """
    Create a new server.  If there is an error from Nova on this call,
    checks to see if the server was created anyway.  If not, will retry the
    create ``retries`` times (checking each time whether a server was created).

    If the error from Nova is a 400, does not retry, because that implies that
    retrying will just result in another 400 (bad args).

    If checking to see if the server is created also results in a failure,
    does not retry because there might just be something wrong with Nova.

    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict server_config: Nova server config.
    :param int retries: Number of tries to retry the create.
    :param int create_failure_delay: how much time in seconds to wait after
        a create server failure before checking Nova to see if a server
        was created

    :param log: logger
    :type log: :class:`otter.log.bound.BoundLog`

    :param _treq: To be used for testing - what treq object to use
    :type _treq: something with the same api as :obj:`treq`

    :return: Deferred that fires with the CreateServer response as a dict.
    """
    path = append_segments(server_endpoint, 'servers')

    if _treq is None:  # pragma: no cover
        _treq = treq
    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    def _check_results(result, propagated_f):
        """
        Return the original failure, if checking a server resulted in a
        failure too.  Returns a wrapped propagated failure, if there were no
        servers created, so that the retry utility knows that server creation
        can be retried.
        """
        if isinstance(result, Failure):
            log.msg(
                "Attempt to find a created server in nova resulted in "
                "{failure}. Propagating the original create error instead.",
                failure=result)
            return propagated_f

        if result is None:
            raise _NoCreatedServerFound(propagated_f)

        return result

    def _check_server_created(f):
        """
        If creating a server failed with anything other than a 400, see if
        Nova created a server anyway (a 400 means that the server creation args
        were bad, and there is no point in retrying).

        If Nova created a server, just return it and pretend that the error
        never happened.  If it didn't, or if checking resulted in another
        failure response, return a failure of some type.
        """
        f.trap(APIError)
        if f.value.code == 400:
            return f

        d = deferLater(clock,
                       create_failure_delay,
                       find_server,
                       server_endpoint,
                       auth_token,
                       server_config,
                       log=log)
        d.addBoth(_check_results, f)
        return d

    def _create_with_delay(to_delay):
        d = _treq.post(path,
                       headers=headers(auth_token),
                       data=json.dumps({'server': server_config}),
                       log=log)
        if to_delay:
            # Add 1 second delay to space 1 second between server creations
            d.addCallback(delay, clock, 1)
        return d

    def _create_server():
        """
        Attempt to create a server, handling spurious non-400 errors from Nova
        by seeing if Nova created a server anyway in spite of the error.  If so
        then create server succeeded.

        If not, and if no further errors occur, server creation can be retried.
        """
        sem = get_sempahore("create_server", "worker.create_server_limit")
        if sem is not None:
            d = sem.run(_create_with_delay, True)
        else:
            d = _create_with_delay(False)
        d.addCallback(check_success, [202], _treq=_treq)
        d.addCallback(_treq.json_content)
        d.addErrback(_check_server_created)
        return d

    def _unwrap_NoCreatedServerFound(f):
        """
        The original failure was wrapped in a :class:`_NoCreatedServerFound`
        for ease of retry, but that should not be the final error propagated up
        by :func:`create_server`.

        This errback unwraps the :class:`_NoCreatedServerFound` error and
        returns the original failure.
        """
        f.trap(_NoCreatedServerFound)
        return f.value.original

    d = retry(_create_server,
              can_retry=compose_retries(
                  retry_times(retries),
                  terminal_errors_except(_NoCreatedServerFound)),
              next_interval=repeating_interval(15),
              clock=clock)

    d.addErrback(_unwrap_NoCreatedServerFound)
    d.addErrback(wrap_request_error, path, 'server_create')

    return d
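
_NoCreatedServerFound is only ever constructed with the propagated failure and unwrapped via .original, so it is presumably a thin wrapper exception; a sketch consistent with that usage:

class _NoCreatedServerFound(Exception):
    """
    Wraps the original create-server failure so the retry predicate can treat
    "no server was actually created" as a retryable condition.
    """
    def __init__(self, original_failure):
        super(_NoCreatedServerFound, self).__init__()
        self.original = original_failure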
Example #25
def wait_for_active(log,
                    server_endpoint,
                    auth_token,
                    server_id,
                    interval=20,
                    timeout=7200,
                    clock=None):
    """
    Wait until the status of the server specified by server_id is 'ACTIVE'

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Polling interval in seconds.  Default: 20.
    :param int timeout: timeout to poll for the server status in seconds.
        Default 7200 (2 hours).

    :return: Deferred that fires when the expected status has been seen.
    """
    log.msg("Checking instance status every {interval} seconds",
            interval=interval)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    start_time = clock.seconds()

    def poll():
        def check_status(server):
            status = server['server']['status']
            time_building = clock.seconds() - start_time

            if status == 'ACTIVE':
                log.msg(("Server changed from 'BUILD' to 'ACTIVE' within "
                         "{time_building} seconds"),
                        time_building=time_building)
                return server

            elif status != 'BUILD':
                log.msg("Server changed to '{status}' in {time_building} seconds",
                        time_building=time_building, status=status)
                raise UnexpectedServerStatus(
                    server_id,
                    status,
                    'ACTIVE')

            else:
                raise TransientRetryError()  # just poll again

        sd = server_details(server_endpoint, auth_token, server_id, log=log)
        sd.addCallback(check_status)
        return sd

    timeout_description = ("Waiting for server <{0}> to change from BUILD "
                           "state to ACTIVE state").format(server_id)

    return retry_and_timeout(
        poll, timeout,
        can_retry=transient_errors_except(UnexpectedServerStatus, ServerDeleted),
        next_interval=repeating_interval(interval),
        clock=clock,
        deferred_description=timeout_description)
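
transient_errors_except is the mirror image of terminal_errors_except: every failure is considered transient (retryable) unless it wraps one of the listed types. A sketch under that assumption:

def transient_errors_except(*terminal_exception_types):
    def can_retry(failure):
        # Retry on anything except the listed terminal exception types.
        return failure.check(*terminal_exception_types) is None
    return can_retry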
Example #26
def launch_server(log, request_bag, scaling_group, launch_config, undo,
                  clock=None):
    """
    Launch a new server given the launch config, auth tokens and service
    catalog, possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param request_bag: An object with a bunch of useful data on it, including
        a callable to re-auth and get a new token.
    :param IScalingGroup scaling_group: The scaling group to add the launched
        server to.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    cloudServersOpenStack = config_value('cloudServersOpenStack')
    server_endpoint = public_endpoint_url(request_bag.service_catalog,
                                          cloudServersOpenStack,
                                          request_bag.region)

    lb_config = launch_config.get('loadBalancers', [])
    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def check_metadata(server):
        # sanity check to make sure the metadata didn't change - can probably
        # be removed after a while if we do not see any log messages from this
        # function
        expected = launch_config['server']['metadata']
        result = server['server'].get('metadata')
        if result != expected:
            ilog[0].msg('Server metadata has changed.',
                        sanity_check=True,
                        expected_metadata=expected,
                        nova_metadata=result)
        return server

    def wait_for_server(server, new_request_bag):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be pushed
        # to the undo stack even if the server has already been deleted in
        # check_error, which is fine since verified_delete succeeds on an
        # already-deleted server
        undo.push(
            verified_delete, log, server_endpoint, new_request_bag, server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(
            ilog[0],
            server_endpoint,
            new_request_bag.auth_token,
            server_id).addCallback(check_metadata)

    def add_lb(server, new_request_bag):
        if lb_config:
            lbd = add_to_load_balancers(
                ilog[0], new_request_bag, lb_config, server, undo)
            lbd.addCallback(lambda lb_response: (server, lb_response))
            return lbd

        return (server, [])

    def _real_create_server(new_request_bag):
        auth_token = new_request_bag.auth_token
        d = create_server(server_endpoint, auth_token, server_config, log=log)
        d.addCallback(wait_for_server, new_request_bag)
        d.addCallback(add_lb, new_request_bag)
        return d

    def _create_server():
        return request_bag.re_auth().addCallback(_real_create_server)

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg('{server_id} errored, deleting and creating new '
                    'server instead', server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, request_bag,
                            f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server,
              can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15), clock=clock)

    return d
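
compose_retries(retry_times(3), check_error) combines two predicates. The assumed semantics, judging from how it is used, are that a retry happens only when every composed predicate allows it:

def compose_retries(*can_retry_functions):
    def can_retry(failure):
        # Retry only if all of the composed predicates agree.
        return all(f(failure) for f in can_retry_functions)
    return can_retry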
Example #27
def create_server(server_endpoint, auth_token, server_config, log=None,
                  clock=None, retries=3, create_failure_delay=5, _treq=None):
    """
    Create a new server.  If there is an error from Nova on this call,
    checks to see if the server was created anyway.  If not, will retry the
    create ``retries`` times (checking each time whether a server was created).

    If the error from Nova is a 400, does not retry, because that implies that
    retrying will just result in another 400 (bad args).

    If checking to see if the server is created also results in a failure,
    does not retry because there might just be something wrong with Nova.

    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth Token.
    :param dict server_config: Nova server config.
    :param int retries: Number of tries to retry the create.
    :param int create_failure_delay: how much time in seconds to wait after
        a create server failure before checking Nova to see if a server
        was created

    :param log: logger
    :type log: :class:`otter.log.bound.BoundLog`

    :param _treq: To be used for testing - what treq object to use
    :type _treq: something with the same api as :obj:`treq`

    :return: Deferred that fires with the CreateServer response as a dict.
    """
    path = append_segments(server_endpoint, 'servers')

    if _treq is None:  # pragma: no cover
        _treq = treq
    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    def _check_results(result, propagated_f):
        """
        Return the original failure, if checking a server resulted in a
        failure too.  Returns a wrapped propagated failure, if there were no
        servers created, so that the retry utility knows that server creation
        can be retried.
        """
        if isinstance(result, Failure):
            log.msg("Attempt to find a created server in nova resulted in "
                    "{failure}. Propagating the original create error instead.",
                    failure=result)
            return propagated_f

        if result is None:
            raise _NoCreatedServerFound(propagated_f)

        return result

    def _check_server_created(f):
        """
        If creating a server failed with anything other than a 400, see if
        Nova created a server anyway (a 400 means that the server creation args
        were bad, and there is no point in retrying).

        If Nova created a server, just return it and pretend that the error
        never happened.  If it didn't, or if checking resulted in another
        failure response, return a failure of some type.
        """
        f.trap(APIError)
        if f.value.code == 400:
            return f

        d = deferLater(clock, create_failure_delay, find_server,
                       server_endpoint, auth_token, server_config, log=log)
        d.addBoth(_check_results, f)
        return d

    def _create_with_delay(to_delay):
        d = _treq.post(path, headers=headers(auth_token),
                       data=json.dumps({'server': server_config}), log=log)
        if to_delay:
            # Add 1 second delay to space 1 second between server creations
            d.addCallback(delay, clock, 1)
        return d

    def _create_server():
        """
        Attempt to create a server, handling spurious non-400 errors from Nova
        by seeing if Nova created a server anyway in spite of the error.  If so
        then create server succeeded.

        If not, and if no further errors occur, server creation can be retried.
        """
        sem = get_sempahore("create_server", "worker.create_server_limit")
        if sem is not None:
            d = sem.run(_create_with_delay, True)
        else:
            d = _create_with_delay(False)
        d.addCallback(check_success, [202], _treq=_treq)
        d.addCallback(_treq.json_content)
        d.addErrback(_check_server_created)
        return d

    def _unwrap_NoCreatedServerFound(f):
        """
        The original failure was wrapped in a :class:`_NoCreatedServerFound`
        for ease of retry, but that should not be the final error propagated up
        by :func:`create_server`.

        This errback unwraps the :class:`_NoCreatedServerFound` error and
        returns the original failure.
        """
        f.trap(_NoCreatedServerFound)
        return f.value.original

    d = retry(
        _create_server,
        can_retry=compose_retries(
            retry_times(retries),
            terminal_errors_except(_NoCreatedServerFound)),
        next_interval=repeating_interval(15), clock=clock)

    d.addErrback(_unwrap_NoCreatedServerFound)
    d.addErrback(wrap_request_error, path, 'server_create')

    return d
Example #28
def launch_server(log, region, scaling_group, service_catalog, auth_token,
                  launch_config, undo, clock=None):
    """
    Launch a new server given the launch config, auth tokens and service
    catalog, possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param str region: A rackspace region as found in the service catalog.
    :param IScalingGroup scaling_group: The scaling group to add the launched
        server to.
    :param list service_catalog: A list of services as returned by the auth apis.
    :param str auth_token: The user's auth token.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    lb_region = config_value('regionOverrides.cloudLoadBalancers') or region
    cloudLoadBalancers = config_value('cloudLoadBalancers')
    cloudServersOpenStack = config_value('cloudServersOpenStack')

    lb_endpoint = public_endpoint_url(service_catalog,
                                      cloudLoadBalancers,
                                      lb_region)

    server_endpoint = public_endpoint_url(service_catalog,
                                          cloudServersOpenStack,
                                          region)

    lb_config = launch_config.get('loadBalancers', [])

    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def wait_for_server(server):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be pushed
        # to the undo stack even if the server has already been deleted in
        # check_error, which is fine since verified_delete succeeds on an
        # already-deleted server
        undo.push(
            verified_delete, log, server_endpoint, auth_token, server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(
            ilog[0],
            server_endpoint,
            auth_token,
            server_id)

    def add_lb(server):
        ip_address = private_ip_addresses(server)[0]
        lbd = add_to_load_balancers(
            ilog[0], lb_endpoint, auth_token, lb_config, ip_address, undo)
        lbd.addCallback(lambda lb_response: (server, lb_response))
        return lbd

    def _create_server():
        d = create_server(server_endpoint, auth_token, server_config, log=log)
        d.addCallback(wait_for_server)
        d.addCallback(add_lb)
        return d

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg('{server_id} errored, deleting and creating new server instead',
                    server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, auth_token, f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server, can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15), clock=clock)

    return d
Example #29
def verified_delete(log,
                    server_endpoint,
                    auth_token,
                    server_id,
                    interval=10,
                    timeout=3660,
                    clock=None):
    """
    Attempt to delete a server from the server endpoint, and ensure that it is
    deleted by trying again until deleting the server results in a 404.

    Time out attempting to verify deletes after a period of time and log an
    error.

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Deletion interval in seconds - how long until
        verifying a delete is retried. Default: 10.
    :param int timeout: Seconds after which the deletion will be logged as a
        failure, if Nova fails to return a 404.  Default is 3660, because if
        the server is building, the delete will not happen until immediately
        after it has finished building.

    :return: Deferred that fires when the expected status has been seen.
    """
    serv_log = log.bind(server_id=server_id)
    serv_log.msg('Deleting server')

    path = append_segments(server_endpoint, 'servers', server_id)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    # just delete over and over until a 404 is received
    def delete():
        del_d = treq.delete(path, headers=headers(auth_token), log=serv_log)
        del_d.addCallback(check_success, [404])
        del_d.addCallback(treq.content)
        return del_d

    start_time = clock.seconds()

    timeout_description = (
        "Waiting for Nova to actually delete server {0}".format(server_id))

    d = retry_and_timeout(delete, timeout,
                          next_interval=repeating_interval(interval),
                          clock=clock,
                          deferred_description=timeout_description)

    def on_success(_):
        time_delete = clock.seconds() - start_time
        serv_log.msg('Server deleted successfully: {time_delete} seconds.',
                     time_delete=time_delete)

    d.addCallback(on_success)
    d.addErrback(serv_log.err)
    return d
Example #30
def launch_server(log,
                  request_bag,
                  scaling_group,
                  launch_config,
                  undo,
                  clock=None):
    """
    Launch a new server given the launch config, auth tokens and service
    catalog, possibly adding the newly launched server to a load balancer.

    :param BoundLog log: A bound logger.
    :param request_bag: An object with a bunch of useful data on it, including
        a callable to re-auth and get a new token.
    :param IScalingGroup scaling_group: The scaling group to add the launched
        server to.
    :param dict launch_config: A launch_config args structure as defined for
        the launch_server_v1 type.
    :param IUndoStack undo: The stack that will be rewound if undo fails.

    :return: Deferred that fires with a 2-tuple of server details and the
        list of load balancer responses from add_to_load_balancers.
    """
    launch_config = prepare_launch_config(scaling_group.uuid, launch_config)

    cloudServersOpenStack = config_value('cloudServersOpenStack')
    server_endpoint = public_endpoint_url(request_bag.service_catalog,
                                          cloudServersOpenStack,
                                          request_bag.region)

    lb_config = launch_config.get('loadBalancers', [])
    server_config = launch_config['server']

    log = log.bind(server_name=server_config['name'])
    ilog = [None]

    def check_metadata(server):
        # sanity check to make sure the metadata didn't change - can probably
        # be removed after a while if we do not see any log messages from this
        # function
        expected = launch_config['server']['metadata']
        result = server['server'].get('metadata')
        if result != expected:
            ilog[0].msg('Server metadata has changed.',
                        sanity_check=True,
                        expected_metadata=expected,
                        nova_metadata=result)
        return server

    def wait_for_server(server, new_request_bag):
        server_id = server['server']['id']

        # NOTE: If server create is retried, each server delete will be pushed
        # to the undo stack even if the server has already been deleted in
        # check_error, which is fine since verified_delete succeeds on an
        # already-deleted server
        undo.push(verified_delete, log, server_endpoint, new_request_bag,
                  server_id)

        ilog[0] = log.bind(server_id=server_id)
        return wait_for_active(ilog[0], server_endpoint,
                               new_request_bag.auth_token,
                               server_id).addCallback(check_metadata)

    def add_lb(server, new_request_bag):
        if lb_config:
            lbd = add_to_load_balancers(ilog[0], new_request_bag, lb_config,
                                        server, undo)
            lbd.addCallback(lambda lb_response: (server, lb_response))
            return lbd

        return (server, [])

    def _real_create_server(new_request_bag):
        auth_token = new_request_bag.auth_token
        d = create_server(server_endpoint, auth_token, server_config, log=log)
        d.addCallback(wait_for_server, new_request_bag)
        d.addCallback(add_lb, new_request_bag)
        return d

    def _create_server():
        return request_bag.re_auth().addCallback(_real_create_server)

    def check_error(f):
        f.trap(UnexpectedServerStatus)
        if f.value.status == 'ERROR':
            log.msg(
                '{server_id} errored, deleting and creating new '
                'server instead',
                server_id=f.value.server_id)
            # trigger server delete and return True to allow retry
            verified_delete(log, server_endpoint, request_bag,
                            f.value.server_id)
            return True
        else:
            return False

    d = retry(_create_server,
              can_retry=compose_retries(retry_times(3), check_error),
              next_interval=repeating_interval(15),
              clock=clock)

    return d
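
check_error above inspects f.value.status and f.value.server_id, so UnexpectedServerStatus presumably stores its constructor arguments as attributes; a sketch consistent with that usage:

class UnexpectedServerStatus(Exception):
    """
    Raised when a server reaches a status other than the one being waited for.
    """
    def __init__(self, server_id, status, expected_status):
        super(UnexpectedServerStatus, self).__init__(
            'Expected server {0} to be {1} but it is {2}'.format(
                server_id, expected_status, status))
        self.server_id = server_id
        self.status = status
        self.expected_status = expected_status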
Example #31
def wait_for_active(log,
                    server_endpoint,
                    auth_token,
                    server_id,
                    interval=5,
                    timeout=3600,
                    clock=None):
    """
    Wait until the status of the server specified by server_id is 'ACTIVE'

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Polling interval in seconds.  Default: 5.
    :param int timeout: timeout to poll for the server status in seconds.
        Default 3600 (1 hour)

    :return: Deferred that fires when the expected status has been seen.
    """
    log.msg("Checking instance status every {interval} seconds",
            interval=interval)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    start_time = clock.seconds()

    def poll():
        def check_status(server):
            status = server['server']['status']

            if status == 'ACTIVE':
                time_building = clock.seconds() - start_time
                log.msg(("Server changed from 'BUILD' to 'ACTIVE' within "
                         "{time_building} seconds"),
                        time_building=time_building)
                return server

            elif status != 'BUILD':
                raise UnexpectedServerStatus(server_id, status, 'ACTIVE')

            else:
                raise TransientRetryError()  # just poll again

        sd = server_details(server_endpoint, auth_token, server_id)
        sd.addCallback(check_status)
        return sd

    d = retry_and_timeout(
        poll,
        timeout,
        can_retry=transient_errors_except(UnexpectedServerStatus),
        next_interval=repeating_interval(interval),
        clock=clock)

    def on_error(f):
        if f.check(CancelledError):
            time_building = clock.seconds() - start_time
            log.msg(
                ('Server {instance_id} failed to change from BUILD state '
                 'to ACTIVE within a {timeout} second timeout (it has been '
                 '{time_building} seconds).'),
                timeout=timeout,
                time_building=time_building)
        return f

    d.addErrback(on_error)

    return d
Example #32
def wait_for_active(log,
                    server_endpoint,
                    auth_token,
                    server_id,
                    interval=20,
                    timeout=7200,
                    clock=None):
    """
    Wait until the status of the server specified by server_id is 'ACTIVE'

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Polling interval in seconds.  Default: 20.
    :param int timeout: timeout to poll for the server status in seconds.
        Default 7200 (2 hours).

    :return: Deferred that fires when the expected status has been seen.
    """
    log.msg("Checking instance status every {interval} seconds",
            interval=interval)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    start_time = clock.seconds()

    def poll():
        def check_status(server):
            status = server['server']['status']
            time_building = clock.seconds() - start_time

            if status == 'ACTIVE':
                log.msg(("Server changed from 'BUILD' to 'ACTIVE' within "
                         "{time_building} seconds"),
                        time_building=time_building)
                return server

            elif status != 'BUILD':
                log.msg(
                    "Server changed to '{status}' in {time_building} seconds",
                    time_building=time_building,
                    status=status)
                raise UnexpectedServerStatus(server_id, status, 'ACTIVE')

            else:
                raise TransientRetryError()  # just poll again

        sd = server_details(server_endpoint, auth_token, server_id, log=log)
        sd.addCallback(check_status)
        return sd

    timeout_description = ("Waiting for server <{0}> to change from BUILD "
                           "state to ACTIVE state").format(server_id)

    return retry_and_timeout(poll,
                             timeout,
                             can_retry=transient_errors_except(
                                 UnexpectedServerStatus, ServerDeleted),
                             next_interval=repeating_interval(interval),
                             clock=clock,
                             deferred_description=timeout_description)
Example #33
def wait_for_active(log,
                    server_endpoint,
                    auth_token,
                    server_id,
                    interval=5,
                    timeout=3600,
                    clock=None):
    """
    Wait until the status of the server specified by server_id is 'ACTIVE'

    :param log: A bound logger.
    :param str server_endpoint: Server endpoint URI.
    :param str auth_token: Keystone Auth token.
    :param str server_id: Opaque nova server id.
    :param int interval: Polling interval in seconds.  Default: 5.
    :param int timeout: timeout to poll for the server status in seconds.
        Default 3600 (1 hour)

    :return: Deferred that fires when the expected status has been seen.
    """
    log.msg("Checking instance status every {interval} seconds",
            interval=interval)

    if clock is None:  # pragma: no cover
        from twisted.internet import reactor
        clock = reactor

    start_time = clock.seconds()

    def poll():
        def check_status(server):
            status = server['server']['status']

            if status == 'ACTIVE':
                time_building = clock.seconds() - start_time
                log.msg(("Server changed from 'BUILD' to 'ACTIVE' within "
                         "{time_building} seconds"),
                        time_building=time_building)
                return server

            elif status != 'BUILD':
                raise UnexpectedServerStatus(
                    server_id,
                    status,
                    'ACTIVE')

            else:
                raise TransientRetryError()  # just poll again

        sd = server_details(server_endpoint, auth_token, server_id)
        sd.addCallback(check_status)
        return sd

    d = retry_and_timeout(
        poll, timeout,
        can_retry=transient_errors_except(UnexpectedServerStatus),
        next_interval=repeating_interval(interval),
        clock=clock)

    def on_error(f):
        if f.check(CancelledError):
            time_building = clock.seconds() - start_time
            log.msg(('Server {instance_id} failed to change from BUILD state '
                     'to ACTIVE within a {timeout} second timeout (it has been '
                     '{time_building} seconds).'),
                    timeout=timeout, time_building=time_building)
        return f

    d.addErrback(on_error)

    return d