def test_acquire_timeout(self):
    """
    acquire_eff creates a child node and keeps checking if it is the
    smallest, eventually giving up by raising `LockTimeout`. It deletes the
    child node before returning.
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), const("/testlock")),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000001")),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Func(time.time), const(0)),
           (Delay(0.1), noop),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Func(time.time), const(0.12)),
           (Delay(0.1), noop),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Func(time.time), const(0.4)),
           (DeleteNode(path="/testlock/prefix0000000001", version=-1), noop)]
    self.assertRaises(LockTimeout, perform_sequence, seq,
                      self.lock.acquire_eff(True, 0.3))
def provision_digitalocean(node, token, package_source, distribution,
                           variants):
    """
    Provision Flocker on this node.

    :param LibcloudNode node: The node to be provisioned.
    :param bytes token: A DigitalOcean v2 API token.
    :param PackageSource package_source: The URL of the distribution package
        repository.
    :param bytes distribution: The label of the distribution to be installed
        on the node.
    :param set variants: The set of variant configurations to use when
        provisioning.
    """
    # DO doesn't support booting the droplet's own kernel.
    # * http://digitalocean.uservoice.com/forums/136585-digitalocean/suggestions/2814988-give-option-to-use-the-droplet-s-own-bootloader  # noqa
    # So rather than upgrading, we need a new task to install the kernel
    # package (and headers) for the DO-supported kernel.
    # The Fedora droplet default is to use a kernel that's too old for our
    # purposes.
    # Our documentation describes how to select a newer (DO-supported) kernel
    # for this droplet.
    # Unfortunately this operation is only supported in the DO v2 API.
    # * http://digitalocean.uservoice.com/forums/136585-digitalocean/suggestions/5618546-add-the-ability-to-change-kernel-via-api  # noqa
    # * https://developers.digitalocean.com/#change-the-kernel
    # But libcloud only supports the DO v1 API.
    # * https://www.digitalocean.com/community/questions/does-libcloud-work-with-digitalocean-s-v2-api  # noqa
    # * https://issues.apache.org/jira/browse/JCLOUDS-613
    return sequence([
        # Change the configured kernel.
        Effect(
            Func(lambda: change_kernel(node._node.id, token,
                                       DIGITALOCEAN_KERNEL))),

        # Install the corresponding kernel package.
        run_remotely(
            username='******',
            address=node.address,
            commands=task_install_digitalocean_kernel()),

        # Hard reboot the machine to boot into the new kernel.
        Effect(Func(lambda: hard_reboot(node._node.id, token))),

        # Finally run all the standard Fedora20 installation steps.
        run_remotely(
            username='******',
            address=node.address,
            commands=sequence([
                provision(
                    package_source=package_source,
                    distribution=node.distribution,
                    variants=variants,
                ),
                # https://clusterhq.atlassian.net/browse/FLOC-1550
                # This should be part of ._install.configure_cluster
                task_open_control_firewall()
            ]),
        ),
    ])
def _acquire_loop(self, blocking, timeout):
    """
    Poll ``is_acquired_eff`` every ``self._interval`` seconds until the lock
    is acquired, raising `LockTimeout` once ``timeout`` seconds have elapsed.
    """
    acquired = yield self.is_acquired_eff()
    if acquired or not blocking:
        yield do_return(acquired)
    start = yield Effect(Func(time.time))
    while True:
        yield Effect(Delay(self._interval))
        if (yield self.is_acquired_eff()):
            yield do_return(True)
        if timeout is not None:
            now = yield Effect(Func(time.time))
            if now - start > timeout:
                raise LockTimeout(
                    "Failed to acquire lock on {} in {} seconds".format(
                        self.path, now - start))
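# A standalone sketch (not part of the module above) of how a @do-style
# polling loop composes Func(time.time) and Delay intents, driven the same
# way the lock tests below drive acquire_eff: with
# effect.testing.perform_sequence. ``poll_until`` and ``checker`` are
# hypothetical names used only for illustration.
import time

from effect import Delay, Effect, Func
from effect.do import do, do_return
from effect.testing import perform_sequence


@do
def poll_until(is_done_eff, interval, timeout):
    """Yield is_done_eff every `interval` seconds until done or timed out."""
    start = yield Effect(Func(time.time))
    while True:
        if (yield is_done_eff()):
            yield do_return(True)
        now = yield Effect(Func(time.time))
        if now - start > timeout:
            yield do_return(False)
        yield Effect(Delay(interval))


def checker():
    """Never called for real; Func(checker) is matched by intent equality."""


seq = [
    (Func(time.time), lambda _: 0),     # start time
    (Func(checker), lambda _: False),   # first poll: not done yet
    (Func(time.time), lambda _: 0.05),  # still within the timeout
    (Delay(0.1), lambda _: None),       # wait one interval
    (Func(checker), lambda _: True),    # second poll: done
]
assert perform_sequence(
    seq, poll_until(lambda: Effect(Func(checker)), 0.1, 1)) is True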
def _with_conv_runid(self, eff):
    """
    Return Effect wrapped with converger_run_id log field
    """
    return Effect(Func(uuid.uuid4)).on(str).on(
        lambda uid: with_log(eff, otter_service='converger',
                             converger_run_id=uid))
def test_no_exponential_backoff(self):
    """
    If ``False`` is passed for the ``backoff`` parameter, the effect is
    always retried with the same delay.
    """
    divisors = [0, 0, 0, 1]

    def tester():
        x = divisors.pop(0)
        return 1 / x

    seq = [
        (Delay(5), lambda ignore: None),
        (Delay(5), lambda ignore: None),
        (Delay(5), lambda ignore: None),
    ]

    retrier = retry_effect_with_timeout(
        Effect(Func(tester)), timeout=1,
        retry_wait=timedelta(seconds=5),
        backoff=False,
    )
    result = perform_sequence(seq, retrier)
    self.assertEqual(result, 1)
def test_timeout(self):
    """
    If the timeout expires, the retry effect fails with the exception from
    the final time the wrapped effect is performed.
    """
    expected_intents = [
        (Delay(1), lambda ignore: None),
        (Delay(2), lambda ignore: None),
    ]
    exceptions = [
        Exception("Wrong (1)"),
        Exception("Wrong (2)"),
        CustomException(),
    ]

    def tester():
        raise exceptions.pop(0)

    retrier = retry_effect_with_timeout(
        Effect(Func(tester)),
        timeout=3,
        time=self.get_time([0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
    )
    self.assertRaises(CustomException, perform_sequence, expected_intents,
                      retrier)
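# A minimal, self-contained sketch of the delay schedule implied by the two
# tests above; it is not the project's retry_effect_with_timeout
# implementation (which consults a clock for the timeout), and the name
# ``retry_delays`` is hypothetical. With backoff enabled the wait doubles
# after every failure; with backoff=False it stays constant.
def retry_delays(retry_wait, attempts, backoff=True):
    """Return the waits used for the first ``attempts`` retries."""
    delays, wait = [], retry_wait
    for _ in range(attempts):
        delays.append(wait)
        if backoff:
            wait *= 2
    return delays


print(retry_delays(1, 2))         # [1, 2]    -> Delay(1), Delay(2) above
print(retry_delays(5, 3, False))  # [5, 5, 5] -> constant Delay(5)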
def acquire_eff(self, blocking, timeout):
    """
    Effect implementation of ``acquire`` method.

    :return: ``Effect`` of ``bool``
    """
    try:
        # Before acquiring, delete any child node that may be lingering from
        # a previous acquire. Ideally this should happen only when acquire is
        # called again before release. It shouldn't happen if this is called
        # after release or after is_acquired returns False. In any case, it's
        # the safest thing to do.
        yield self.release_eff()
        try:
            yield Effect(CreateNode(self.path))
        except NodeExistsError:
            pass
        prefix = yield Effect(Func(uuid.uuid4))
        # TODO: https://github.com/rackerlabs/otter/issues/1926
        create_intent = CreateNode("{}/{}".format(self.path, prefix),
                                   value=self.identifier,
                                   ephemeral=True, sequence=True)
        self._node = yield Effect(create_intent)
        acquired = yield self._acquire_loop(blocking, timeout)
        if not acquired:
            yield self.release_eff()
        yield do_return(acquired)
    except Exception as e:
        yield self.release_eff()
        raise e
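# A minimal sketch (hypothetical helper, not part of the class above) of the
# "smallest sequence node wins" check that is_acquired_eff relies on: the
# lock is held when our ephemeral sequence node sorts first among the
# siblings returned by GetChildren. It assumes the standard ZooKeeper
# 10-digit, zero-padded sequence suffix.
def holds_lock(own_node, children):
    """Return True if own_node has the smallest sequence suffix."""
    own_name = own_node.rsplit('/', 1)[-1]
    return sorted(children, key=lambda c: c[-10:])[0] == own_name


assert not holds_lock("/testlock/prefix0000000001",
                      ["prefix0000000000", "prefix0000000001"])
assert holds_lock("/testlock/prefix0000000001", ["prefix0000000001"])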
def configure_zfs(node, variants):
    """
    Configure ZFS for use as a Flocker backend.

    :param INode node: The node to configure ZFS on.
    :param set variants: The set of variant configurations to use when
        provisioning.

    :return Effect:
    """
    return sequence([
        run_remotely(
            username='******',
            address=node.address,
            commands=task_upgrade_kernel(
                distribution=node.distribution),
        ),
        node.reboot(),
        run_remotely(
            username='******',
            address=node.address,
            commands=sequence([
                task_install_zfs(
                    distribution=node.distribution,
                    variants=variants),
                task_create_flocker_pool_file(),
            ]),
        ),
        Effect(
            Func(lambda: configure_ssh(node.address, 22))),
    ])
def converge_one_group(currently_converging, recently_converged, waiting,
                       tenant_id, group_id, version,
                       build_timeout, limited_retry_iterations, step_limits,
                       execute_convergence=execute_convergence):
    """
    Converge one group, non-concurrently, and clean up the dirty flag when
    done.

    :param Reference currently_converging: pset of currently converging groups
    :param Reference recently_converged: pmap of recently converged groups
    :param Reference waiting: pmap of waiting groups
    :param str tenant_id: the tenant ID of the group that is converging
    :param str group_id: the ID of the group that is converging
    :param version: version number of ZNode of the group's dirty flag
    :param number build_timeout: number of seconds to wait for servers to be
        in building before it is timed out and deleted
    :param int limited_retry_iterations: number of iterations to wait for
        LIMITED_RETRY steps
    :param dict step_limits: Mapping of step class to number of executions
        allowed in a convergence cycle
    :param callable execute_convergence: like :func:`execute_convergence`,
        to be used for test injection only
    """
    mark_recently_converged = Effect(Func(time.time)).on(
        lambda time_done: recently_converged.modify(
            lambda rcg: rcg.set(group_id, time_done)))
    cvg = eff_finally(
        execute_convergence(tenant_id, group_id, build_timeout, waiting,
                            limited_retry_iterations, step_limits),
        mark_recently_converged)

    try:
        result = yield non_concurrently(currently_converging, group_id, cvg)
    except ConcurrentError:
        # We don't need to spam the logs about this, it's to be expected
        return
    except NoSuchScalingGroupError:
        # NoSuchEndpoint occurs on a suspended or closed account
        yield err(None, 'converge-fatal-error')
        yield _clean_waiting(waiting, group_id)
        yield delete_divergent_flag(tenant_id, group_id, version)
        return
    except Exception:
        # We specifically don't clean up the dirty flag in the case of
        # unexpected errors, so convergence will be retried.
        yield err(None, 'converge-non-fatal-error')
    else:
        @match(ConvergenceIterationStatus)
        class clean_up(object):
            def Continue():
                return Effect(Constant(None))

            def Stop():
                return delete_divergent_flag(tenant_id, group_id, version)

            def GroupDeleted():
                # Delete the divergent flag to avoid any queued-up
                # convergences that will imminently fail.
                return delete_divergent_flag(tenant_id, group_id, -1)

        yield clean_up(result)
def configure_cluster(control_node, agent_nodes):
    """
    Configure flocker-control and flocker-agent on a collection of nodes.

    :param bytes control_node: The address of the control node.
    :param list agent_nodes: List of addresses of agent nodes.
    """
    return sequence([
        run_remotely(
            username='******',
            address=control_node,
            commands=task_enable_flocker_control(),
        ),
        sequence([
            sequence([
                Effect(Func(lambda node=node: configure_ssh(node, 22))),
                run_remotely(
                    username='******',
                    address=node,
                    commands=task_enable_flocker_agent(
                        node_name=node,
                        control_node=control_node,
                    ),
                ),
            ]) for node in agent_nodes
        ])
    ])
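# Why ``lambda node=node: ...`` above: without the default-argument binding,
# every lambda built in the comprehension would close over the same loop
# variable and see its final value. A small self-contained illustration:
unbound = [lambda: n for n in range(3)]
bound = [lambda n=n: n for n in range(3)]
print([f() for f in unbound])  # [2, 2, 2] - all share the last value of n
print([f() for f in bound])    # [0, 1, 2] - each value captured at definition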
def provision_aws(node, package_source, distribution, variants):
    """
    Provision flocker on this node.

    :param LibcloudNode node: Node to provision.
    :param PackageSource package_source: See :func:`task_install_flocker`.
    :param bytes distribution: See :func:`task_install_flocker`.
    :param set variants: The set of variant configurations to use when
        provisioning.
    """
    username = {
        'fedora-20': 'fedora',
        'centos-7': 'centos',
    }[distribution]

    commands = []

    commands.append(run_remotely(
        username=username,
        address=node.address,
        commands=task_install_ssh_key(),
    ))

    pre_reboot_commands = []
    if Variants.DISTRO_TESTING in variants:
        pre_reboot_commands.append(
            task_enable_updates_testing(distribution)
        )

    if distribution in ('centos-7',):
        pre_reboot_commands.append(
            task_upgrade_kernel_centos()
        )
    elif distribution in ('fedora-20',):
        pre_reboot_commands.append(
            task_upgrade_kernel(),
        )

    commands.append(run_remotely(
        username='******',
        address=node.address,
        commands=sequence(pre_reboot_commands),
    ))

    commands.append(Effect(Func(node.reboot)))

    commands.append(run_remotely(
        username='******',
        address=node.address,
        commands=provision(
            package_source=package_source,
            distribution=node.distribution,
            variants=variants,
        ),
    ))

    return sequence(commands)
def test_added(self):
    """
    total desired, pending and actual are added to cloud metrics
    """
    metrics = [GroupMetrics('t1', 'g1', 3, 2, 0),
               GroupMetrics('t2', 'g1', 4, 4, 1),
               GroupMetrics('t2', 'g', 100, 20, 0),
               GroupMetrics('t3', 'g3', 5, 3, 0)]
    config = {"non-convergence-tenants": ["t1"]}
    m = {'collectionTime': 100000, 'ttlInSeconds': 5 * 24 * 60 * 60}
    md = merge(m, {'metricValue': 112, 'metricName': 'ord.desired'})
    ma = merge(m, {'metricValue': 29, 'metricName': 'ord.actual'})
    mp = merge(m, {'metricValue': 1, 'metricName': 'ord.pending'})
    mt = merge(m, {'metricValue': 3, 'metricName': 'ord.tenants'})
    mg = merge(m, {'metricValue': 4, 'metricName': 'ord.groups'})
    mt1d = merge(m, {'metricValue': 3, 'metricName': 'ord.t1.desired'})
    mt1a = merge(m, {'metricValue': 2, 'metricName': 'ord.t1.actual'})
    mt1p = merge(m, {'metricValue': 0, 'metricName': 'ord.t1.pending'})
    mt2d = merge(m, {'metricValue': 104, 'metricName': 'ord.t2.desired'})
    mt2a = merge(m, {'metricValue': 24, 'metricName': 'ord.t2.actual'})
    mt2p = merge(m, {'metricValue': 1, 'metricName': 'ord.t2.pending'})
    mt3d = merge(m, {'metricValue': 5, 'metricName': 'ord.t3.desired'})
    mt3a = merge(m, {'metricValue': 3, 'metricName': 'ord.t3.actual'})
    mt3p = merge(m, {'metricValue': 0, 'metricName': 'ord.t3.pending'})
    cd = merge(m, {'metricValue': 109, 'metricName': 'ord.conv_desired'})
    ca = merge(m, {'metricValue': 27, 'metricName': 'ord.conv_actual'})
    cdiv = merge(m, {'metricValue': 82,
                     'metricName': 'ord.conv_divergence'})
    req_data = [md, ma, mp, mt, mg, mt1d, mt1a, mt1p, mt2d, mt2a, mt2p,
                mt3d, mt3a, mt3p, cd, ca, cdiv]
    log = mock_log()
    seq = [(Func(time.time), const(100)),
           (service_request(
               ServiceType.CLOUD_METRICS_INGEST, "POST", "ingest",
               data=req_data, log=log).intent, noop)]
    eff = add_to_cloud_metrics(
        m['ttlInSeconds'], 'ord', metrics,
        3,  # number of tenants
        config, log)
    self.assertIsNone(perform_sequence(seq, eff))
    log.msg.assert_called_once_with(
        'total desired: {td}, total_actual: {ta}, total pending: {tp}',
        td=112, ta=29, tp=1)
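# A quick, standalone check (not part of the test) of where the expected
# values above come from, using the same (desired, actual, pending) figures
# from the GroupMetrics fixtures:
groups = {'t1': [(3, 2, 0)],
          't2': [(4, 4, 1), (100, 20, 0)],
          't3': [(5, 3, 0)]}
totals = [sum(g[i] for gs in groups.values() for g in gs) for i in range(3)]
print(totals)  # [112, 29, 1] -> ord.desired, ord.actual, ord.pending
conv = {t: gs for t, gs in groups.items() if t != 't1'}  # t1 is excluded
conv_desired = sum(g[0] for gs in conv.values() for g in gs)  # 104 + 5 = 109
conv_actual = sum(g[1] for gs in conv.values() for g in gs)   # 24 + 3 = 27
print(conv_desired, conv_actual, conv_desired - conv_actual)  # 109 27 82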
def as_effect(self):
    """Produce a :obj:`Effect` to create a stack."""
    eff = Effect(Func(uuid4))

    def got_uuid(uuid):
        stack_config = append_stack_uuid(self.stack_config, uuid)
        return create_stack(thaw(stack_config)).on(
            _success_reporter('Waiting for stack to create'))

    return eff.on(got_uuid)
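# A minimal sketch of the Effect(...).on(callback) pattern used above: the
# callback receives the result of the inner intent and may return a plain
# value or another Effect, which is then performed in turn. Performed here
# with effect's built-in base_dispatcher, which knows how to run Func.
from uuid import uuid4

from effect import Effect, Func, base_dispatcher, sync_perform

eff = Effect(Func(uuid4)).on(lambda u: "stack_{}".format(u.hex[:8]))
print(sync_perform(base_dispatcher, eff))  # e.g. stack_3f2a9c1d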
def add_to_cloud_metrics(ttl, region, group_metrics, num_tenants, config,
                         log=None, _print=False):
    """
    Add total number of desired, actual and pending servers of a region
    to Cloud Metrics.

    :param int ttl: TTL of the metrics, in seconds
    :param str region: which region's metric is collected
    :param group_metrics: List of :obj:`GroupMetrics`
    :param int num_tenants: total number of tenants
    :param dict config: Config json dict containing convergence tenants info
    :param log: Optional logger
    :param bool _print: Should it print activity on stdout? Useful when
        running as a script

    :return: `Effect` with None
    """
    epoch = yield Effect(Func(time.time))
    metric_part = {'collectionTime': int(epoch * 1000),
                   'ttlInSeconds': ttl}

    tenanted_metrics, total = calc_total(group_metrics)
    if log is not None:
        log.msg(
            'total desired: {td}, total_actual: {ta}, total pending: {tp}',
            td=total.desired, ta=total.actual, tp=total.pending)
    if _print:
        print('total desired: {}, total actual: {}, total pending: {}'.format(
            total.desired, total.actual, total.pending))

    metrics = [('desired', total.desired), ('actual', total.actual),
               ('pending', total.pending), ('tenants', num_tenants),
               ('groups', len(group_metrics))]
    for tenant_id, metric in sorted(tenanted_metrics.items()):
        metrics.append(("{}.desired".format(tenant_id), metric.desired))
        metrics.append(("{}.actual".format(tenant_id), metric.actual))
        metrics.append(("{}.pending".format(tenant_id), metric.pending))

    # convergence tenants desired and actual
    conv_tenants = keyfilter(
        partial(tenant_is_enabled,
                get_config_value=lambda k: get_in([k], config)),
        tenanted_metrics)
    conv_desired = sum(m.desired for m in conv_tenants.itervalues())
    conv_actual = sum(m.actual for m in conv_tenants.itervalues())
    metrics.extend(
        [("conv_desired", conv_desired), ("conv_actual", conv_actual),
         ("conv_divergence", conv_desired - conv_actual)])

    data = [merge(metric_part,
                  {'metricValue': value,
                   'metricName': '{}.{}'.format(region, metric)})
            for metric, value in metrics]
    yield service_request(ServiceType.CLOUD_METRICS_INGEST,
                          'POST', 'ingest', data=data, log=log)
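# A standalone illustration of one entry in the ingest payload built by the
# list comprehension above; the values mirror the test fixtures elsewhere in
# this section and are illustrative only.
metric_part = {'collectionTime': 100 * 1000,          # epoch milliseconds
               'ttlInSeconds': 5 * 24 * 60 * 60}
entry = dict(metric_part, metricValue=112,
             metricName='{}.{}'.format('ord', 'desired'))
print(entry)
# {'collectionTime': 100000, 'ttlInSeconds': 432000,
#  'metricValue': 112, 'metricName': 'ord.desired'}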
def test_acquire_blocking_success(self):
    """
    acquire_eff creates a child, realizes it's not the smallest and tries
    again every 0.1 seconds until it succeeds.
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), const("/testlock")),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000001")),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Func(time.time), const(0)),
           (Delay(0.1), noop),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Func(time.time), const(0.2)),
           (Delay(0.1), noop),
           (GetChildren("/testlock"), const(["prefix0000000001"]))]
    self.assertTrue(perform_sequence(seq, self.lock.acquire_eff(True, 1)))
def as_effect(self):
    """Produce a :obj:`Effect` to create a server."""
    eff = Effect(Func(generate_server_name))

    def got_name(random_name):
        server_config = set_server_name(self.server_config, random_name)
        return create_server(thaw(server_config))

    return eff.on(got_name).on(
        success=_success_reporter('waiting for server to become active'),
        error=_failure_reporter(CreateServerConfigurationError,
                                CreateServerOverQuoteError))
def get_recently_converged_groups(recently_converged, interval):
    """
    Return a list of recently converged groups, and garbage-collect any
    groups in the recently_converged map that are no longer 'recent'.
    """
    # STM would be cool but this is synchronous so whatever
    recent = yield recently_converged.read()
    now = yield Effect(Func(time.time))
    to_remove = [group for group in recent if now - recent[group] > interval]
    cleaned = reduce(lambda m, g: m.remove(g), to_remove, recent)
    if recent != cleaned:
        yield recently_converged.modify(lambda _: cleaned)
    yield do_return(cleaned.keys())
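# A standalone sketch of the garbage-collection step above, using a plain
# pyrsistent pmap in place of the Reference-wrapped map; the timestamps and
# interval are illustrative values, not real data.
from functools import reduce

from pyrsistent import pmap

interval, now = 15, 100
recent = pmap({'g1': 80, 'g2': 95})   # group id -> time it converged
to_remove = [g for g in recent if now - recent[g] > interval]
cleaned = reduce(lambda m, g: m.remove(g), to_remove, recent)
print(list(cleaned.keys()))  # ['g2'] - 'g1' converged too long ago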
def test_acquire_blocking_no_timeout(self):
    """
    When acquire_eff is called without a timeout, it creates a child,
    realizes it's not the smallest, tries again every 0.1 seconds without
    checking the time and succeeds once it is the smallest node.
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), const("/testlock")),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000001")),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Func(time.time), const(0)),
           (Delay(0.1), noop),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (Delay(0.1), noop),
           (GetChildren("/testlock"), const(["prefix0000000001"]))]
    self.assertTrue(
        perform_sequence(seq, self.lock.acquire_eff(True, None)))
def group_steps(group):
    """
    Return Effect of list of steps that would be performed on the group
    if convergence is triggered on it with desired=actual
    """
    now_dt = yield Effect(Func(datetime.utcnow))
    all_data_eff = convergence_exec_data(
        group["tenantId"], group["groupId"], now_dt, get_executor)
    all_data = yield Effect(TenantScope(all_data_eff, group["tenantId"]))
    (executor, scaling_group, group_state, desired_group_state,
     resources) = all_data
    desired_group_state.desired = len(resources['servers'])
    steps = executor.plan(desired_group_state, datetime_to_epoch(now_dt),
                          3600, **resources)
    yield do_return(steps)
def test_acquire_success(self):
    """
    acquire_eff creates child and gets lock as it is the smallest one
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), conste(NodeExistsError())),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000000")),
           (GetChildren("/testlock"), const(["prefix0000000000"]))]
    self.assertTrue(
        perform_sequence(seq, self.lock.acquire_eff(False, None)))
def test_acquire_create_path_success(self):
    """
    acquire_eff creates provided path if it doesn't exist
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), const("/testlock")),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000000")),
           (GetChildren("/testlock"), const(["prefix0000000000"]))]
    self.assertTrue(
        perform_sequence(seq, self.lock.acquire_eff(False, None)))
def __call__(self, exc_info):
    """
    Determine whether retry should occur, based on the exception info.
    """
    exc_type, exc_value, exc_traceback = exc_info
    failure = Failure(exc_value, exc_type, exc_traceback)

    def doit():
        if self.can_retry(failure):
            interval = self.next_interval(failure)
            return Effect(Delay(interval)).on(lambda r: True)
        else:
            return False

    return Effect(Func(doit))
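# A minimal sketch of a should_retry callable compatible with the Retry
# intent exercised in the tests below: it receives a sys.exc_info()-style
# tuple and returns an Effect of bool. Unlike the policy above it uses
# Constant instead of Delay, so no waiting happens; the name
# ``retry_only_value_errors`` is hypothetical.
from effect import Constant, Effect


def retry_only_value_errors(exc_info):
    exc_type, exc_value, exc_traceback = exc_info
    return Effect(Constant(exc_type is ValueError))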
def test_acquire_delete_child(self):
    """
    acquire_eff deletes existing child if it exists
    """
    self.lock._node = "/testlock/prefix000000002"
    seq = [(DeleteNode(path="/testlock/prefix000000002", version=-1), noop),
           (zk.CreateNode("/testlock"), conste(NodeExistsError())),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000000")),
           (GetChildren("/testlock"), const(["prefix0000000000"]))]
    self.assertTrue(
        perform_sequence(seq, self.lock.acquire_eff(False, None)))
def test_perform_retry_retries_on_error(self):
    """
    When the specified effect raises, it is retried when should_retry
    returns an Effect of True.
    """
    func = _repeated_effect_func(
        lambda: _raise(RuntimeError("foo")),
        lambda: "final")

    def should_retry(exc_info):
        if (exc_info[0] is RuntimeError
                and exc_info[1].message == "foo"):
            return Effect(Constant(True))
        else:
            return Effect(Constant(False))

    retry = Retry(effect=Effect(Func(func)), should_retry=should_retry)
    result = sync_perform(self.dispatcher, Effect(retry))
    self.assertEqual(result, "final")
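# The helpers used above are not shown in this excerpt; a plausible sketch
# (an assumption, not the project's actual definitions) that matches how the
# test uses them: the wrapped Func raises on the first call and returns
# "final" on the retry.
def _raise(exc):
    raise exc


def _repeated_effect_func(*funcs):
    """Return a zero-argument callable that invokes the next func per call."""
    remaining = list(funcs)

    def func():
        return remaining.pop(0)()
    return func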
def test_one_retry(self):
    """
    Retry the effect if it fails once.
    """
    divisors = [0, 1]

    def tester():
        x = divisors.pop(0)
        return 1 / x

    seq = [
        (Delay(1), lambda ignore: None),
    ]

    retrier = retry_effect_with_timeout(Effect(Func(tester)), 10,
                                        time=self.get_time())
    result = perform_sequence(seq, retrier)
    self.assertEqual(result, 1 / 1)
def test_acquire_other_error(self):
    """
    If acquire_eff internally raises any error then it tries to delete
    child node before returning.
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), const("/testlock")),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000001")),
           (GetChildren("/testlock"), conste(SessionExpiredError())),
           (DeleteNode(path="/testlock/prefix0000000001", version=-1),
            conste(SessionExpiredError()))]
    self.assertRaises(SessionExpiredError, perform_sequence, seq,
                      self.lock.acquire_eff(True, 0.3))
def test_normal_use(self):
    """Tests normal usage."""
    stack_config = pmap({'stack_name': 'baz', 'foo': 'bar'})
    new_stack_config = pmap({'stack_name': 'baz_foo', 'foo': 'bar'})

    self.create = CreateStack(stack_config)
    self.seq = [
        (Func(uuid4), lambda _: 'foo'),
        (create_stack(thaw(new_stack_config)).intent,
         lambda _: (StubResponse(200, {}), {'stack': {}})),
        (Log('request-create-stack', ANY), lambda _: None)
    ]

    reason = 'Waiting for stack to create'
    result = perform_sequence(self.seq, self.create.as_effect())
    self.assertEqual(
        result, (StepResult.RETRY, [ErrorReason.String(reason)]))
def convergence_succeeded(executor, scaling_group, group_state, resources):
    """
    Handle convergence success
    """
    if group_state.status == ScalingGroupStatus.DELETING:
        # servers have been deleted. Delete the group for real
        yield Effect(DeleteGroup(tenant_id=scaling_group.tenant_id,
                                 group_id=scaling_group.uuid))
        yield do_return(ConvergenceIterationStatus.GroupDeleted())
    elif group_state.status == ScalingGroupStatus.ERROR:
        yield Effect(UpdateGroupStatus(scaling_group=scaling_group,
                                       status=ScalingGroupStatus.ACTIVE))
        yield cf_msg('group-status-active',
                     status=ScalingGroupStatus.ACTIVE.name)
    # update servers cache with latest servers.
    # See [Convergence servers cache] comment on top of the file.
    now = yield Effect(Func(datetime.utcnow))
    yield executor.update_cache(scaling_group, now, include_deleted=False,
                                **resources)
    yield do_return(ConvergenceIterationStatus.Stop())
def test_acquire_nonblocking_fails(self):
    """
    acquire_eff creates a child and, when blocking=False, returns False
    immediately after finding it's not the smallest child. It deletes the
    child node before returning.
    """
    seq = [(Constant(None), noop),
           (zk.CreateNode("/testlock"), const("/testlock")),
           (Func(uuid.uuid4), const("prefix")),
           (zk.CreateNode(
                "/testlock/prefix", value="id",
                ephemeral=True, sequence=True),
            const("/testlock/prefix0000000001")),
           (GetChildren("/testlock"),
            const(["prefix0000000000", "prefix0000000001"])),
           (DeleteNode(path="/testlock/prefix0000000001", version=-1), noop)]
    self.assertFalse(
        perform_sequence(seq, self.lock.acquire_eff(False, None)))
def test_create_server_noname(self):
    """
    :obj:`CreateServer.as_effect`, when no name is provided in the launch
    config, will generate the name from scratch. This only verifies intent;
    result reporting is tested in :meth:`test_create_server`.
    """
    create = CreateServer(
        server_config=freeze({'server': {'flavorRef': '1'}}))
    eff = create.as_effect()
    self.assertEqual(eff.intent, Func(generate_server_name))
    eff = resolve_effect(eff, 'random-name')
    self.assertEqual(
        eff.intent,
        service_request(
            ServiceType.CLOUD_SERVERS, 'POST', 'servers',
            data={'server': {'name': 'random-name', 'flavorRef': '1'}},
            success_pred=has_code(202),
            reauth_codes=(401,)).intent)
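# A minimal standalone illustration of the resolve_effect pattern used above
# (resolve_effect comes from effect.testing): it feeds a pretend result into
# the outermost intent's callbacks without performing the intent, and when a
# callback returns another Effect, that Effect is handed back so the next
# intent in the chain can be inspected. The lambdas here are illustrative.
from effect import Effect, Func
from effect.testing import resolve_effect

eff = Effect(Func(lambda: "ignored")).on(
    lambda name: Effect(Func(lambda: name.upper())))
# The inner Func is never actually run; pretend it returned 'random-name'.
next_eff = resolve_effect(eff, "random-name")
print(next_eff.intent)  # a Func intent that would produce 'RANDOM-NAME'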