def start_cluster(self, reactor):
    """
    Provision cloud cluster for acceptance tests.

    :return Cluster: The cluster to connect to for acceptance tests.
    """
    metadata = {
        'purpose': 'acceptance-testing',
        'distribution': self.distribution,
    }
    metadata.update(self.metadata)

    for index in range(self.num_nodes):
        name = "acceptance-test-%s-%d" % (self.creator, index)
        try:
            print "Creating node %d: %s" % (index, name)
            node = self.provisioner.create_node(
                name=name,
                distribution=self.distribution,
                metadata=metadata,
            )
        except:
            print "Error creating node %d: %s" % (index, name)
            print "It may have leaked into the cloud."
            raise

        yield remove_known_host(reactor, node.address)
        self.nodes.append(node)
        del node

    commands = parallel([
        node.provision(package_source=self.package_source,
                       variants=self.variants)
        for node in self.nodes
    ])
    if self.dataset_backend == DatasetBackend.zfs:
        zfs_commands = parallel([
            configure_zfs(node, variants=self.variants)
            for node in self.nodes
        ])
        commands = commands.on(success=lambda _: zfs_commands)

    yield perform(make_dispatcher(reactor), commands)

    cluster = yield configured_cluster_for_nodes(
        reactor,
        generate_certificates(
            make_cluster_id(
                TestTypes.ACCEPTANCE,
                _provider_for_cluster_id(self.dataset_backend),
            ),
            self.nodes),
        self.nodes,
        self.dataset_backend,
        self.dataset_backend_configuration,
        _save_backend_configuration(self.dataset_backend,
                                    self.dataset_backend_configuration)
    )
    returnValue(cluster)
def get_clb_contents():
    """
    Get Rackspace Cloud Load Balancer contents as list of `CLBNode`. CLB
    health monitor information is also returned as a pmap of :obj:`CLB`
    objects mapped on LB ID.

    :return: Effect of (``list`` of :obj:`CLBNode`, `pmap` of :obj:`CLB`)
    :rtype: :obj:`Effect`
    """
    # If we get a CLBNotFoundError while fetching feeds, we should throw away
    # all nodes related to that load balancer, because we don't want to act on
    # data that we know is invalid/outdated (for example, if we can't fetch a
    # feed because CLB was deleted, we don't want to say that we have a node
    # in DRAINING with draining time of 0; we should just say that the node
    # is gone).
    def gone(r):
        return catch(CLBNotFoundError, lambda exc: r)

    lb_ids = [lb['id'] for lb in (yield _retry(get_clbs()))]
    node_reqs = [_retry(get_clb_nodes(lb_id).on(error=gone([])))
                 for lb_id in lb_ids]
    healthmon_reqs = [
        _retry(get_clb_health_monitor(lb_id).on(error=gone(None)))
        for lb_id in lb_ids]
    all_nodes_hms = yield parallel(node_reqs + healthmon_reqs)
    all_nodes, hms = all_nodes_hms[:len(lb_ids)], all_nodes_hms[len(lb_ids):]
    lb_nodes = {
        lb_id: [CLBNode.from_node_json(lb_id, node) for node in nodes]
        for lb_id, nodes in zip(lb_ids, all_nodes)}
    clbs = {
        str(lb_id): CLB(bool(health_mon))
        for lb_id, health_mon in zip(lb_ids, hms) if health_mon is not None}
    draining = [n for n in concat(lb_nodes.values())
                if n.description.condition == CLBNodeCondition.DRAINING]
    feeds = yield parallel(
        [_retry(get_clb_node_feed(n.description.lb_id, n.node_id).on(
            error=gone(None)))
         for n in draining]
    )
    nodes_to_feeds = dict(zip(draining, feeds))
    deleted_lbs = set([
        node.description.lb_id
        for (node, feed) in nodes_to_feeds.items() if feed is None])

    def update_drained_at(node):
        feed = nodes_to_feeds.get(node)
        if node.description.lb_id in deleted_lbs:
            return None
        if feed is not None:
            node.drained_at = extract_clb_drained_at(feed)
        return node

    nodes = map(update_drained_at, concat(lb_nodes.values()))
    yield do_return((
        list(filter(bool, nodes)),
        pmap(keyfilter(lambda k: k not in deleted_lbs, clbs))))
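# A minimal plain-Python sketch (no Effect machinery) of the ordering
# invariant get_clb_contents relies on above: parallel() returns results in
# the same order as its input effects, so node and health-monitor requests
# can share one batch and be split back apart by slicing at len(lb_ids).
# The literal values below are illustrative stand-ins, not real responses.
lb_ids = ['lb1', 'lb2']
all_nodes_hms = ['lb1-nodes', 'lb2-nodes', 'lb1-hm', 'lb2-hm']
all_nodes, hms = all_nodes_hms[:len(lb_ids)], all_nodes_hms[len(lb_ids):]
assert all_nodes == ['lb1-nodes', 'lb2-nodes']
assert hms == ['lb1-hm', 'lb2-hm']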
def get_clb_contents():
    """Get Rackspace Cloud Load Balancer contents as list of `CLBNode`."""
    # If we get a CLBNotFoundError while fetching feeds, we should throw away
    # all nodes related to that load balancer, because we don't want to act on
    # data that we know is invalid/outdated (for example, if we can't fetch a
    # feed because CLB was deleted, we don't want to say that we have a node
    # in DRAINING with draining time of 0; we should just say that the node
    # is gone).
    def gone(r):
        return catch(CLBNotFoundError, lambda exc: r)

    lb_ids = [lb['id'] for lb in (yield _retry(get_clbs()))]
    node_reqs = [
        _retry(get_clb_nodes(lb_id).on(error=gone([])))
        for lb_id in lb_ids
    ]
    all_nodes = yield parallel(node_reqs)
    lb_nodes = {
        lb_id: [CLBNode.from_node_json(lb_id, node) for node in nodes]
        for lb_id, nodes in zip(lb_ids, all_nodes)
    }
    draining = [
        n for n in concat(lb_nodes.values())
        if n.description.condition == CLBNodeCondition.DRAINING
    ]
    feeds = yield parallel([
        _retry(
            get_clb_node_feed(n.description.lb_id,
                              n.node_id).on(error=gone(None)))
        for n in draining
    ])
    nodes_to_feeds = dict(zip(draining, feeds))
    deleted_lbs = set([
        node.description.lb_id
        for (node, feed) in nodes_to_feeds.items() if feed is None
    ])

    def update_drained_at(node):
        feed = nodes_to_feeds.get(node)
        if node.description.lb_id in deleted_lbs:
            return None
        if feed is not None:
            return assoc_obj(node, drained_at=extract_CLB_drained_at(feed))
        else:
            return node

    nodes = map(update_drained_at, concat(lb_nodes.values()))
    yield do_return(list(filter(bool, nodes)))
def get_all_launch_server_data(
        tenant_id,
        group_id,
        now,
        get_scaling_group_servers=get_scaling_group_servers,
        get_clb_contents=get_clb_contents,
        get_rcv3_contents=get_rcv3_contents):
    """
    Gather all launch_server data relevant for convergence w.r.t given time,
    in parallel where possible.

    Returns an Effect of {'servers': [NovaServer], 'lb_nodes': [LBNode],
                          'lbs': pmap(LB_ID -> CLB)}.
    """
    return parallel([
        get_scaling_group_servers(tenant_id, group_id, now).on(
            map(NovaServer.from_server_details_json)).on(list),
        get_clb_contents(),
        get_rcv3_contents()
    ]).on(
        lambda (servers, clb_nodes_and_clbs, rcv3_nodes): {
            'servers': servers,
            'lb_nodes': clb_nodes_and_clbs[0] + rcv3_nodes,
            'lbs': clb_nodes_and_clbs[1]
        })
def steps_to_effect(steps):
    """Turns a collection of :class:`IStep` providers into an effect."""
    # Treat unknown errors as RETRY.
    return parallel([
        s.as_effect().on(
            error=lambda e: (StepResult.RETRY, [ErrorReason.Exception(e)]))
        for s in steps])
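# A runnable sketch of the error-to-result pattern used by steps_to_effect:
# an .on(error=...) callback converts a failure into an ordinary success
# value, so one failed step cannot abort the whole parallel batch. Only the
# effect library's built-in Error intent and base_dispatcher are used;
# 'RETRY' stands in for StepResult.RETRY. Depending on the effect release,
# the error callback receives an exc_info tuple or the exception itself.
from effect import Effect, Error, base_dispatcher, sync_perform

eff = Effect(Error(RuntimeError('boom'))).on(error=lambda e: ('RETRY', e))
result = sync_perform(base_dispatcher, eff)
assert result[0] == 'RETRY'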
def _log_remove_from_clb(steps):
    lbs = groupby(lambda s: s.lb_id, steps)
    effs = [
        cf_msg('convergence-remove-clb-nodes', lb_id=lb,
               nodes=sorted(concat(s.node_ids for s in lbsteps)))
        for lb, lbsteps in sorted(lbs.iteritems())]
    return parallel(effs)
def _log_bulk_rcv3(event, steps):
    by_lbs = groupby(lambda s: s[0], concat(s.lb_node_pairs for s in steps))
    effs = [
        cf_msg(event, lb_id=lb_id, servers=sorted(p[1] for p in pairs))
        for lb_id, pairs in sorted(by_lbs.iteritems())
    ]
    return parallel(effs)
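# Sketch of the flatten-then-group step in _log_bulk_rcv3 above, with plain
# tuples standing in for steps (illustrative data only). toolz.concat
# flattens one level of nesting; toolz.groupby returns a dict keyed here on
# the lb_id element of each pair.
from toolz import concat, groupby

pairs_per_step = [[('lb1', 's1'), ('lb2', 's2')], [('lb1', 's3')]]
by_lbs = groupby(lambda p: p[0], concat(pairs_per_step))
assert sorted((lb, sorted(p[1] for p in pairs))
              for lb, pairs in by_lbs.items()) == \
    [('lb1', ['s1', 's3']), ('lb2', ['s2'])]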
def configure_cluster(cluster, dataset_backend_configuration):
    """
    Configure flocker-control, flocker-dataset-agent and
    flocker-container-agent on a collection of nodes.

    :param Cluster cluster: Description of the cluster to configure.

    :param dict dataset_backend_configuration: Configuration parameters to
        supply to the dataset backend.
    """
    return sequence([
        run_remotely(
            username='******',
            address=cluster.control_node.address,
            commands=sequence([
                task_install_control_certificates(
                    cluster.certificates.cluster.certificate,
                    cluster.certificates.control.certificate,
                    cluster.certificates.control.key),
                task_enable_flocker_control(
                    cluster.control_node.distribution),
                if_firewall_available(
                    cluster.control_node.distribution,
                    task_open_control_firewall(
                        cluster.control_node.distribution)),
            ]),
        ),
        parallel([
            sequence([
                run_remotely(
                    username='******',
                    address=node.address,
                    commands=sequence([
                        task_install_node_certificates(
                            cluster.certificates.cluster.certificate,
                            certnkey.certificate,
                            certnkey.key),
                        task_install_api_certificates(
                            cluster.certificates.user.certificate,
                            cluster.certificates.user.key),
                        task_enable_docker(node.distribution),
                        if_firewall_available(
                            node.distribution,
                            open_firewall_for_docker_api(node.distribution),
                        ),
                        task_configure_flocker_agent(
                            control_node=cluster.control_node.address,
                            dataset_backend=cluster.dataset_backend,
                            dataset_backend_configuration=(
                                dataset_backend_configuration),
                        ),
                        task_enable_docker_plugin(node.distribution),
                        task_enable_flocker_agent(
                            distribution=node.distribution,
                        ),
                    ]),
                ),
            ])
            for certnkey, node
            in zip(cluster.certificates.nodes, cluster.agent_nodes)
        ])
    ])
def start_nodes(self, reactor):
    """
    Provision cloud nodes for acceptance tests.

    :return list: List of addresses of nodes to connect to, for acceptance
        tests.
    """
    metadata = {
        'purpose': 'acceptance-testing',
        'distribution': self.distribution,
    }
    metadata.update(self.metadata)

    for index in range(2):
        name = "acceptance-test-%s-%d" % (self.creator, index)
        try:
            print "Creating node %d: %s" % (index, name)
            node = self.provisioner.create_node(
                name=name,
                distribution=self.distribution,
                metadata=metadata,
            )
        except:
            print "Error creating node %d: %s" % (index, name)
            print "It may have leaked into the cloud."
            raise

        yield remove_known_host(reactor, node.address)
        self.nodes.append(node)
        del node

    commands = parallel([
        node.provision(package_source=self.package_source,
                       variants=self.variants)
        for node in self.nodes
    ])
    if self.dataset_backend == DatasetBackend.zfs:
        zfs_commands = parallel([
            configure_zfs(node, variants=self.variants)
            for node in self.nodes
        ])
        commands = commands.on(success=lambda _: zfs_commands)
    yield perform(make_dispatcher(reactor), commands)

    returnValue(self.nodes)
def groups_steps(groups, reactor, store, cass_client, authenticator, conf):
    """
    Return [(group, steps)] list
    """
    eff = parallel(map(group_steps, groups))
    disp = get_full_dispatcher(
        reactor, authenticator, mock_log(), get_service_configs(conf),
        "kzclient", store, "supervisor", cass_client)
    return perform(disp, eff).addCallback(lambda steps: zip(groups, steps))
def groups_steps(groups, reactor, store, cass_client, authenticator, conf):
    """
    Return [(group, steps)] list
    """
    eff = parallel(map(group_steps, groups))
    disp = get_full_dispatcher(reactor, authenticator, mock_log(),
                               get_service_configs(conf),
                               "kzclient", store, "supervisor", cass_client)
    return perform(disp, eff).addCallback(lambda steps: zip(groups, steps))
def get_orgs_repos(name):
    """
    Fetch ALL of the repos that a user has access to, in any organization.
    """
    req = get_orgs(name)
    req = req.on(
        success=lambda org_names: parallel(map(get_org_repos, org_names)))
    req = req.on(success=lambda repo_lists: reduce(operator.add, repo_lists))
    return req
def _log_remove_from_clb(steps):
    lbs = groupby(lambda s: s.lb_id, steps)
    effs = [
        cf_msg('convergence-remove-clb-nodes', lb_id=lb,
               nodes=sorted(concat(s.node_ids for s in lbsteps)))
        for lb, lbsteps in sorted(lbs.iteritems())
    ]
    return parallel(effs)
def conv_pause_group_eff(group, transaction_id):
    """
    Pause scaling group of convergence enabled tenant
    """
    eff = parallel([Effect(ModifyGroupStatePaused(group, True)),
                    delete_divergent_flag(group.tenant_id, group.uuid, -1)])
    return with_log(eff, transaction_id=transaction_id,
                    tenant_id=group.tenant_id,
                    scaling_group_id=group.uuid).on(lambda _: None)
def test_parallel(self):
    """
    'parallel' results in a list of results of the given effects, in the
    same order that they were passed to parallel.
    """
    d = perform(
        _dispatcher(None),
        parallel([Effect(Constant('a')),
                  Effect(Constant('b'))]))
    self.assertEqual(self.successResultOf(d), ['a', 'b'])
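# parallel() itself is dispatcher-independent: it only wraps its effects in
# a single ParallelEffects intent; the performer (Twisted, asyncio, or
# thread-pool based, as the tests in this listing show) supplies the actual
# concurrency. Building one without performing it:
from effect import Constant, Effect, parallel

eff = parallel([Effect(Constant('a')), Effect(Constant('b'))])
assert type(eff.intent).__name__ == 'ParallelEffects'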
def on_listing_pools(lblist_result):
    _, body = lblist_result
    return parallel([
        service_request(ServiceType.RACKCONNECT_V3, 'GET',
                        append_segments('load_balancer_pools',
                                        lb_pool['id'], 'nodes')).on(
            partial(on_listing_nodes,
                    RCv3Description(lb_id=lb_pool['id'])))
        for lb_pool in body
    ])
def conv_resume_group_eff(trans_id, group):
    """
    Resume scaling group of convergence enabled tenant
    """
    eff = parallel([
        Effect(ModifyGroupStatePaused(group, False)),
        mark_divergent(group.tenant_id, group.uuid).on(
            lambda _: msg("mark-dirty-success"))])
    return with_log(eff, transaction_id=trans_id, tenant_id=group.tenant_id,
                    scaling_group_id=group.uuid).on(lambda _: None)
def get_orgs_repos(name):
    """
    Fetch ALL of the repos that a user has access to, in any organization.

    :return: An Effect resulting in a list of repositories.
    """
    req = get_orgs(name)
    req = req.on(lambda org_names: parallel(map(get_org_repos, org_names)))
    req = req.on(lambda repo_lists: reduce(operator.add, repo_lists))
    return req
def _log_set_metadata(steps):
    by_kv = groupby(lambda s: (s.key, s.value), steps)
    effs = [
        cf_msg('convergence-set-server-metadata',
               servers=sorted(s.server_id for s in kvsteps),
               key=key, value=value)
        for (key, value), kvsteps in sorted(by_kv.iteritems())
    ]
    return parallel(effs)
def _(steps):
    by_cfg = groupby(lambda s: s.server_config, steps)
    effs = [
        cf_msg(
            'convergence-create-servers',
            num_servers=len(cfg_steps),
            server_config=dict(cfg))
        # We sort the items with `thaw` because PMap does not support
        # comparison
        for cfg, cfg_steps in sorted(by_cfg.iteritems(), key=thaw)]
    return parallel(effs)
def _log_add_nodes_clb(steps):
    lbs = defaultdict(list)
    for step in steps:
        for (address, config) in step.address_configs:
            lbs[step.lb_id].append('%s:%s' % (address, config.port))
    effs = [
        cf_msg('convergence-add-clb-nodes',
               lb_id=lb_id, addresses=sorted(addresses))
        for lb_id, addresses in sorted(lbs.iteritems())
    ]
    return parallel(effs)
def get_orgs_repos(name):
    """
    Fetch ALL of the repos that a user has access to, in any organization.
    """
    req = get_orgs(name)
    req = req.on(
        success=lambda org_names: parallel(map(get_org_repos, org_names)))
    req = req.on(
        success=lambda repo_lists: reduce(operator.add, repo_lists))
    return req
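# The get_orgs_repos variants above share one shape: one request for the
# organization names, fanned out with parallel(), then folded into a single
# list. A toy version with Constant intents standing in for the GitHub
# requests (hypothetical data; the chain is built but not performed here,
# since performing ParallelEffects needs a dispatcher that supports them):
import operator
from effect import Constant, Effect, parallel

req = Effect(Constant(['org-a', 'org-b']))
req = req.on(success=lambda org_names: parallel(
    [Effect(Constant(['%s/repo' % name])) for name in org_names]))
req = req.on(success=lambda repo_lists: reduce(operator.add, repo_lists))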
def _log_set_metadata(steps):
    by_kv = groupby(lambda s: (s.key, s.value), steps)
    effs = [
        cf_msg(
            'convergence-set-server-metadata',
            servers=sorted(s.server_id for s in kvsteps),
            key=key,
            value=value
        )
        for (key, value), kvsteps in sorted(by_kv.iteritems())
    ]
    return parallel(effs)
def _(steps):
    by_cfg = groupby(lambda s: s.server_config, steps)
    effs = [
        cf_msg('convergence-create-servers',
               num_servers=len(cfg_steps),
               server_config=dict(cfg))
        # We sort the items with `thaw` because PMap does not support
        # comparison
        for cfg, cfg_steps in sorted(by_cfg.iteritems(), key=thaw)
    ]
    return parallel(effs)
def _log_change_clb_node(steps):
    lbs = groupby(lambda s: (s.lb_id, s.condition, s.weight, s.type), steps)
    effs = [
        cf_msg('convergence-change-clb-nodes',
               lb_id=lb,
               nodes=sorted([s.node_id for s in grouped_steps]),
               condition=condition.name, weight=weight,
               type=node_type.name)
        for (lb, condition, weight, node_type), grouped_steps
        in sorted(lbs.iteritems())
    ]
    return parallel(effs)
def perform_get_children_with_stats(kz_client, dispatcher, intent):
    """
    Perform :obj:`GetChildrenWithStats`. Must be partialed with
    ``kz_client``.

    :param kz_client: txKazoo client
    :param dispatcher: dispatcher, supplied by perform
    :param GetChildrenWithStats intent: the intent
    """
    path = intent.path
    children = yield Effect(GetChildren(path))
    stats = yield parallel(Effect(GetStat(path + "/" + p)) for p in children)
    yield do_return([c_and_s for c_and_s in zip(children, stats)
                     if c_and_s[1] is not None])
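# perform_get_children_with_stats, like the other yield/do_return functions
# in this listing, is a generator-style effect function; in its source it is
# decorated with @do (omitted in these excerpts) so that each
# `yield Effect(...)` resumes with that effect's result and do_return
# delivers the final value. Minimal shape of the same idiom:
from effect import Constant, Effect
from effect.do import do, do_return

@do
def _example():
    x = yield Effect(Constant(1))
    yield do_return(x + 1)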
def conv_pause_group_eff(group, transaction_id):
    """
    Pause scaling group of convergence enabled tenant
    """
    eff = parallel([
        Effect(ModifyGroupStatePaused(group, True)),
        delete_divergent_flag(group.tenant_id, group.uuid, -1)
    ])
    return with_log(eff, transaction_id=transaction_id,
                    tenant_id=group.tenant_id,
                    scaling_group_id=group.uuid).on(lambda _: None)
def on_listing_pools(lblist_result):
    _, body = lblist_result
    return parallel([
        service_request(
            ServiceType.RACKCONNECT_V3, 'GET',
            append_segments('load_balancer_pools',
                            lb_pool['id'], 'nodes')).on(
            partial(
                on_listing_nodes,
                RCv3Description(lb_id=lb_pool['id'])))
        for lb_pool in body
    ])
def configure_cluster(cluster, dataset_backend_configuration):
    """
    Configure flocker-control, flocker-dataset-agent and
    flocker-container-agent on a collection of nodes.

    :param Cluster cluster: Description of the cluster to configure.

    :param dict dataset_backend_configuration: Configuration parameters to
        supply to the dataset backend.
    """
    return sequence([
        run_remotely(
            username='******',
            address=cluster.control_node.address,
            commands=sequence([
                task_install_control_certificates(
                    cluster.certificates.cluster.certificate,
                    cluster.certificates.control.certificate,
                    cluster.certificates.control.key),
                task_enable_flocker_control(
                    cluster.control_node.distribution),
            ]),
        ),
        parallel([
            sequence([
                run_remotely(
                    username='******',
                    address=node.address,
                    commands=sequence([
                        task_install_node_certificates(
                            cluster.certificates.cluster.certificate,
                            certnkey.certificate,
                            certnkey.key),
                        task_install_api_certificates(
                            cluster.certificates.user.certificate,
                            cluster.certificates.user.key),
                        task_enable_docker(node.distribution),
                        task_configure_flocker_agent(
                            control_node=cluster.control_node.address,
                            dataset_backend=cluster.dataset_backend,
                            dataset_backend_configuration=(
                                dataset_backend_configuration
                            ),
                        ),
                        task_enable_docker_plugin(node.distribution),
                        task_enable_flocker_agent(
                            distribution=node.distribution,
                        )]),
                ),
            ])
            for certnkey, node
            in zip(cluster.certificates.nodes, cluster.agent_nodes)
        ])
    ])
def get_clb_contents():
    """Get Rackspace Cloud Load Balancer contents as list of `CLBNode`."""
    # If we get a CLBNotFoundError while fetching feeds, we should throw away
    # all nodes related to that load balancer, because we don't want to act on
    # data that we know is invalid/outdated (for example, if we can't fetch a
    # feed because CLB was deleted, we don't want to say that we have a node
    # in DRAINING with draining time of 0; we should just say that the node
    # is gone).
    def gone(r):
        return catch(CLBNotFoundError, lambda exc: r)

    lb_ids = [lb['id'] for lb in (yield _retry(get_clbs()))]
    node_reqs = [_retry(get_clb_nodes(lb_id).on(error=gone([])))
                 for lb_id in lb_ids]
    all_nodes = yield parallel(node_reqs)
    lb_nodes = {lb_id: [CLBNode.from_node_json(lb_id, node) for node in nodes]
                for lb_id, nodes in zip(lb_ids, all_nodes)}
    draining = [n for n in concat(lb_nodes.values())
                if n.description.condition == CLBNodeCondition.DRAINING]
    feeds = yield parallel(
        [_retry(get_clb_node_feed(n.description.lb_id, n.node_id).on(
            error=gone(None)))
         for n in draining]
    )
    nodes_to_feeds = dict(zip(draining, feeds))
    deleted_lbs = set([
        node.description.lb_id
        for (node, feed) in nodes_to_feeds.items() if feed is None])

    def update_drained_at(node):
        feed = nodes_to_feeds.get(node)
        if node.description.lb_id in deleted_lbs:
            return None
        if feed is not None:
            return assoc_obj(node, drained_at=extract_CLB_drained_at(feed))
        else:
            return node

    nodes = map(update_drained_at, concat(lb_nodes.values()))
    yield do_return(list(filter(bool, nodes)))
def conv_resume_group_eff(trans_id, group):
    """
    Resume scaling group of convergence enabled tenant
    """
    eff = parallel([
        Effect(ModifyGroupStatePaused(group, False)),
        mark_divergent(group.tenant_id,
                       group.uuid).on(lambda _: msg("mark-dirty-success"))
    ])
    return with_log(eff, transaction_id=trans_id, tenant_id=group.tenant_id,
                    scaling_group_id=group.uuid).on(lambda _: None)
async def test_parallel(self, dispatcher):
    """
    'parallel' results in a list of results of the given effects, in the
    same order that they were passed to parallel.
    """
    d = await asyncio_perform(
        dispatcher,
        parallel([
            Effect(Constant('a')),
            Effect(Delay(0.01)).on(
                success=lambda _: Effect(Constant('...'))),
            Effect(Constant('b'))
        ]))
    assert d == ['a', '...', 'b']
def perform_get_children_with_stats(kz_client, dispatcher, intent):
    """
    Perform :obj:`GetChildrenWithStats`. Must be partialed with
    ``kz_client``.

    :param kz_client: txKazoo client
    :param dispatcher: dispatcher, supplied by perform
    :param GetChildrenWithStats intent: the intent
    """
    path = intent.path
    children = yield Effect(GetChildren(path))
    stats = yield parallel(Effect(GetStat(path + '/' + p)) for p in children)
    yield do_return([
        c_and_s for c_and_s in zip(children, stats)
        if c_and_s[1] is not None
    ])
def log_steps(steps):
    """
    Log some steps (to cloud feeds).

    In general this tries to reduce the number of Log calls to a reasonable
    minimum, based on how steps are usually used. For example, multiple
    :obj:`SetMetadataItemOnServer` that are setting the same key/value on a
    server will be merged into one Log call that shows all the servers being
    affected.
    """
    steps_by_type = groupby(type, steps)
    effs = []
    for step_type, typed_steps in steps_by_type.iteritems():
        if step_type in _loggers:
            effs.append(_loggers[step_type](typed_steps))
    return parallel(effs)
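# log_steps dispatches on the concrete step class via the module-level
# _loggers table, which is assumed to map step types to the _log_* helpers
# defined earlier in this listing. The grouping itself is just toolz.groupby
# keyed on type(); illustrated with dummy classes:
from toolz import groupby

class _AddNode(object):
    pass

class _DeleteNode(object):
    pass

_steps = [_AddNode(), _DeleteNode(), _AddNode()]
assert {t.__name__: len(g) for t, g in groupby(type, _steps).items()} == \
    {'_AddNode': 2, '_DeleteNode': 1}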
def _run_on_all_nodes(nodes, task):
    """
    Run some commands on some nodes.

    :param nodes: An iterable of ``Node`` instances where the commands
        should be run.
    :param task: A one-argument callable which is called with each ``Node``
        and should return the ``Effect`` to run on that node.

    :return: An ``Effect`` that runs the commands on a group of nodes.
    """
    return parallel(
        list(
            run_remotely(
                username='******',
                address=node.address,
                commands=task(node),
            )
            for node in nodes))
async def test_parallel_with_error(self, dispatcher):
    """
    'parallel' raises FirstError when any of the effects passed to it
    fails, wrapping the first failure from the batch.
    """
    @do
    def fail():
        yield Effect(Delay(0.01))
        raise RuntimeError('My error')

    future = asyncio_perform(
        dispatcher,
        parallel([
            Effect(Delay(1)),
            Effect(Delay(1)),
            fail(),
        ]))
    with pytest.raises(FirstError):
        await future
def _run_on_all_nodes(nodes, task):
    """
    Run some commands on some nodes.

    :param nodes: An iterable of ``Node`` instances where the commands
        should be run.
    :param task: A one-argument callable which is called with each ``Node``
        and should return the ``Effect`` to run on that node.

    :return: An ``Effect`` that runs the commands on a group of nodes.
    """
    return parallel(list(
        run_remotely(
            username='******',
            address=node.address,
            commands=task(node),
        )
        for node in nodes
    ))
def get_all_convergence_data(
        tenant_id,
        group_id,
        now,
        get_scaling_group_servers=get_scaling_group_servers,
        get_clb_contents=get_clb_contents,
        get_rcv3_contents=get_rcv3_contents):
    """
    Gather all data relevant for convergence w.r.t given time,
    in parallel where possible.

    Returns an Effect of ([NovaServer], [LBNode]).
    """
    eff = parallel(
        [get_scaling_group_servers(tenant_id, group_id, now)
         .on(map(NovaServer.from_server_details_json)).on(list),
         get_clb_contents(),
         get_rcv3_contents()]
    ).on(lambda (servers, clb, rcv3): (servers, list(concat([clb, rcv3]))))
    return eff
def get_all_launch_server_data(
        tenant_id,
        group_id,
        now,
        get_scaling_group_servers=get_scaling_group_servers,
        get_clb_contents=get_clb_contents,
        get_rcv3_contents=get_rcv3_contents):
    """
    Gather all launch_server data relevant for convergence w.r.t given time,
    in parallel where possible.

    Returns an Effect of {'servers': [NovaServer], 'lb_nodes': [LBNode]}.
    """
    eff = parallel([
        get_scaling_group_servers(tenant_id, group_id, now).on(
            map(NovaServer.from_server_details_json)).on(list),
        get_clb_contents(),
        get_rcv3_contents()
    ]).on(lambda (servers, clb, rcv3): {
        'servers': servers,
        'lb_nodes': list(concat([clb, rcv3]))
    })
    return eff
def get_all_launch_server_data(
        tenant_id,
        group_id,
        now,
        get_scaling_group_servers=get_scaling_group_servers,
        get_clb_contents=get_clb_contents,
        get_rcv3_contents=get_rcv3_contents):
    """
    Gather all launch_server data relevant for convergence w.r.t given time,
    in parallel where possible.

    Returns an Effect of {'servers': [NovaServer], 'lb_nodes': [LBNode],
                          'lbs': pmap(LB_ID -> CLB)}.
    """
    return parallel(
        [get_scaling_group_servers(tenant_id, group_id, now)
         .on(map(NovaServer.from_server_details_json)).on(list),
         get_clb_contents(),
         get_rcv3_contents()]
    ).on(lambda (servers, clb_nodes_and_clbs, rcv3_nodes): {
        'servers': servers,
        'lb_nodes': clb_nodes_and_clbs[0] + rcv3_nodes,
        'lbs': clb_nodes_and_clbs[1]
    })
def main(reactor, args, base_path, top_level):
    """
    :param reactor: Reactor to use.
    :param list args: The arguments passed to the script.
    :param FilePath base_path: The executable being run.
    :param FilePath top_level: The top-level of the flocker repository.
    """
    options = RunOptions(top_level=top_level)

    add_destination(eliot_output)
    try:
        options.parseOptions(args)
    except UsageError as e:
        sys.stderr.write("%s: %s\n" % (base_path.basename(), e))
        raise SystemExit(1)

    runner = options.runner

    try:
        nodes = yield runner.start_nodes(reactor)

        ca_directory = FilePath(mkdtemp())
        print("Generating certificates in: {}".format(ca_directory.path))
        certificates = Certificates.generate(ca_directory,
                                             nodes[0].address,
                                             len(nodes))

        yield perform(
            make_dispatcher(reactor),
            parallel([
                run_remotely(
                    username='******',
                    address=node.address,
                    commands=task_pull_docker_images()
                ) for node in nodes
            ]),
        )

        control_node = nodes[0]
        dataset_backend = options.dataset_backend

        yield perform(
            make_dispatcher(reactor),
            configure_cluster(control_node=control_node, agent_nodes=nodes,
                              certificates=certificates,
                              dataset_backend=dataset_backend))

        result = yield run_tests(
            reactor=reactor,
            nodes=nodes,
            control_node=control_node,
            agent_nodes=nodes,
            dataset_backend=dataset_backend,
            trial_args=options['trial-args'],
            certificates_path=ca_directory)
    except:
        result = 1
        raise
    finally:
        # Unless the tests failed, and the user asked to keep the nodes, we
        # delete them.
        if not (result != 0 and options['keep']):
            runner.stop_nodes(reactor)
        elif options['keep']:
            print "--keep specified, not destroying nodes."
            print ("To run acceptance tests against these nodes, "
                   "set the following environment variables: ")

            environment_variables = {
                'FLOCKER_ACCEPTANCE_NODES':
                    ':'.join(node.address for node in nodes),
                'FLOCKER_ACCEPTANCE_CONTROL_NODE': control_node.address,
                'FLOCKER_ACCEPTANCE_AGENT_NODES':
                    ':'.join(node.address for node in nodes),
                'FLOCKER_ACCEPTANCE_VOLUME_BACKEND': dataset_backend.name,
                'FLOCKER_ACCEPTANCE_API_CERTIFICATES_PATH':
                    ca_directory.path,
            }

            for environment_variable in environment_variables:
                print "export {name}={value};".format(
                    name=environment_variable,
                    value=environment_variables[environment_variable],
                )

    raise SystemExit(result)
def start_cluster(self, reactor):
    """
    Provision cloud cluster for acceptance tests.

    :return Cluster: The cluster to connect to for acceptance tests.
    """
    metadata = {
        'distribution': self.distribution,
    }
    metadata.update(self.identity.metadata)
    metadata.update(self.metadata)

    # Try to make names unique even if the same creator is starting
    # multiple clusters at the same time. This lets other code use the
    # name as a way to identify nodes. This is only necessary in one
    # place, the node creation code, to perform cleanup when the create
    # operation fails in a way such that it isn't clear if the instance
    # has been created or not.
    random_tag = os.urandom(8).encode("base64").strip("\n=")
    print "Assigning random tag:", random_tag

    for index in range(self.num_nodes):
        name = "%s-%s-%s-%d" % (
            self.identity.prefix, self.creator, random_tag, index,
        )
        try:
            print "Creating node %d: %s" % (index, name)
            node = self.provisioner.create_node(
                name=name,
                distribution=self.distribution,
                metadata=metadata,
            )
        except:
            print "Error creating node %d: %s" % (index, name)
            print "It may have leaked into the cloud."
            raise

        yield remove_known_host(reactor, node.address)
        self.nodes.append(node)
        del node

    commands = parallel([
        node.provision(package_source=self.package_source,
                       variants=self.variants)
        for node in self.nodes
    ])
    if self.dataset_backend == DatasetBackend.zfs:
        zfs_commands = parallel([
            configure_zfs(node, variants=self.variants)
            for node in self.nodes
        ])
        commands = commands.on(success=lambda _: zfs_commands)

    yield perform(make_dispatcher(reactor), commands)

    cluster = yield configured_cluster_for_nodes(
        reactor,
        generate_certificates(
            self.identity.name,
            self.identity.id,
            self.nodes,
            self.cert_path,
        ),
        self.nodes,
        self.dataset_backend,
        self.dataset_backend_configuration,
        _save_backend_configuration(self.dataset_backend,
                                    self.dataset_backend_configuration),
        logging_config=self.config.get('logging'),
    )

    returnValue(cluster)
def main(reactor, args, base_path, top_level):
    """
    :param reactor: Reactor to use.
    :param list args: The arguments passed to the script.
    :param FilePath base_path: The executable being run.
    :param FilePath top_level: The top-level of the flocker repository.
    """
    options = RunOptions(top_level=top_level)

    add_destination(eliot_output)
    try:
        options.parseOptions(args)
    except UsageError as e:
        sys.stderr.write("%s: %s\n" % (base_path.basename(), e))
        raise SystemExit(1)

    runner = options.runner

    from flocker.common.script import eliot_logging_service
    log_writer = eliot_logging_service(
        destination=FileDestination(
            file=open("%s.log" % (base_path.basename(),), "a")
        ),
        reactor=reactor,
        capture_stdout=False)
    log_writer.startService()
    reactor.addSystemEventTrigger(
        'before', 'shutdown', log_writer.stopService)

    cluster = None
    results = []
    setup_succeeded = False
    reached_finally = False

    def cluster_cleanup():
        if not reached_finally:
            print "interrupted..."
        print "stopping cluster"
        return runner.stop_cluster(reactor)

    cleanup_trigger_id = reactor.addSystemEventTrigger('before', 'shutdown',
                                                       cluster_cleanup)

    try:
        yield runner.ensure_keys(reactor)
        cluster = yield runner.start_cluster(reactor)
        if options['distribution'] in ('centos-7',):
            remote_logs_file = open("remote_logs.log", "a")
            for node in cluster.all_nodes:
                results.append(capture_journal(reactor,
                                               node.address,
                                               remote_logs_file))
        elif options['distribution'] in ('ubuntu-14.04',):
            remote_logs_file = open("remote_logs.log", "a")
            for node in cluster.all_nodes:
                results.append(capture_upstart(reactor,
                                               node.address,
                                               remote_logs_file))
        gather_deferreds(results)

        if not options["no-pull"]:
            yield perform(
                make_dispatcher(reactor),
                parallel([
                    run_remotely(
                        username='******',
                        address=node.address,
                        commands=task_pull_docker_images()
                    ) for node in cluster.agent_nodes
                ]),
            )

        setup_succeeded = True
        result = yield run_tests(
            reactor=reactor,
            cluster=cluster,
            trial_args=options['trial-args'])
    finally:
        reached_finally = True
        # We delete the nodes if the user hasn't asked to keep them
        # or if we failed to provision the cluster.
        if not setup_succeeded:
            print "cluster provisioning failed"
        elif not options['keep']:
            print "not keeping cluster"
        else:
            print "--keep specified, not destroying nodes."
            print ("To run acceptance tests against these nodes, "
                   "set the following environment variables: ")

            environment_variables = get_trial_environment(cluster)

            for environment_variable in environment_variables:
                print "export {name}={value};".format(
                    name=environment_variable,
                    value=shell_quote(
                        environment_variables[environment_variable]),
                )
            reactor.removeSystemEventTrigger(cleanup_trigger_id)
    raise SystemExit(result)
def main(reactor, args, base_path, top_level):
    """
    :param reactor: Reactor to use.
    :param list args: The arguments passed to the script.
    :param FilePath base_path: The executable being run.
    :param FilePath top_level: The top-level of the flocker repository.
    """
    options = RunOptions(top_level=top_level)

    add_destination(eliot_output)
    try:
        options.parseOptions(args)
    except UsageError as e:
        sys.stderr.write("%s: %s\n" % (base_path.basename(), e))
        raise SystemExit(1)

    runner = options.runner

    from flocker.common.script import eliot_logging_service
    log_file = open("%s.log" % base_path.basename(), "a")
    log_writer = eliot_logging_service(log_file=log_file,
                                       reactor=reactor,
                                       capture_stdout=False)
    log_writer.startService()
    reactor.addSystemEventTrigger('before', 'shutdown',
                                  log_writer.stopService)

    cluster = None
    try:
        cluster = yield runner.start_cluster(reactor)
        if options['distribution'] in ('centos-7', ):
            remote_logs_file = open("remote_logs.log", "a")
            for node in cluster.all_nodes:
                capture_journal(reactor, node.address, remote_logs_file)

        if not options["no-pull"]:
            yield perform(
                make_dispatcher(reactor),
                parallel([
                    run_remotely(username='******',
                                 address=node.address,
                                 commands=task_pull_docker_images())
                    for node in cluster.agent_nodes
                ]),
            )

        result = yield run_tests(reactor=reactor,
                                 cluster=cluster,
                                 trial_args=options['trial-args'])
    except:
        result = 1
        raise
    finally:
        # Unless the tests failed, and the user asked to keep the nodes, we
        # delete them.
        if not options['keep']:
            runner.stop_cluster(reactor)
        else:
            print "--keep specified, not destroying nodes."
            if cluster is None:
                print("Didn't finish creating the cluster.")
            else:
                print(
                    "To run acceptance tests against these nodes, "
                    "set the following environment variables: ")

                environment_variables = get_trial_environment(cluster)

                for environment_variable in environment_variables:
                    print "export {name}={value};".format(
                        name=environment_variable,
                        value=shell_quote(
                            environment_variables[environment_variable]),
                    )

    raise SystemExit(result)
def get_clb_contents():
    """
    Get Rackspace Cloud Load Balancer contents as list of `CLBNode`. CLB
    health monitor information is also returned as a pmap of :obj:`CLB`
    objects mapped on LB ID.

    :return: Effect of (``list`` of :obj:`CLBNode`, `pmap` of :obj:`CLB`)
    :rtype: :obj:`Effect`
    """
    # If we get a CLBNotFoundError while fetching feeds, we should throw away
    # all nodes related to that load balancer, because we don't want to act on
    # data that we know is invalid/outdated (for example, if we can't fetch a
    # feed because CLB was deleted, we don't want to say that we have a node
    # in DRAINING with draining time of 0; we should just say that the node
    # is gone).
    def gone(r):
        return catch(CLBNotFoundError, lambda exc: r)

    lb_ids = [lb['id'] for lb in (yield _retry(get_clbs()))]
    node_reqs = [
        _retry(get_clb_nodes(lb_id).on(error=gone([])))
        for lb_id in lb_ids
    ]
    healthmon_reqs = [
        _retry(get_clb_health_monitor(lb_id).on(error=gone(None)))
        for lb_id in lb_ids
    ]
    all_nodes_hms = yield parallel(node_reqs + healthmon_reqs)
    all_nodes, hms = all_nodes_hms[:len(lb_ids)], all_nodes_hms[len(lb_ids):]
    lb_nodes = {
        lb_id: [CLBNode.from_node_json(lb_id, node) for node in nodes]
        for lb_id, nodes in zip(lb_ids, all_nodes)
    }
    clbs = {
        str(lb_id): CLB(bool(health_mon))
        for lb_id, health_mon in zip(lb_ids, hms)
        if health_mon is not None
    }
    draining = [
        n for n in concat(lb_nodes.values())
        if n.description.condition == CLBNodeCondition.DRAINING
    ]
    feeds = yield parallel([
        _retry(
            get_clb_node_feed(n.description.lb_id,
                              n.node_id).on(error=gone(None)))
        for n in draining
    ])
    nodes_to_feeds = dict(zip(draining, feeds))
    deleted_lbs = set([
        node.description.lb_id
        for (node, feed) in nodes_to_feeds.items() if feed is None
    ])

    def update_drained_at(node):
        feed = nodes_to_feeds.get(node)
        if node.description.lb_id in deleted_lbs:
            return None
        if feed is not None:
            node.drained_at = extract_clb_drained_at(feed)
        return node

    nodes = map(update_drained_at, concat(lb_nodes.values()))
    yield do_return((list(filter(bool, nodes)),
                     pmap(keyfilter(lambda k: k not in deleted_lbs, clbs))))
def main(reactor, args, base_path, top_level):
    """
    :param reactor: Reactor to use.
    :param list args: The arguments passed to the script.
    :param FilePath base_path: The executable being run.
    :param FilePath top_level: The top-level of the flocker repository.
    """
    options = RunOptions(top_level=top_level)

    add_destination(eliot_output)
    try:
        options.parseOptions(args)
    except UsageError as e:
        sys.stderr.write("%s: %s\n" % (base_path.basename(), e))
        raise SystemExit(1)

    runner = options.runner

    from flocker.common.script import eliot_logging_service
    log_writer = eliot_logging_service(
        destination=FileDestination(
            file=open("%s.log" % (base_path.basename(),), "a")
        ),
        reactor=reactor,
        capture_stdout=False)
    log_writer.startService()
    reactor.addSystemEventTrigger(
        'before', 'shutdown', log_writer.stopService)

    cluster = None
    try:
        yield runner.ensure_keys(reactor)
        cluster = yield runner.start_cluster(reactor)
        if options['distribution'] in ('centos-7',):
            remote_logs_file = open("remote_logs.log", "a")
            for node in cluster.all_nodes:
                capture_journal(reactor, node.address, remote_logs_file)

        if not options["no-pull"]:
            yield perform(
                make_dispatcher(reactor),
                parallel([
                    run_remotely(
                        username='******',
                        address=node.address,
                        commands=task_pull_docker_images()
                    ) for node in cluster.agent_nodes
                ]),
            )

        result = yield run_tests(
            reactor=reactor,
            cluster=cluster,
            trial_args=options['trial-args'])
    except:
        result = 1
        raise
    finally:
        # Unless the tests failed, and the user asked to keep the nodes, we
        # delete them.
        if not options['keep']:
            runner.stop_cluster(reactor)
        else:
            print "--keep specified, not destroying nodes."
            if cluster is None:
                print ("Didn't finish creating the cluster.")
            else:
                print ("To run acceptance tests against these nodes, "
                       "set the following environment variables: ")

                environment_variables = get_trial_environment(cluster)

                for environment_variable in environment_variables:
                    print "export {name}={value};".format(
                        name=environment_variable,
                        value=shell_quote(
                            environment_variables[environment_variable]),
                    )

    raise SystemExit(result)
def converge_all_groups(currently_converging, recently_converged, waiting,
                        my_buckets, all_buckets,
                        divergent_flags, build_timeout, interval,
                        limited_retry_iterations, step_limits,
                        converge_one_group=converge_one_group):
    """
    Check for groups that need convergence and which match up to the
    buckets we've been allocated.

    :param Reference currently_converging: pset of currently converging
        groups
    :param Reference recently_converged: pmap of group ID to time last
        convergence finished
    :param Reference waiting: pmap of group ID to number of iterations
        already waited
    :param my_buckets: The buckets that should be checked for group IDs to
        converge on.
    :param all_buckets: The set of all buckets that can be checked for group
        IDs to converge on. ``my_buckets`` should be a subset of this.
    :param divergent_flags: divergent flags that were found in zookeeper.
    :param number build_timeout: number of seconds to wait for servers to be
        in building before it is timed out and deleted
    :param number interval: number of seconds between attempts at
        convergence. Groups will not be converged if less than this amount
        of time has passed since the end of their last convergence.
    :param int limited_retry_iterations: number of iterations to wait for
        LIMITED_RETRY steps
    :param dict step_limits: Mapping of step class to number of executions
        allowed in a convergence cycle
    :param callable converge_one_group: function to use to converge a single
        group - to be used for test injection only
    """
    group_infos = get_my_divergent_groups(my_buckets, all_buckets,
                                          divergent_flags)
    # filter out currently converging groups
    cc = yield currently_converging.read()
    group_infos = [info for info in group_infos
                   if info['group_id'] not in cc]
    if not group_infos:
        return
    yield msg('converge-all-groups', group_infos=group_infos,
              currently_converging=list(cc))

    @do
    def converge(tenant_id, group_id, dirty_flag):
        stat = yield Effect(GetStat(dirty_flag))
        # If the node disappeared, ignore it. `stat` will be None here if the
        # divergent flag was discovered only after the group is removed from
        # currently_converging, but before the divergent flag is deleted, and
        # then the deletion happens, and then our GetStat happens. This
        # basically means it happens when one convergence is starting as
        # another one for the same group is ending.
        if stat is None:
            yield msg('converge-divergent-flag-disappeared',
                      znode=dirty_flag)
        else:
            eff = converge_one_group(currently_converging,
                                     recently_converged, waiting,
                                     tenant_id, group_id,
                                     stat.version, build_timeout,
                                     limited_retry_iterations, step_limits)
            result = yield Effect(TenantScope(eff, tenant_id))
            yield do_return(result)

    recent_groups = yield get_recently_converged_groups(
        recently_converged, interval)
    effs = []
    for info in group_infos:
        tenant_id, group_id = info['tenant_id'], info['group_id']
        if group_id in recent_groups:
            # Don't converge a group if it has recently been converged.
            continue
        eff = converge(tenant_id, group_id, info['dirty-flag'])
        effs.append(
            with_log(eff, tenant_id=tenant_id, scaling_group_id=group_id))

    yield do_return(parallel(effs))
def converge_all_groups(
        currently_converging, recently_converged, waiting,
        my_buckets, all_buckets,
        divergent_flags, build_timeout, interval,
        limited_retry_iterations, step_limits,
        converge_one_group=converge_one_group):
    """
    Check for groups that need convergence and which match up to the
    buckets we've been allocated.

    :param Reference currently_converging: pset of currently converging
        groups
    :param Reference recently_converged: pmap of group ID to time last
        convergence finished
    :param Reference waiting: pmap of group ID to number of iterations
        already waited
    :param my_buckets: The buckets that should be checked for group IDs to
        converge on.
    :param all_buckets: The set of all buckets that can be checked for group
        IDs to converge on. ``my_buckets`` should be a subset of this.
    :param divergent_flags: divergent flags that were found in zookeeper.
    :param number build_timeout: number of seconds to wait for servers to be
        in building before it is timed out and deleted
    :param number interval: number of seconds between attempts at
        convergence. Groups will not be converged if less than this amount
        of time has passed since the end of their last convergence.
    :param int limited_retry_iterations: number of iterations to wait for
        LIMITED_RETRY steps
    :param dict step_limits: Mapping of step class to number of executions
        allowed in a convergence cycle
    :param callable converge_one_group: function to use to converge a single
        group - to be used for test injection only
    """
    group_infos = get_my_divergent_groups(
        my_buckets, all_buckets, divergent_flags)
    # filter out currently converging groups
    cc = yield currently_converging.read()
    group_infos = [info for info in group_infos
                   if info['group_id'] not in cc]
    if not group_infos:
        return
    yield msg('converge-all-groups', group_infos=group_infos,
              currently_converging=list(cc))

    @do
    def converge(tenant_id, group_id, dirty_flag):
        stat = yield Effect(GetStat(dirty_flag))
        # If the node disappeared, ignore it. `stat` will be None here if the
        # divergent flag was discovered only after the group is removed from
        # currently_converging, but before the divergent flag is deleted, and
        # then the deletion happens, and then our GetStat happens. This
        # basically means it happens when one convergence is starting as
        # another one for the same group is ending.
        if stat is None:
            yield msg('converge-divergent-flag-disappeared',
                      znode=dirty_flag)
        else:
            eff = converge_one_group(currently_converging,
                                     recently_converged, waiting,
                                     tenant_id, group_id,
                                     stat.version, build_timeout,
                                     limited_retry_iterations, step_limits)
            result = yield Effect(TenantScope(eff, tenant_id))
            yield do_return(result)

    recent_groups = yield get_recently_converged_groups(recently_converged,
                                                        interval)
    effs = []
    for info in group_infos:
        tenant_id, group_id = info['tenant_id'], info['group_id']
        if group_id in recent_groups:
            # Don't converge a group if it has recently been converged.
            continue
        eff = converge(tenant_id, group_id, info['dirty-flag'])
        effs.append(
            with_log(eff, tenant_id=tenant_id, scaling_group_id=group_id))

    yield do_return(parallel(effs))