def install_dcos_from_path(
    self,
    dcos_installer: Path,
    dcos_config: Dict[str, Any],
    ip_detect_path: Path,
    output: Output,
    files_to_copy_to_genconf_dir: Iterable[Tuple[Path, Path]] = (),
) -> None:
    """
    Install DC/OS from a local installer, using a bootstrap node.

    The installation is delegated to a ``Cluster`` built from this
    object's existing master, agent and public agent nodes.

    Args:
        dcos_installer: The ``Path`` to the installer to install DC/OS
            from.
        dcos_config: The DC/OS configuration to install with.
        ip_detect_path: The path to the ``ip-detect`` script to use during
            the DC/OS installation.
        output: What happens with stdout and stderr.
        files_to_copy_to_genconf_dir: Pairs of host paths to paths on the
            installer node. Documented as unsupported here and expected
            to be empty; whatever is given is forwarded unchanged.
    """
    node_groups = {
        'masters': self.masters,
        'agents': self.agents,
        'public_agents': self.public_agents,
    }
    Cluster.from_nodes(**node_groups).install_dcos_from_path(
        dcos_installer=dcos_installer,
        dcos_config=dcos_config,
        ip_detect_path=ip_detect_path,
        files_to_copy_to_genconf_dir=files_to_copy_to_genconf_dir,
        output=output,
    )
def install_dcos_from_url(
    self,
    dcos_installer: str,
    dcos_config: Dict[str, Any],
    ip_detect_path: Path,
    output: Output,
    files_to_copy_to_genconf_dir: Iterable[Tuple[Path, Path]],
) -> None:
    """
    Install DC/OS from an installer URL, using a bootstrap node.

    The installation is delegated to a ``Cluster`` built from this
    object's existing master, agent and public agent nodes.

    Args:
        dcos_installer: The URL string of the installer to install DC/OS
            from.
        dcos_config: The DC/OS configuration to install with.
        ip_detect_path: The ``ip-detect`` script used for installing
            DC/OS.
        output: What happens with stdout and stderr.
        files_to_copy_to_genconf_dir: Pairs of host paths to paths on the
            installer node. These files are copied from the host to the
            installer node before DC/OS is installed.
    """
    nodes_cluster = Cluster.from_nodes(
        masters=self.masters,
        agents=self.agents,
        public_agents=self.public_agents,
    )
    install_options = {
        'dcos_installer': dcos_installer,
        'dcos_config': dcos_config,
        'ip_detect_path': ip_detect_path,
        'output': output,
        'files_to_copy_to_genconf_dir': files_to_copy_to_genconf_dir,
    }
    nodes_cluster.install_dcos_from_url(**install_options)
def test_install_dcos_from_path(
    self,
    oss_installer: Path,
    cluster_backend: ClusterBackend,
) -> None:
    """
    Installing DC/OS from an installer path works on a cluster which was
    created from existing nodes.
    """
    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        # Wrap the nodes of the backend-created cluster in a cluster which
        # only knows about existing nodes.
        cluster = Cluster.from_nodes(
            masters=original_cluster.masters,
            agents=original_cluster.agents,
            public_agents=original_cluster.public_agents,
        )

        cluster.install_dcos_from_path(
            dcos_installer=oss_installer,
            dcos_config=original_cluster.base_config,
            ip_detect_path=cluster_backend.ip_detect_path,
        )
        cluster.wait_for_dcos_oss()

        # Every node, whatever its role, must report an OSS 2.x build.
        all_nodes = set().union(
            cluster.masters,
            cluster.agents,
            cluster.public_agents,
        )
        for node in all_nodes:
            build_info = node.dcos_build_info()
            assert build_info.version.startswith('2.')
            assert build_info.commit
            assert build_info.variant == DCOSVariant.OSS
def test_install_dcos_from_path(
    self,
    oss_artifact: Path,
    cluster_backend: ClusterBackend,
) -> None:
    """
    Installing DC/OS from a build artifact path works on a cluster which
    was created from existing nodes.
    """
    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        # Re-use the nodes of the backend-created cluster.
        existing_nodes = {
            'masters': original_cluster.masters,
            'agents': original_cluster.agents,
            'public_agents': original_cluster.public_agents,
        }
        cluster = Cluster.from_nodes(**existing_nodes)

        cluster.install_dcos_from_path(
            build_artifact=oss_artifact,
            dcos_config=original_cluster.base_config,
        )
        cluster.wait_for_dcos_oss()
def test_install_dcos_from_url(
    self,
    oss_installer_url: str,
    cluster_backend: ClusterBackend,
) -> None:
    """
    Installing DC/OS from an installer URL works on a cluster which was
    created from existing nodes.
    """
    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        # Re-use the nodes of the backend-created cluster.
        existing_nodes = {
            'masters': original_cluster.masters,
            'agents': original_cluster.agents,
            'public_agents': original_cluster.public_agents,
        }
        cluster = Cluster.from_nodes(**existing_nodes)

        cluster.install_dcos_from_url(
            dcos_installer=oss_installer_url,
            dcos_config=original_cluster.base_config,
            ip_detect_path=cluster_backend.ip_detect_path,
        )
        cluster.wait_for_dcos_oss()
def test_install_dcos(
    self,
    oss_artifact: Path,
    oss_artifact_url: str,
    cluster_backend: ClusterBackend,
) -> None:
    """
    If a user attempts to install DC/OS on a ``Cluster`` created from
    existing nodes, a ``NotImplementedError`` is raised.
    """
    with Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=0,
        public_agents=0,
    ) as backend_cluster:
        nodes_cluster = Cluster.from_nodes(
            masters=backend_cluster.masters,
            agents=backend_cluster.agents,
            public_agents=backend_cluster.public_agents,
            default_ssh_user=cluster_backend.default_ssh_user,
        )

        # Neither installation entry point is expected to be supported.
        with pytest.raises(NotImplementedError):
            nodes_cluster.install_dcos_from_url(
                build_artifact=oss_artifact_url,
            )

        with pytest.raises(NotImplementedError):
            nodes_cluster.install_dcos_from_path(
                build_artifact=oss_artifact,
            )
def install_dcos_from_url(
    self,
    dcos_installer: str,
    dcos_config: Dict[str, Any],
    ip_detect_path: Path,
    output: Output,
    files_to_copy_to_genconf_dir: Iterable[Tuple[Path, Path]],
) -> None:
    """
    Install DC/OS from an installer given as a URL string.

    The installation is delegated to a ``Cluster`` built from this
    object's existing master, agent and public agent nodes.

    Args:
        dcos_installer: The URL string of the installer to install DC/OS
            from.
        dcos_config: The DC/OS configuration to install with.
        ip_detect_path: The ``ip-detect`` script used for installing
            DC/OS.
        output: What happens with stdout and stderr.
        files_to_copy_to_genconf_dir: Pairs of host paths to paths on the
            installer node. Documented as unsupported here and expected
            to be empty; whatever is given is forwarded unchanged.
    """
    Cluster.from_nodes(
        masters=self.masters,
        agents=self.agents,
        public_agents=self.public_agents,
    ).install_dcos_from_url(
        dcos_installer=dcos_installer,
        dcos_config=dcos_config,
        ip_detect_path=ip_detect_path,
        output=output,
        files_to_copy_to_genconf_dir=files_to_copy_to_genconf_dir,
    )
def cluster(self) -> Cluster:
    """
    Build a ``Cluster`` from the containers of each role.

    Each master, agent and public agent is converted with ``to_node``
    before being handed to ``Cluster.from_nodes``.
    """
    master_nodes = {self.to_node(master) for master in self.masters}
    agent_nodes = {self.to_node(agent) for agent in self.agents}
    public_agent_nodes = {
        self.to_node(public_agent) for public_agent in self.public_agents
    }
    return Cluster.from_nodes(
        masters=master_nodes,
        agents=agent_nodes,
        public_agents=public_agent_nodes,
    )
def test_cluster_from_nodes(self, cluster_backend: ClusterBackend) -> None:
    """
    It is possible to create a cluster from existing nodes, but not destroy
    it.
    """
    cluster = Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=1,
        public_agents=1,
    )

    # The backing cluster is not managed by a ``with`` block, so make sure
    # it is destroyed even when one of the checks below fails. Otherwise a
    # failing test would leak the cluster's nodes.
    try:
        (master, ) = cluster.masters
        (agent, ) = cluster.agents
        (public_agent, ) = cluster.public_agents

        with Cluster.from_nodes(
            masters=cluster.masters,
            agents=cluster.agents,
            public_agents=cluster.public_agents,
            default_ssh_user=cluster_backend.default_ssh_user,
        ) as duplicate_cluster:
            (duplicate_master, ) = duplicate_cluster.masters
            (duplicate_agent, ) = duplicate_cluster.agents
            (duplicate_public_agent, ) = duplicate_cluster.public_agents

            # Create a file through each node of the duplicate cluster...
            duplicate_master.run(
                args=['touch', 'example_master_file'],
                user=duplicate_cluster.default_ssh_user,
            )
            duplicate_agent.run(
                args=['touch', 'example_agent_file'],
                user=duplicate_cluster.default_ssh_user,
            )
            duplicate_public_agent.run(
                args=['touch', 'example_public_agent_file'],
                user=duplicate_cluster.default_ssh_user,
            )

            # ...and check that it is visible through the matching node of
            # the original cluster, i.e. both refer to the same machines.
            master.run(
                args=['test', '-f', 'example_master_file'],
                user=duplicate_cluster.default_ssh_user,
            )
            agent.run(
                args=['test', '-f', 'example_agent_file'],
                user=duplicate_cluster.default_ssh_user,
            )
            public_agent.run(
                args=['test', '-f', 'example_public_agent_file'],
                user=duplicate_cluster.default_ssh_user,
            )

        with pytest.raises(NotImplementedError):
            duplicate_cluster.destroy()
    finally:
        cluster.destroy()
def cluster(self) -> Cluster:
    """
    Return a ``Cluster`` constructed from the VMs.

    The role of each VM is derived from a marker in its name:
    ``-master-``, ``-agent-`` or ``-public-agent-``. Each selected name is
    converted with ``_to_node``.
    """
    vm_names = self._vm_names
    masters = [name for name in vm_names if '-master-' in name]
    public_agents = [name for name in vm_names if '-public-agent-' in name]
    # ``-agent-`` is a substring of ``-public-agent-``. Without the
    # explicit exclusion every public agent would also be reported as a
    # private agent.
    agents = [
        name for name in vm_names
        if '-agent-' in name and '-public-agent-' not in name
    ]

    return Cluster.from_nodes(
        masters=set(map(self._to_node, masters)),
        agents=set(map(self._to_node, agents)),
        public_agents=set(map(self._to_node, public_agents)),
    )
def test_cluster_from_nodes(self, cluster_backend: ClusterBackend) -> None:
    """
    It is possible to create a cluster from existing nodes, but not destroy
    it, or any nodes in it.
    """
    cluster = Cluster(
        cluster_backend=cluster_backend,
        masters=1,
        agents=1,
        public_agents=1,
    )

    # The backing cluster is not managed by a ``with`` block, so make sure
    # it is destroyed even when one of the checks below fails. Otherwise a
    # failing test would leak the cluster's nodes.
    try:
        (master, ) = cluster.masters
        (agent, ) = cluster.agents
        (public_agent, ) = cluster.public_agents

        with Cluster.from_nodes(
            masters=cluster.masters,
            agents=cluster.agents,
            public_agents=cluster.public_agents,
        ) as duplicate_cluster:
            (duplicate_master, ) = duplicate_cluster.masters
            (duplicate_agent, ) = duplicate_cluster.agents
            (duplicate_public_agent, ) = duplicate_cluster.public_agents

            # The base configuration must list the nodes of every role.
            assert 'master_list' in duplicate_cluster.base_config
            assert 'agent_list' in duplicate_cluster.base_config
            assert 'public_agent_list' in duplicate_cluster.base_config

            # Create a file through each node of the duplicate cluster...
            duplicate_master.run(args=['touch', 'example_master_file'])
            duplicate_agent.run(args=['touch', 'example_agent_file'])
            duplicate_public_agent.run(
                args=['touch', 'example_public_agent_file'],
            )

            # ...and check that it is visible through the matching node of
            # the original cluster, i.e. both refer to the same machines.
            master.run(args=['test', '-f', 'example_master_file'])
            agent.run(args=['test', '-f', 'example_agent_file'])
            public_agent.run(
                args=['test', '-f', 'example_public_agent_file'],
            )

        with pytest.raises(NotImplementedError):
            duplicate_cluster.destroy()

        with pytest.raises(NotImplementedError):
            duplicate_cluster.destroy_node(node=duplicate_master)
    finally:
        cluster.destroy()
def install_dcos_from_url(
    self,
    dcos_installer: str,
    dcos_config: Dict[str, Any],
    ip_detect_path: Path,
    output: Output,
    files_to_copy_to_genconf_dir: Iterable[Tuple[Path, Path]],
) -> None:
    """
    Install DC/OS from an installer URL.

    When the given ``ip-detect`` script is the one stored on this object
    and there are no extra files to copy, the installation is performed
    by the launcher. Otherwise it is delegated to a ``Cluster`` built from
    the existing nodes.

    Args:
        dcos_installer: The URL string of the installer to install DC/OS
            from.
        dcos_config: The DC/OS configuration to install with.
        ip_detect_path: The path to the ``ip-detect`` script to use during
            the DC/OS installation.
        output: What happens with stdout and stderr.
        files_to_copy_to_genconf_dir: Pairs of host paths to paths on the
            installer node. These files are copied from the host to the
            installer node before DC/OS is installed.
    """
    custom_ip_detect = ip_detect_path != self._ip_detect_path
    if not (custom_ip_detect or files_to_copy_to_genconf_dir):
        # In order to install DC/OS with the preliminary dcos-launch
        # config the ``dcos_installer`` URL is overwritten.
        launcher_config = self.launcher.config
        launcher_config['installer_url'] = dcos_installer
        launcher_config['dcos_config'] = dcos_config
        self.launcher.install_dcos()
        return

    # A custom ``ip-detect`` script or extra genconf files were requested,
    # so install through a cluster made of the existing nodes.
    Cluster.from_nodes(
        masters=self.masters,
        agents=self.agents,
        public_agents=self.public_agents,
    ).install_dcos_from_url(
        dcos_installer=dcos_installer,
        dcos_config=dcos_config,
        ip_detect_path=ip_detect_path,
        output=output,
        files_to_copy_to_genconf_dir=files_to_copy_to_genconf_dir,
    )
def test_replace_all_static(
    artifact_path: Path,
    docker_network_three_available_addresses: Network,
    tmp_path: Path,
    request: SubRequest,
    log_dir: Path,
) -> None:
    """
    In a cluster with an Exhibitor backend consisting of a static ZooKeeper
    ensemble, after removing one master, and then adding another master with
    the same IP address, the cluster will get to a healthy state. This is
    repeated until all masters in the original cluster have been replaced.
    The purpose of this test is to assert that the ``node-poststart``
    procedure correctly prevents a master node replacement from being
    performed too quickly. A new master node should only become part of the
    cluster if there are no more underreplicated ranges reported by
    CockroachDB.

    Permanent CockroachDB data loss and a potential breakage of DC/OS occurs
    when a second master node is taken down for replacement while CockroachDB
    is recovering and there are still underreplicated ranges due to a recent
    other master node replacement.
    """
    docker_backend = Docker(network=docker_network_three_available_addresses)

    with Cluster(
        cluster_backend=docker_backend,
        # Allocate all 3 available IP addresses in the subnet.
        masters=3,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        master = next(iter(original_cluster.masters))
        # Find the name of the network interface which carries the
        # master's public IP address.
        result = master.run(
            args=[
                'ifconfig',
                '|', 'grep', '-B1', str(master.public_ip_address),
                # A raw string avoids the invalid ``\w`` escape sequence;
                # the value passed to the shell is unchanged.
                '|', 'grep', '-o', r'"^\w*"',
            ],
            shell=True,
        )
        interface = result.stdout.strip().decode()
        ip_detect_contents = textwrap.dedent(
            """\
            #!/bin/bash -e
            if [ -f /sbin/ip ]; then
                IP_CMD=/sbin/ip
            else
                IP_CMD=/bin/ip
            fi
            $IP_CMD -4 -o addr show dev {interface} | awk '{{split($4,a,"/");print a[1]}}'
            """.format(interface=interface),
        )
        ip_detect_path = tmp_path / 'ip-detect'
        ip_detect_path.write_text(data=ip_detect_contents)

        static_config = {
            'master_discovery': 'static',
            'master_list': [
                str(master.private_ip_address)
                for master in original_cluster.masters
            ],
        }
        dcos_config = {
            **original_cluster.base_config,
            **static_config,
        }
        original_cluster.install_dcos_from_path(
            dcos_installer=artifact_path,
            dcos_config=dcos_config,
            ip_detect_path=ip_detect_path,
        )
        wait_for_dcos_oss(
            cluster=original_cluster,
            request=request,
            log_dir=log_dir,
        )

        current_cluster = original_cluster
        tmp_clusters = set()
        original_masters = original_cluster.masters

        try:
            for master_to_be_replaced in original_masters:
                # Destroy a master and free one IP address. This has to go
                # through the original cluster: after the first iteration
                # ``current_cluster`` is created with ``from_nodes``, and
                # such a cluster does not support destroying nodes.
                original_cluster.destroy_node(node=master_to_be_replaced)

                temporary_cluster = Cluster(
                    cluster_backend=docker_backend,
                    # Allocate one container with the now free IP address.
                    masters=1,
                    agents=0,
                    public_agents=0,
                )
                tmp_clusters.add(temporary_cluster)

                # Install a new master on a new container with the same IP
                # address.
                (new_master, ) = temporary_cluster.masters
                new_master.install_dcos_from_path(
                    dcos_installer=artifact_path,
                    dcos_config=dcos_config,
                    role=Role.MASTER,
                    ip_detect_path=ip_detect_path,
                )

                # Form a new cluster with the newly created master node.
                # ``set.add`` returns ``None``, so ``union`` is used to get
                # a new set which includes the new master.
                new_cluster = Cluster.from_nodes(
                    masters=current_cluster.masters.union({new_master}),
                    agents=current_cluster.agents,
                    public_agents=current_cluster.public_agents,
                )

                # The `wait_for_dcos_oss` function waits until the new
                # master has joined the cluster and all masters are healthy.
                # Without the cockroachdb check, this succeeds before all
                # cockroachdb ranges have finished replicating to the new
                # master. That meant that the next master would be replaced
                # too quickly, while it had data that was not present
                # elsewhere in the cluster. This led to irrecoverable
                # dataloss. This function waits until the master node is
                # "healthy". This is a requirement for replacing the next
                # master node.
                #
                # We don't call the cockroachdb ranges check directly as the
                # purpose of this test is to ensure that when an operator
                # follows our documented procedure for replacing a master
                # node multiple times in a row (e.g. during a cluster
                # upgrade) then the cluster remains healthy throughout and
                # afterwards.
                #
                # If we called the check directly here, we would be sure the
                # check is being called, but we would not be sure that
                # "wait_for_dcos_oss", i.e., the standard procedure for
                # determining whether a node is healthy, is sufficient to
                # prevent the cluster from breaking.
                #
                # We perform this check after every master is replaced, as
                # that is what we tell operators to do: "After installing the
                # new master node, wait until it becomes healthy before
                # proceeding to the next."
                #
                # The procedure for replacing multiple masters is documented
                # here:
                # https://docs.mesosphere.com/1.12/installing/production/upgrading/#dcos-masters
                wait_for_dcos_oss(
                    cluster=new_cluster,
                    request=request,
                    log_dir=log_dir,
                )
                # Use the new cluster object in the next replacement
                # iteration.
                current_cluster = new_cluster
        finally:
            # The temporary clusters are not managed by a ``with`` block,
            # so they are destroyed explicitly.
            for cluster in tmp_clusters:
                cluster.destroy()
def run_tests(e2e_backend, installer_url, dcos_license, dcos_url,
              admin_username, admin_password, ssh_user, ssh_key_path):
    """
    Run the tests against a cluster provided by the chosen backend.

    Args:
        e2e_backend: ``'dcos_launch'`` (create a cluster on AWS and install
            from ``installer_url``), ``'dcos_docker'`` (create a Docker
            cluster and install from a downloaded installer) or
            ``'existing'`` (use the cluster reachable at ``dcos_url``).
            For any other value no tests are run.
        installer_url: URL of the DC/OS installer.
        dcos_license: Contents of the license key, passed as
            ``license_key_contents`` in the DC/OS configuration.
        dcos_url: IPv4 address, URL or bare hostname of an existing
            cluster's master. Only used by the ``'existing'`` backend.
        admin_username: Superuser name.
        admin_password: Superuser password; only its hash is put in the
            DC/OS configuration.
        ssh_user: SSH user exported to the tests and used for the node of
            an existing cluster.
        ssh_key_path: Path to the SSH key exported to the tests and used
            for the node of an existing cluster.
    """
    os.environ["CLI_TEST_SSH_USER"] = ssh_user
    os.environ["CLI_TEST_MASTER_PROXY"] = "1"
    os.environ["CLI_TEST_SSH_KEY_PATH"] = ssh_key_path

    # extra dcos_config (for dcos_launch and dcos_docker backends)
    extra_config = {
        'superuser_username': admin_username,
        'superuser_password_hash': sha512_crypt.hash(admin_password),
        'fault_domain_enabled': False,
        'license_key_contents': dcos_license,
    }

    if e2e_backend == 'dcos_launch':
        cluster_backend = AWS()

        with Cluster(cluster_backend=cluster_backend, agents=1) as cluster:
            dcos_config = {**cluster.base_config, **extra_config}

            cluster.install_dcos_from_url(
                build_artifact=installer_url,
                dcos_config=dcos_config,
                log_output_live=True,
            )

            # Point the tests at the SSH key of the launched cluster
            # instead of the one given by the caller.
            os.environ["CLI_TEST_SSH_KEY_PATH"] = str(
                cluster._cluster._ssh_key_path)

            _run_tests(cluster, admin_username, admin_password)
    elif e2e_backend == 'dcos_docker':
        dcos_ee_installer_filename = 'dcos_generate_config.ee.sh'
        dcos_ee_installer_path = Path.cwd() / Path(dcos_ee_installer_filename)

        # Download the installer only if it is not already in the current
        # working directory.
        if not dcos_ee_installer_path.exists():
            urllib.request.urlretrieve(installer_url,
                                       dcos_ee_installer_filename)

        with Cluster(cluster_backend=Docker(), agents=1) as cluster:
            dcos_config = {**cluster.base_config, **extra_config}

            cluster.install_dcos_from_path(
                build_artifact=dcos_ee_installer_path,
                dcos_config=dcos_config,
                log_output_live=True,
            )

            _run_tests(cluster, admin_username, admin_password)
    elif e2e_backend == 'existing':
        try:
            dcos_ip = IPv4Address(dcos_url)
        except ValueError:
            # Not a literal IPv4 address, so resolve the host name.
            # ``urlparse`` only fills in ``hostname`` when the value has a
            # scheme (or starts with ``//``). For a bare hostname it is
            # ``None``, in which case the value itself is resolved.
            dcos_hostname = urlparse(dcos_url).hostname or dcos_url
            dcos_ip = IPv4Address(socket.gethostbyname(dcos_hostname))

        masters = {
            Node(
                public_ip_address=dcos_ip,
                private_ip_address=dcos_ip,
                ssh_key_path=Path(ssh_key_path),
                default_ssh_user=ssh_user,
            ),
        }

        cluster = Cluster.from_nodes(
            masters=masters,
            agents=set(),
            public_agents=set(),
        )

        _run_tests(cluster, admin_username, admin_password)
def test_replace_all_static(
    artifact_path: Path,
    docker_network_three_available_addresses: Network,
    tmp_path: Path,
    request: SubRequest,
    log_dir: Path,
) -> None:
    """
    In a cluster with an Exhibitor backend consisting of a static ZooKeeper
    ensemble, after removing one master, and then adding another master with
    the same IP address, the cluster will get to a healthy state. This is
    repeated until all masters in the original cluster have been replaced.
    The purpose of this test is to assert that the ``node-poststart``
    procedure correctly prevents a master node replacement from being
    performed too quickly. A new master node should only become part of the
    cluster if there are no more underreplicated ranges reported by
    CockroachDB.

    Permanent CockroachDB data loss and a potential breakage of DC/OS occurs
    when a second master node is taken down for replacement while CockroachDB
    is recovering and there are still underreplicated ranges due to a recent
    other master node replacement.
    """
    docker_backend = Docker(network=docker_network_three_available_addresses)

    with Cluster(
        cluster_backend=docker_backend,
        # Allocate all 3 available IP addresses in the subnet.
        masters=3,
        agents=0,
        public_agents=0,
    ) as original_cluster:
        any_master = next(iter(original_cluster.masters))
        # Shell pipeline which prints the name of the network interface
        # carrying the master's public IP address.
        find_interface_args = [
            'ifconfig',
            '|', 'grep', '-B1', str(any_master.public_ip_address),
            '|', 'grep', '-o', r'"^\w*"',
        ]
        ifconfig_result = any_master.run(
            args=find_interface_args,
            output=Output.LOG_AND_CAPTURE,
            shell=True,
        )
        interface = ifconfig_result.stdout.strip().decode()

        ip_detect_template = textwrap.dedent(
            """\
            #!/bin/bash -e
            if [ -f /sbin/ip ]; then
                IP_CMD=/sbin/ip
            else
                IP_CMD=/bin/ip
            fi
            $IP_CMD -4 -o addr show dev {interface} | awk '{{split($4,a,"/");print a[1]}}'
            """.format(interface=interface),
        )
        ip_detect_path = tmp_path / 'ip-detect'
        ip_detect_path.write_text(data=ip_detect_template)

        master_ips = [
            str(node.private_ip_address)
            for node in original_cluster.masters
        ]
        # The static discovery settings override the base configuration.
        dcos_config = {
            **original_cluster.base_config,
            'master_discovery': 'static',
            'master_list': master_ips,
        }

        original_cluster.install_dcos_from_path(
            dcos_installer=artifact_path,
            dcos_config=dcos_config,
            ip_detect_path=ip_detect_path,
        )
        wait_for_dcos_oss(
            cluster=original_cluster,
            request=request,
            log_dir=log_dir,
        )

        current_cluster = original_cluster
        replacement_clusters = set()
        original_masters = original_cluster.masters

        try:
            for master_to_be_replaced in original_masters:
                # Destroy a master and free one IP address.
                original_cluster.destroy_node(node=master_to_be_replaced)

                replacement_cluster = Cluster(
                    cluster_backend=docker_backend,
                    # Allocate one container with the now free IP address.
                    masters=1,
                    agents=0,
                    public_agents=0,
                )
                replacement_clusters.add(replacement_cluster)

                # Install a new master on a new container with the same IP
                # address.
                (new_master, ) = replacement_cluster.masters
                new_master.install_dcos_from_path(
                    dcos_installer=artifact_path,
                    dcos_config=dcos_config,
                    role=Role.MASTER,
                    ip_detect_path=ip_detect_path,
                )

                # Form a new cluster which includes the newly created
                # master node.
                updated_cluster = Cluster.from_nodes(
                    masters=current_cluster.masters | {new_master},
                    agents=current_cluster.agents,
                    public_agents=current_cluster.public_agents,
                )

                # `wait_for_dcos_oss` waits until the new master has joined
                # the cluster and all masters are healthy. Without the
                # cockroachdb check, this succeeds before all cockroachdb
                # ranges have finished replicating to the new master. The
                # next master would then be replaced too quickly, while it
                # had data that was not present elsewhere in the cluster,
                # which led to irrecoverable data loss. Waiting for the
                # master node to be "healthy" is a requirement for replacing
                # the next master node.
                #
                # The cockroachdb ranges check is deliberately not called
                # directly. The purpose of this test is to ensure that when
                # an operator follows our documented procedure for replacing
                # a master node multiple times in a row (e.g. during a
                # cluster upgrade), the cluster remains healthy throughout
                # and afterwards.
                #
                # Calling the check directly would prove that the check is
                # called, but not that "wait_for_dcos_oss", i.e. the
                # standard procedure for determining whether a node is
                # healthy, is sufficient to prevent the cluster from
                # breaking.
                #
                # The check is performed after every master replacement, as
                # that is what we tell operators to do: "After installing
                # the new master node, wait until it becomes healthy before
                # proceeding to the next."
                #
                # The procedure for replacing multiple masters is documented
                # here:
                # https://docs.mesosphere.com/1.12/installing/production/upgrading/#dcos-masters
                wait_for_dcos_oss(
                    cluster=updated_cluster,
                    request=request,
                    log_dir=log_dir,
                )
                # Use the new cluster object in the next replacement
                # iteration.
                current_cluster = updated_cluster
        finally:
            for leftover_cluster in replacement_clusters:
                leftover_cluster.destroy()