def set_ca_cert(self):
    """ If security is permissive or strict, and the API session is not
    configured with verify=False, then the custom CA cert for the desired
    cluster must be attached to the session, which this method will do
    """
    log.info('Attempt to get CA bundle via Admin Router')
    r = self.get('/ca/dcos-ca.crt', verify=False)
    r.raise_for_status()
    self.session.verify = helpers.session_tempfile(r.content)
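def _example_attach_ca_cert(dcos_api_session):
    """ Hypothetical helper (not in the original source) sketching how set_ca_cert
    above is meant to be used: only HTTPS-fronted sessions need the cluster CA
    bundle attached, mirroring the check performed in upgraded_dcos below.
    """
    if dcos_api_session.default_url.scheme == 'https':
        # fetch the cluster CA bundle and use it to verify subsequent requests
        dcos_api_session.set_ca_cert()
    return dcos_api_session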
def mount_volumes():
    """ Will create 200MB partitions on clusters launched by dcos-launch
    """
    script = """
#!/bin/bash
sudo systemctl stop dcos-mesos-slave.service
sudo rm -f /var/lib/dcos/mesos-resources
sudo rm -f /var/lib/mesos/slave/meta/slaves/latest
"""
    for i in range(2):
        script += """
sudo mkdir -p /dcos/volume{idx}
sudo dd if=/dev/zero of=/root/volume{idx}.img bs=1M count={size}
sudo losetup /dev/loop{idx} /root/volume{idx}.img
sudo mkfs -t ext4 /dev/loop{idx}
sudo losetup -d /dev/loop{idx}
echo "/root/volume{idx}.img /dcos/volume{idx} auto loop 0 2" | sudo tee -a /etc/fstab
sudo mount /dcos/volume{idx}
""".format(idx=i, size=200)

    script += """
sudo systemctl restart dcos-mesos-slave.service
"""
    cluster_info_path = os.getenv('CLUSTER_INFO_PATH', 'cluster_info.json')
    if not os.path.exists(cluster_info_path):
        raise Exception('No cluster info to work with!')
    cluster_info_json = json.load(open(cluster_info_path))
    launcher = dcos_launch.get_launcher(cluster_info_json)
    description = launcher.describe()
    ssh = launcher.get_ssh_client()
    with ssh.tunnel(description['masters'][0]['public_ip']) as t:
        t.copy_file(helpers.session_tempfile(ssh.key), 'ssh_key')
        t.copy_file(helpers.session_tempfile(script), 'volume_script.sh')
        t.command(['chmod', '600', 'ssh_key'])
        ssh_command = ['ssh', '-i', 'ssh_key'] + ssh_client.SHARED_SSH_OPTS
        scp_command = ['scp', '-i', 'ssh_key'] + ssh_client.SHARED_SSH_OPTS
        for private_agent in description['private_agents']:
            target = '{}@{}'.format(ssh.user, private_agent['private_ip'])
            t.command(scp_command + ['volume_script.sh', target + ':~/volume_script.sh'])
            t.command(ssh_command + [target, 'bash', 'volume_script.sh'])
        # nasty hack until we add a better post-flight
        time.sleep(60)
def tunnel_args(sshd_manager, tmpdir):
    with sshd_manager.run(1) as sshd_ports:
        yield {
            'user': getpass.getuser(),
            'control_path': str(tmpdir.join('x')),  # use as short a name as possible
            'key_path': helpers.session_tempfile(sshd_manager.key),
            'host': '127.0.0.1',
            'port': sshd_ports[0]
        }
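def test_example_tunnel_args(tunnel_args):
    """ Hypothetical test (not in the original source) illustrating how tunnel_args,
    which appears to be a pytest fixture, is consumed: pytest injects the yielded
    dict, which carries everything needed to reach the locally started sshd.
    """
    assert tunnel_args['host'] == '127.0.0.1'
    assert 'port' in tunnel_args and 'key_path' in tunnel_args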
def upgraded_dcos(dcos_api_session, launcher, setup_workload, onprem_cluster, is_enterprise):
    """ This test is intended to test upgrades between versions, so use
    the same config as the original launch
    """
    # Check for previous installation artifacts first
    bootstrap_host = onprem_cluster.bootstrap_host.public_ip
    upgrade.reset_bootstrap_host(onprem_cluster.ssh_client, bootstrap_host)

    upgrade_config_overrides = dict()
    if 'TEST_UPGRADE_CONFIG_PATH' in os.environ:
        with open(os.environ['TEST_UPGRADE_CONFIG_PATH'], 'r') as f:
            upgrade_config_overrides = yaml.load(f.read())

    upgrade_config = copy.copy(launcher.config['dcos_config'])
    upgrade_config.update({
        'cluster_name': 'My Upgraded DC/OS',
        'ssh_user': onprem_cluster.ssh_client.user,  # can probably drop this field
        'bootstrap_url': 'http://' + onprem_cluster.bootstrap_host.private_ip,
        'master_list': [h.private_ip for h in onprem_cluster.masters],
        'agent_list': [h.private_ip for h in onprem_cluster.private_agents],
        'public_agent_list': [h.private_ip for h in onprem_cluster.public_agents]})
    upgrade_config.update(upgrade_config_overrides)

    # if it was a ZK-backed install, make sure ZK is still running
    if upgrade_config.get('exhibitor_storage_backend') == 'zookeeper':
        upgrade_config['exhibitor_zk_hosts'] = onprem_cluster.start_bootstrap_zk()
    # if the public ip-detect script was not present, go ahead and inject it
    if 'ip_detect_public_contents' not in upgrade_config:
        upgrade_config['ip_detect_public_contents'] = yaml.dump(
            pkg_resources.resource_string('dcos_test_utils', 'ip-detect/aws_public.sh').decode())

    bootstrap_home = onprem_cluster.ssh_client.get_home_dir(bootstrap_host)
    genconf_dir = os.path.join(bootstrap_home, 'genconf')
    with onprem_cluster.ssh_client.tunnel(bootstrap_host) as tunnel:
        log.info('Setting up upgrade config on bootstrap host')
        tunnel.command(['mkdir', genconf_dir])
        # transfer the config file
        tunnel.copy_file(
            helpers.session_tempfile(yaml.dump(upgrade_config).encode()),
            os.path.join(bootstrap_home, 'genconf/config.yaml'))
        # FIXME: we don't need the ssh key when the upgrade isn't being orchestrated
        tunnel.copy_file(
            helpers.session_tempfile(onprem_cluster.ssh_client.key.encode()),
            os.path.join(bootstrap_home, 'genconf/ssh_key'))
        tunnel.command(
            ['chmod', '600', os.path.join(bootstrap_home, 'genconf/ssh_key')])
        # Move the ip-detect script to the expected default path
        # FIXME: can we just send the contents in the config and skip this?
        tunnel.copy_file(
            pkg_resources.resource_filename('dcos_test_utils', 'ip-detect/aws.sh'),
            os.path.join(bootstrap_home, 'genconf/ip-detect'))

    # API object may need to be updated
    upgrade_session = make_dcos_api_session(
        onprem_cluster, launcher, is_enterprise, upgrade_config_overrides.get('security'))
    # use the auth session from the previous API session
    upgrade_session.session.auth = dcos_api_session.session.auth

    # do the actual upgrade
    upgrade.upgrade_dcos(
        upgrade_session,
        onprem_cluster,
        dcos_api_session.get_version(),
        os.environ['TEST_UPGRADE_INSTALLER_URL'],
        os.environ['TEST_UPGRADE_USE_CHECKS'] == 'true')

    # this can be set after the fact because the upgrade metrics snapshot
    # endpoint is polled with verify=False
    if upgrade_session.default_url.scheme == 'https':
        upgrade_session.set_ca_cert()

    # now re-auth with the new session
    upgrade_session.wait_for_dcos()
    return upgrade_session
def temp_ssh_key(key: str) -> str:
    """ Dumps an SSH key string to a temp file that will be deleted at
    session close and returns the path
    """
    key_path = helpers.session_tempfile(key)
    os.chmod(str(key_path), stat.S_IREAD | stat.S_IWRITE)
    return key_path
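def _example_ssh_argv_with_temp_key(key: str, host: str) -> list:
    """ Hypothetical sketch (not in the original source): builds an ssh argv that
    uses the session-scoped, owner-only key file produced by temp_ssh_key, mirroring
    the `ssh -i ssh_key` pattern used by mount_volumes above.
    """
    key_path = temp_ssh_key(key)
    return ['ssh', '-i', key_path] + ssh_client.SHARED_SSH_OPTS + [host]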
def test_installer_cli(onprem_cluster, onprem_launcher):
    """ This test will step through the CLI install procedure for on-prem DC/OS

    This test has an environment variable switch: TEST_INSTALL_PREREQS.
    If set to 'true', the --install-prereqs option on the installer will
    be run and, if it raises an error code, the test will fail.
    """
    host = onprem_cluster.bootstrap_host.public_ip
    ssh = onprem_launcher.get_ssh_client()

    log.info('Verifying SSH-connectivity to cluster')
    for h in onprem_cluster.hosts:
        ssh.wait_for_ssh_connection(h.public_ip)

    log.info('Setting up installer host')
    home_dir = ssh.get_home_dir(host)
    ssh.add_ssh_user_to_docker_users(host)
    genconf_dir = os.path.join(home_dir, 'genconf')
    ssh.command(host, ['mkdir', '-p', genconf_dir])
    installer_path = os.path.join(home_dir, 'dcos_generate_config.sh')
    onprem.download_dcos_installer(
        ssh, host, installer_path, onprem_launcher.config['installer_url'])
    cli_installer = DcosCliInstaller(host, installer_path, ssh)
    log.info('Installer is ready for use!')

    # Start with a minimal, default config, and then inject user settings
    test_config = {
        'cluster_name': 'SSH Installed DC/OS',
        'bootstrap_url': 'file:///opt/dcos_install_tmp',
        'master_discovery': 'static',
        'master_list': [m.private_ip for m in onprem_cluster.masters],
        'ssh_user': onprem_launcher.config['ssh_user'],
        'agent_list': [a.private_ip for a in onprem_cluster.private_agents],
        'platform': 'aws',
        'rexray_config_preset': 'aws',
        'public_agent_list': [a.private_ip for a in onprem_cluster.public_agents],
        'exhibitor_storage_backend': 'static'}
    test_config.update(onprem_launcher.config['dcos_config'])

    # explicitly transfer the files to be in the designated paths on the host
    log.info('Transferring config.yaml')
    cli_installer.copy_to_host(
        helpers.session_tempfile(yaml.dump(test_config).encode()),
        os.path.join(genconf_dir, 'config.yaml'))
    log.info('Transferring ip-detect script')
    ip_detect_script = pkg_resources.resource_string('dcos_test_utils', 'ip-detect/aws.sh')
    cli_installer.copy_to_host(
        helpers.session_tempfile(ip_detect_script),
        os.path.join(genconf_dir, 'ip-detect'))
    log.info('Transferring deployment SSH key')
    cli_installer.copy_to_host(
        helpers.session_tempfile(onprem_launcher.config['ssh_private_key'].encode()),
        os.path.join(genconf_dir, 'ssh_key'))
    cli_installer.ssh_command(['chmod', '600', os.path.join(genconf_dir, 'ssh_key')])

    log.info('Running installation procedure')
    cli_installer.genconf()
    if os.environ['TEST_INSTALL_PREREQS'] == 'true':
        cli_installer.install_prereqs()
    cli_installer.preflight()
    cli_installer.deploy()
    cli_installer.postflight()
def set_ca_cert(self):
    log.info('Attempt to get CA bundle via Admin Router')
    r = self.get('ca/dcos-ca.crt', verify=False)
    r.raise_for_status()
    self.session.verify = helpers.session_tempfile(r.content)
def mount_volumes():
    """ Will create 200MB partitions on clusters launched by dcos-launch
    """
    volume_script = """#!/bin/bash
set -e

if [ {dcos_mounts} ]; then
    echo 'Volumes already exist, exiting early'
    exit 0
fi

echo 'Stopping agent and clearing state...'
systemctl stop dcos-mesos-slave.service

cat /var/lib/dcos/mesos-resources || echo 'No resources file found'
ls -l /var/lib/mesos/slave/meta/slaves/latest || echo 'No latest agent symlink found'
rm -f /var/lib/dcos/mesos-resources
rm -f /var/lib/mesos/slave/meta/slaves/latest

losetup -a
""".format(dcos_mounts=" -a ".join(
        ["-e /dcos/volume{}".format(i) for i in range(MOUNT_VOLUME_COUNT)]))

    for i in range(MOUNT_VOLUME_COUNT):
        volume_script += """
if [ ! -e {loop_file} ]; then
    echo 'Creating loopback device {loop_dev}...'
    dd if=/dev/zero of={loop_file} bs=1M count={size_mb}
    losetup {loop_dev} {loop_file}
    mkfs -t ext4 {loop_dev}
    losetup -d {loop_dev}
fi

if [ ! -e {dcos_mount} ]; then
    echo 'Creating loopback volume {dcos_mount}...'
    mkdir -p {dcos_mount}
    echo \"{loop_file} {dcos_mount} auto loop 0 2\" | tee -a /etc/fstab
    mount {dcos_mount}
fi
""".format(
            size_mb=MOUNT_VOLUME_SIZE_MB,
            dcos_mount="/dcos/volume{}".format(i),
            loop_dev="/dev/loop{}".format(i),
            loop_file="/root/volume{}.img".format(i),
        )

    volume_script += """
echo 'Restarting agent...'
systemctl restart dcos-mesos-slave.service"""

    cluster_info_path = os.getenv("CLUSTER_INFO_PATH", "cluster_info.json")
    if not os.path.exists(cluster_info_path):
        raise Exception("No cluster info to work with!")
    cluster_info_json = json.load(open(cluster_info_path))
    launcher = dcos_launch.get_launcher(cluster_info_json)
    description = launcher.describe()
    ssh = launcher.get_ssh_client()
    with ssh.tunnel(description["masters"][0]["public_ip"]) as t:
        t.copy_file(helpers.session_tempfile(ssh.key), "ssh_key")
        t.copy_file(helpers.session_tempfile(volume_script), "volume_script.sh")
        t.command(["chmod", "600", "ssh_key"])
        ssh_command = ["ssh", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        scp_command = ["scp", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        for private_agent in description["private_agents"]:
            target = "{}@{}".format(ssh.user, private_agent["private_ip"])
            t.command(scp_command + ["volume_script.sh", target + ":~/volume_script.sh"])
            t.command(ssh_command + [target, "sudo", "bash", "volume_script.sh"])
        # nasty hack until we add a better post-flight
        time.sleep(60)
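# Assumed module-level constants for illustration only -- the real values live at
# module scope in the original source. Two 200MB volumes match the docstring and
# the older mount_volumes variant above, which created /dcos/volume0 and /dcos/volume1.
MOUNT_VOLUME_COUNT = 2
MOUNT_VOLUME_SIZE_MB = 200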
def mount_volumes():
    """ Will create 200MB partitions on clusters launched by dcos-launch
    """
    volume_script = """#!/bin/bash
set -e

if [ {dcos_mounts} ]; then
    echo 'Volumes already exist, exiting early'
    exit 0
fi

echo 'Stopping agent and clearing state...'
systemctl stop dcos-mesos-slave.service

cat /var/lib/dcos/mesos-resources || echo 'No resources file found'
ls -l /var/lib/mesos/slave/meta/slaves/latest || echo 'No latest agent symlink found'
rm -f /var/lib/dcos/mesos-resources
rm -f /var/lib/mesos/slave/meta/slaves/latest

losetup -a
""".format(dcos_mounts=" -a ".join([
        "-e /dcos/volume{}".format(i) for i, _ in enumerate(MOUNT_VOLUME_PROFILES)
    ]))

    for i, p in enumerate(MOUNT_VOLUME_PROFILES):
        volume_script += """
if [ ! -e {loop_file} ]; then
    echo 'Creating loopback device {loop_dev}...'
    dd if=/dev/zero of={loop_file} bs=1M count={size_mb}
    losetup {loop_dev} {loop_file}
    mkfs -t {fs_type} {loop_dev}
    losetup -d {loop_dev}
fi

if [ ! -e {dcos_mount} ]; then
    echo 'Creating loopback volume {dcos_mount}...'
    mkdir -p {dcos_mount}
    echo \"{loop_file} {dcos_mount} auto loop 0 2\" | tee -a /etc/fstab
    mount {dcos_mount}
fi
""".format(size_mb=MOUNT_VOLUME_SIZE_MB,
           dcos_mount="/dcos/volume{}".format(i),
           loop_dev="/dev/loop{}".format(i),
           loop_file="/root/volume{}.img".format(i),
           fs_type=p or "ext4")

    # To create profile mount volumes, we manually run `make_disk_resources.py`
    # to generate disk resources, then parse the result and set the
    # `disk.source.profile` field for each profile mount volume.
    volume_script += """
echo 'Updating disk resources...'
export MESOS_WORK_DIR MESOS_RESOURCES
eval $(sed -E "s/^([A-Z_]+)=(.*)$/\\1='\\2'/" /opt/mesosphere/etc/mesos-slave-common)  # Set up `MESOS_WORK_DIR`.
eval $(sed -E "s/^([A-Z_]+)=(.*)$/\\1='\\2'/" /opt/mesosphere/etc/mesos-slave)  # Set up `MESOS_RESOURCES`.
source /opt/mesosphere/etc/mesos-slave-common
/opt/mesosphere/bin/make_disk_resources.py /var/lib/dcos/mesos-resources
source /var/lib/dcos/mesos-resources
/opt/mesosphere/bin/python -c "
import json;
import os;

profiles = {profiles}
resources = json.loads(os.environ['MESOS_RESOURCES'])
for r in resources:
    try:
        disk_source = r['disk']['source']
        disk_source['profile'] = profiles[disk_source['mount']['root']]
    except KeyError:
        pass

print('MESOS_RESOURCES=\\'' + json.dumps(resources) + '\\'')
" > /var/lib/dcos/mesos-resources

echo 'Restarting agent...'
systemctl restart dcos-mesos-slave.service
""".format(
        profiles={"/dcos/volume{}".format(i): p
                  for i, p in enumerate(MOUNT_VOLUME_PROFILES) if p})

    cluster_info_path = os.getenv("CLUSTER_INFO_PATH", "cluster_info.json")
    if not os.path.exists(cluster_info_path):
        raise Exception("No cluster info to work with!")
    cluster_info_json = json.load(open(cluster_info_path))
    launcher = dcos_launch.get_launcher(cluster_info_json)
    description = launcher.describe()
    ssh = launcher.get_ssh_client()
    with ssh.tunnel(description["masters"][0]["public_ip"]) as t:
        t.copy_file(helpers.session_tempfile(ssh.key), "ssh_key")
        t.copy_file(helpers.session_tempfile(volume_script), "volume_script.sh")
        t.command(["chmod", "600", "ssh_key"])
        ssh_command = ["ssh", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        scp_command = ["scp", "-i", "ssh_key"] + ssh_client.SHARED_SSH_OPTS
        for private_agent in description["private_agents"]:
            target = "{}@{}".format(ssh.user, private_agent["private_ip"])
            t.command(scp_command + ["volume_script.sh", target + ":~/volume_script.sh"])
            t.command(ssh_command + [target, "sudo", "bash", "volume_script.sh"])
        # nasty hack until we add a better post-flight
        time.sleep(60)
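# MOUNT_VOLUME_PROFILES is assumed to be a list indexed by volume number, where a
# falsy entry means a plain mount volume (ext4, see `fs_type=p or "ext4"` above) and
# a truthy entry names both the mkfs filesystem type and the disk.source.profile to
# assign. The value below is a guess for illustration only: one plain ext4 volume
# and one xfs-backed profile mount volume.
MOUNT_VOLUME_PROFILES = [None, 'xfs']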
def upgrade_dcos(
        dcos_api_session: dcos_test_utils.dcos_api_session.DcosApiSession,
        onprem_cluster: dcos_test_utils.onprem.OnpremCluster,
        starting_version: str,
        installer_url: str,
        user_config: dict,
        platform: str) -> None:
    """ Performs the documented upgrade process on a cluster

    Note: This is intended for testing purposes only and is an irreversible process

    Args:
        dcos_api_session: API session object capable of authenticating with the
            upgraded DC/OS cluster
        onprem_cluster: SSH-backed onprem abstraction for the cluster to be upgraded
        starting_version: DC/OS version string of the cluster before the upgrade
        installer_url: URL for the installer to drive the upgrade
        user_config: this function already creates a viable upgrade config based
            on the onprem_cluster, but overrides can be provided via this dict
        platform: this must be `aws` as no other platform is currently supported
    """
    assert platform == 'aws', 'AWS is the only supported platform backend currently'

    ssh_client = onprem_cluster.ssh_client

    # kill previous genconf on bootstrap host if it is still running
    bootstrap_host = onprem_cluster.bootstrap_host.public_ip
    log.info('Killing any previous installer before starting upgrade')
    previous_installer = ssh_client.command(bootstrap_host, [
        'docker', 'ps', '--quiet', '--filter', 'name=dcos-genconf',
        '--filter', 'status=running'
    ]).decode().strip()
    if previous_installer:
        ssh_client.command(bootstrap_host, ['docker', 'kill', previous_installer])

    bootstrap_home = ssh_client.get_home_dir(bootstrap_host)

    log.info('Clearing out old installation files')
    genconf_dir = os.path.join(bootstrap_home, 'genconf')
    ssh_client.command(bootstrap_host, ['sudo', 'rm', '-rf', genconf_dir])
    ssh_client.command(bootstrap_host, ['mkdir', genconf_dir])

    installer_path = os.path.join(bootstrap_home, 'dcos_generate_config.sh')
    dcos_test_utils.onprem.download_dcos_installer(
        ssh_client, bootstrap_host, installer_path, installer_url)

    log.info('Starting ZooKeeper on the bootstrap node')
    zk_host = onprem_cluster.start_bootstrap_zk()
    # start the nginx that will host the bootstrap files
    bootstrap_url = 'http://' + onprem_cluster.start_bootstrap_nginx()

    with ssh_client.tunnel(bootstrap_host) as tunnel:
        log.info('Setting up upgrade config on bootstrap host')
        upgrade_config = {
            'cluster_name': 'My Upgraded DC/OS',
            'ssh_user': ssh_client.user,
            'master_discovery': 'static',
            'exhibitor_storage_backend': 'zookeeper',
            'exhibitor_zk_hosts': zk_host,
            'exhibitor_zk_path': '/exhibitor',
            'bootstrap_url': bootstrap_url,
            'rexray_config_preset': platform,
            'platform': platform,
            'master_list': [h.private_ip for h in onprem_cluster.masters],
            'agent_list': [h.private_ip for h in onprem_cluster.private_agents],
            'public_agent_list': [h.private_ip for h in onprem_cluster.public_agents]
        }
        upgrade_config.update(user_config)

        # transfer the config, ip-detect script, and ssh key
        tunnel.copy_file(
            session_tempfile(yaml.dump(upgrade_config).encode()),
            os.path.join(bootstrap_home, 'genconf/config.yaml'))
        tunnel.copy_file(
            session_tempfile(ssh_client.key.encode()),
            os.path.join(bootstrap_home, 'genconf/ssh_key'))
        tunnel.command(
            ['chmod', '600', os.path.join(bootstrap_home, 'genconf/ssh_key')])
        ip_detect_script = pkg_resources.resource_string(
            'dcos_test_utils', 'ip-detect/{}.sh'.format(platform)).decode('utf-8')
        tunnel.copy_file(
            session_tempfile(ip_detect_script.encode()),
            os.path.join(bootstrap_home, 'genconf/ip-detect'))

        log.info('Generating node upgrade script')
        upgrade_script_path = tunnel.command([
            'bash', installer_path, '--generate-node-upgrade-script ' + starting_version
        ]).decode('utf-8').splitlines()[-1].split("Node upgrade script URL: ", 1)[1]

        log.info('Editing node upgrade script...')
        # Remove docker (and associated journald) restart from the install
        # script. This prevents Docker-containerized tasks from being killed
        # during agent upgrades.
        tunnel.command([
            'sudo', 'sed', '-i',
            '-e', '"s/systemctl restart systemd-journald//g"',
            '-e', '"s/systemctl restart docker//g"',
            bootstrap_home + '/genconf/serve/dcos_install.sh'
        ])
        tunnel.command(['docker', 'restart', 'dcos-bootstrap-nginx'])

    # upgrading can finally start
    master_list = [host.public_ip for host in onprem_cluster.masters]
    private_agent_list = [host.public_ip for host in onprem_cluster.private_agents]
    public_agent_list = [host.public_ip for host in onprem_cluster.public_agents]
    upgrade_ordering = [
        # Upgrade masters in a random order.
        ('master', 'master', random.sample(master_list, len(master_list))),
        ('slave', 'agent', private_agent_list),
        ('slave_public', 'public agent', public_agent_list)
    ]
    logging.info('\n'.join(['Upgrade plan:'] + [
        '{} ({})'.format(host, role_name)
        for _, role_name, hosts in upgrade_ordering for host in hosts
    ]))
    for role, role_name, hosts in upgrade_ordering:
        log.info('Upgrading {} nodes: {}'.format(role_name, repr(hosts)))
        for host in hosts:
            log.info('Upgrading {}: {}'.format(role_name, repr(host)))
            ssh_client.command(host, [
                'curl', '--silent', '--verbose', '--show-error', '--fail',
                '--location', '--keepalive-time', '2', '--retry', '20',
                '--speed-limit', '100000', '--speed-time', '60',
                '--remote-name', upgrade_script_path
            ])
            ssh_client.command(host, ['sudo', 'bash', 'dcos_node_upgrade.sh'])
            wait_metric = {
                'master': 'registrar/log/recovered',
                'slave': 'slave/registered',
                'slave_public': 'slave/registered',
            }[role]
            log.info('Waiting for {} to rejoin the cluster...'.format(role_name))
            try:
                wait_for_mesos_metric(dcos_api_session, host, wait_metric, 1)
            except retrying.RetryError as exc:
                raise Exception(
                    'Timed out waiting for {} to rejoin the cluster after upgrade: {}'
                    .format(role_name, repr(host))) from exc
    dcos_api_session.wait_for_dcos()
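def _example_run_upgrade(dcos_api_session, onprem_cluster):
    """ Hypothetical driver (not in the original source) showing how upgrade_dcos
    above is intended to be invoked: the cluster's current version, an installer URL
    taken from the environment (as in the upgraded_dcos fixture), optional config
    overrides, and the only supported platform, 'aws'.
    """
    upgrade_dcos(
        dcos_api_session,
        onprem_cluster,
        starting_version=dcos_api_session.get_version(),
        installer_url=os.environ['TEST_UPGRADE_INSTALLER_URL'],
        user_config={},
        platform='aws')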