def start_agent(hostname):
    """Start the Mesos agent (dcos-mesos-slave) service on the given host.

    :param hostname: host or IP of the machine on which to start the agent process.
    """
    start_cmd = "sudo systemctl start dcos-mesos-slave"
    run_command_on_agent(hostname, start_cmd)
def restore_iptables(host):
    """Reconnect a previously partitioned node to the network.

    Restores the iptables rules previously written to ``iptables.rules``
    (see ``save_iptables``) and deletes the backup file afterwards so a later
    save/restore cycle starts from a clean slate.

    :param host: host or IP of the machine to reconnect to the cluster
    """
    # Doc fix: the docstring previously referred to a nonexistent ``hostname``
    # parameter; the actual parameter is ``host``.
    cmd = 'if [ -e iptables.rules ]; then sudo iptables-restore < iptables.rules && rm iptables.rules ; fi'
    run_command_on_agent(host, cmd)
def docker_ipv6_network_fixture():
    """Fixture that creates an IPv6 Docker bridge network on every agent.

    The ``mesos-docker-ipv6-test`` network is created on each agent before the
    test runs (up to ``yield``) and removed again during teardown.
    """
    agents = get_agents()
    # Fix: these strings contain no placeholders, so the f-string prefixes
    # were pointless; plain literals are used instead (same runtime strings).
    network_cmd = "sudo docker network create --driver=bridge --ipv6 --subnet=fd01::/64 mesos-docker-ipv6-test"
    for agent in agents:
        run_command_on_agent(agent, network_cmd)

    yield

    for agent in agents:
        run_command_on_agent(agent, "sudo docker network rm mesos-docker-ipv6-test")
def docker_ipv6_network_fixture():
    """Fixture that creates an IPv6 Docker bridge network on every agent.

    The ``mesos-docker-ipv6-test`` network is created on each agent before the
    test runs (up to ``yield``) and removed again during teardown.
    """
    agents = get_agents()
    # Fix: these strings contain no placeholders, so the f-string prefixes
    # were pointless; plain literals are used instead (same runtime strings).
    network_cmd = "sudo docker network create --driver=bridge --ipv6 --subnet=fd01::/64 mesos-docker-ipv6-test"
    for agent in agents:
        run_command_on_agent(agent, network_cmd)

    yield

    for agent in agents:
        run_command_on_agent(agent, "sudo docker network rm mesos-docker-ipv6-test")
def delete_agent_log(hostname):
    """Delete the Mesos agent's recovery metadata on the host.

    This is necessary if any changes occurred to the agent resources and the
    agent is restarted.

    :param hostname: host or IP of the machine to delete the agent log.
    """
    meta_path = "/var/lib/mesos/slave/meta/slaves/latest"
    run_command_on_agent(hostname, "sudo rm -f " + meta_path)
def block_iptable_rules_for_seconds(host, port_number, sleep_seconds, block_input=True, block_output=True):
    """Temporarily block a port on the host via iptables.

    For testing network partitions we alter iptables rules to block ports for
    some time. Everything runs as a single SSH command because otherwise it is
    hard to ensure that the iptables rules are restored afterwards.
    """
    backup = 'iptables-{}.rules'.format(uuid.uuid4().hex)
    block = iptables_block_string(block_input, block_output, port_number)
    cmd = """
        if [ ! -e {backup} ] ; then sudo iptables-save > {backup} ; fi;
        {block}
        sleep {seconds};
        if [ -e {backup} ]; then sudo iptables-restore < {backup} && sudo rm {backup} ; fi
        """.format(backup=backup, seconds=sleep_seconds, block=block)
    run_command_on_agent(host, cmd)
def kill_process_from_pid_file_on_host(hostname, pid_file='app.pid'):
    """Retrieve the PID of a process from a pid file on the host and kill it.

    :param hostname: the hostname or ip address of the host on which the process will be killed
    :param pid_file: pid file to use holding the pid number to kill
    """
    status, pid = run_command_on_agent(hostname, 'cat {}'.format(pid_file))
    # Fix: `cat` output typically ends with a newline; strip it so the kill
    # command and the log messages carry a clean PID.
    pid = pid.strip()
    status, stdout = run_command_on_agent(hostname, "sudo kill -9 {}".format(pid))
    if status:
        print("Killed pid: {}".format(pid))
        # Remove the stale pid file only after a successful kill.
        run_command_on_agent(hostname, 'rm {}'.format(pid_file))
    else:
        # Fix: corrected message grammar ("Unable to killed" -> "Unable to kill").
        print("Unable to kill pid: {}".format(pid))
def test_external_volume():
    """Deploy, scale, and remove an app that uses an external (rexray) volume.

    Verifies the volume is created on first deployment, reused after scaling
    down to 0 and back up, and cleaned up manually afterwards.
    """
    volume_name = "marathon-si-test-vol-{}".format(uuid.uuid4().hex)
    app_def = apps.external_volume_mesos_app()
    app_def["container"]["volumes"][0]["external"]["name"] = volume_name
    app_id = app_def['id']

    # Tested with root marathon since MoM doesn't have
    # --enable_features external_volumes option activated.
    # First deployment should create the volume since it has a unique name
    try:
        print('INFO: Deploying {} with external volume {}'.format(app_id, volume_name))
        client = marathon.create_client()
        client.add_app(app_def)
        deployment_wait(service_id=app_id)

        # Create the app: the volume should be successfully created
        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Scale down to 0
        print('INFO: Scaling {} to 0 instances'.format(app_id))
        client.stop_app(app_id)
        deployment_wait(service_id=app_id)

        # Scale up again: the volume should be successfully reused
        print('INFO: Scaling {} back to 1 instance'.format(app_id))
        client.scale_app(app_id, 1)
        deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Remove the app to be able to remove the volume
        print('INFO: Finally removing {}'.format(app_id))
        client.remove_app(app_id)
        deployment_wait(service_id=app_id)
    except Exception as e:
        print('Fail to test external volumes: {}'.format(e))
        # Fix: bare `raise` preserves the original traceback (re-raising `e`
        # would rewrite it from here).
        raise
    finally:
        # Clean up after the test: external volumes are not destroyed by marathon or dcos
        # and have to be cleaned manually.
        cmd = 'sudo /opt/mesosphere/bin/dvdcli remove --volumedriver=rexray --volumename={}'.format(volume_name)
        removed = False
        for agent in get_private_agents():
            status, output = run_command_on_agent(agent, cmd)  # NOQA
            if status:
                removed = True
            else:
                # Fix: only log a per-agent failure when the removal actually
                # failed (previously this printed unconditionally).
                print('DEBUG: Failed to remove external volume with name={} on agent={}: {}'.format(
                    volume_name, agent, output))
        # Note: Removing the volume might fail sometimes because EC2 takes some time (~10min) to recognize that
        # the volume is not in use anymore hence preventing it's removal. This is a known pitfall: we log the error
        # and the volume should be cleaned up manually later.
        if not removed:
            print('WARNING: Failed to remove external volume with name={}'.format(volume_name))
        else:
            print('DEBUG: External volume with name={} successfully removed'.format(volume_name))
def test_external_volume():
    """Deploy, scale, and remove an app that uses an external (rexray) volume.

    Verifies the volume is created on first deployment, reused after scaling
    down to 0 and back up, and cleaned up manually afterwards.
    """
    volume_name = "marathon-si-test-vol-{}".format(uuid.uuid4().hex)
    app_def = apps.external_volume_mesos_app()
    app_def["container"]["volumes"][0]["external"]["name"] = volume_name
    app_id = app_def['id']

    # Tested with root marathon since MoM doesn't have
    # --enable_features external_volumes option activated.
    # First deployment should create the volume since it has a unique name
    try:
        print('INFO: Deploying {} with external volume {}'.format(app_id, volume_name))
        client = marathon.create_client()
        client.add_app(app_def)
        deployment_wait(service_id=app_id)

        # Create the app: the volume should be successfully created
        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Scale down to 0
        print('INFO: Scaling {} to 0 instances'.format(app_id))
        client.stop_app(app_id)
        deployment_wait(service_id=app_id)

        # Scale up again: the volume should be successfully reused
        print('INFO: Scaling {} back to 1 instance'.format(app_id))
        client.scale_app(app_id, 1)
        deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Remove the app to be able to remove the volume
        print('INFO: Finally removing {}'.format(app_id))
        client.remove_app(app_id)
        deployment_wait(service_id=app_id)
    except Exception as e:
        print('Fail to test external volumes: {}'.format(e))
        # Fix: bare `raise` preserves the original traceback (re-raising `e`
        # would rewrite it from here).
        raise
    finally:
        # Clean up after the test: external volumes are not destroyed by marathon or dcos
        # and have to be cleaned manually.
        cmd = 'sudo /opt/mesosphere/bin/dvdcli remove --volumedriver=rexray --volumename={}'.format(volume_name)
        removed = False
        for agent in get_private_agents():
            status, output = run_command_on_agent(agent, cmd)  # NOQA
            if status:
                removed = True
            else:
                # Fix: only log a per-agent failure when the removal actually
                # failed (previously this printed unconditionally).
                print('DEBUG: Failed to remove external volume with name={} on agent={}: {}'.format(
                    volume_name, agent, output))
        # Note: Removing the volume might fail sometimes because EC2 takes some time (~10min) to recognize that
        # the volume is not in use anymore hence preventing it's removal. This is a known pitfall: we log the error
        # and the volume should be cleaned up manually later.
        if not removed:
            print('WARNING: Failed to remove external volume with name={}'.format(volume_name))
        else:
            print('DEBUG: External volume with name={} successfully removed'.format(volume_name))
def kill_process_on_host(hostname, pattern):
    """Kill every process on the host whose command line matches ``pattern``.

    :param hostname: the hostname or ip address of the host on which the process will be killed
    :param pattern: a regular expression matching the name of the process to kill
    """
    ps_cmd = "ps aux | grep -v grep | grep '{}'".format(pattern)
    status, stdout = run_command_on_agent(hostname, ps_cmd)
    # The second whitespace-separated column of `ps aux` output is the PID.
    pids = [line.strip().split()[1] for line in stdout.splitlines()]
    for pid in pids:
        status, stdout = run_command_on_agent(hostname, "sudo kill -9 {}".format(pid))
        if not status:
            print("Unable to killed pid: {}".format(pid))
        else:
            print("Killed pid: {}".format(pid))
def block_iptable_rules_for_seconds(host, port_number, sleep_seconds, block_input=True, block_output=True):
    """Temporarily block a port on the host via iptables.

    For testing network partitions we alter iptables rules to block ports for
    some time. Everything runs as a single SSH command because otherwise it is
    hard to ensure that the iptables rules are restored afterwards.
    """
    backup = 'iptables-{}.rules'.format(uuid.uuid4().hex)
    block = iptables_block_string(block_input, block_output, port_number)
    cmd = """
        if [ ! -e {backup} ] ; then sudo iptables-save > {backup} ; fi;
        {block}
        sleep {seconds};
        if [ -e {backup} ]; then sudo iptables-restore < {backup} && sudo rm {backup} ; fi
        """.format(backup=backup, seconds=sleep_seconds, block=block)
    run_command_on_agent(host, cmd)
def archive_sandboxes():
    """Teardown fixture: tarball the Mesos sandbox of every private agent and
    download the archives after the test run.
    """
    # Nothing to setup
    yield
    logger.info('>>> Archiving Mesos sandboxes')
    # We tarball the sandboxes from all the agents first and download them afterwards
    for agent in get_private_agents():
        archive = 'sandbox_{}.tar.gz'.format(agent.replace(".", "_"))
        tar_cmd = 'sudo tar --exclude=provisioner -zcf {} /var/lib/mesos/slave'.format(archive)
        status, output = run_command_on_agent(agent, tar_cmd)  # NOQA
        if not status:
            logger.warning('Failed to tarball the sandbox from the agent={}, output={}'.format(agent, output))
        else:
            copy_file_from_agent(agent, archive)
def kill_process_on_host(hostname, pattern):
    """Kill every process on the host whose command line matches ``pattern``.

    :param hostname: the hostname or ip address of the host on which the process will be killed
    :param pattern: a regular expression matching the name of the process to kill
    :return: IDs of processes that got either killed or terminated on their own
    """
    # `tee >(...)` echoes the matching PIDs back over SSH while xargs kills
    # them; this requires bash process substitution on the agent.
    cmd = "ps aux | grep -v grep | grep '{}' | awk '{{ print $2 }}' | tee >(xargs sudo kill -9)".format(pattern)
    # Fix: the command's exit status was bound to an unused local; discard it
    # explicitly since the PID list alone determines the outcome.
    _, stdout = run_command_on_agent(hostname, cmd)
    pids = [p.strip() for p in stdout.splitlines()]
    if pids:
        logger.info("Killed pids: {}".format(", ".join(pids)))
    else:
        logger.info("Killed no pids")
    return pids
def kill_process_on_host(hostname, pattern):
    """Kill every process on the host whose command line matches ``pattern``.

    :param hostname: the hostname or ip address of the host on which the process will be killed
    :param pattern: a regular expression matching the name of the process to kill
    :return: IDs of processes that got either killed or terminated on their own
    """
    # `tee >(...)` echoes the matching PIDs back while xargs kills them.
    kill_cmd = ("ps aux | grep -v grep | grep '{}' "
                "| awk '{{ print $2 }}' | tee >(xargs sudo kill -9)").format(pattern)
    status, stdout = run_command_on_agent(hostname, kill_cmd)
    pids = [line.strip() for line in stdout.splitlines()]
    if not pids:
        logger.info("Killed no pids")
    else:
        logger.info("Killed pids: {}".format(", ".join(pids)))
    return pids
def archive_sandboxes():
    """Teardown fixture: tarball the Mesos sandbox of every private agent and
    download the archives after the test run.
    """
    # Nothing to setup
    yield
    logger.info('>>> Archiving Mesos sandboxes')
    # We tarball the sandboxes from all the agents first and download them afterwards
    for agent in get_private_agents():
        archive = 'sandbox_{}.tar.gz'.format(agent.replace(".", "_"))
        tar_cmd = 'sudo tar --exclude=provisioner -zcf {} /var/lib/mesos/slave'.format(archive)
        status, output = run_command_on_agent(agent, tar_cmd)  # NOQA
        if not status:
            logger.warning('Failed to tarball the sandbox from the agent={}, output={}'.format(agent, output))
        else:
            copy_file_from_agent(agent, archive)
def test_docker_port_mappings():
    """Tests that Docker ports are mapped and are accessible from the host."""
    app_def = apps.docker_http_server(app_id='/docker-port-mapping-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # Probe a file that only exists inside a Docker container; `-w` appends
    # the HTTP status code to curl's (empty) body output.
    cmd = r'curl -s -w "%{http_code}"' + ' {}:{}/.dockerenv'.format(host, port)
    status, output = run_command_on_agent(host, cmd)
    assert status and output == "200", "HTTP status code is {}, but 200 was expected".format(output)
def test_docker_port_mappings():
    """Tests that Docker ports are mapped and are accessible from the host."""
    app_def = apps.docker_http_server(app_id='/docker-port-mapping-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # Probe a file that only exists inside a Docker container; `-w` appends
    # the HTTP status code to curl's (empty) body output.
    cmd = r'curl -s -w "%{http_code}"' + ' {}:{}/.dockerenv'.format(host, port)
    status, output = run_command_on_agent(host, cmd)
    assert status and output == "200", "HTTP status code is {}, but 200 was expected".format(output)
def test_default_user():
    """Ensures a task is started as root by default."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    # Fix: run_command_on_agent returns a (status, output) pair everywhere
    # else in this file; binding the whole tuple made `assert success` always
    # pass since a non-empty tuple is truthy. Unpack so the status is checked.
    success, output = run_command_on_agent(
        host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert success, "The app is running as non-root"
def test_default_user():
    """Ensures a task is started as root by default."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    # Fix: run_command_on_agent returns a (status, output) pair everywhere
    # else in this file; binding the whole tuple made `assert success` always
    # pass since a non-empty tuple is truthy. Unpack so the status is checked.
    success, output = run_command_on_agent(
        host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert success, "The app is running as non-root"
def cpus_on_agent(hostname):
    """Return the number of CPU cores the agent reports in /proc/cpuinfo."""
    count_cmd = "cat /proc/cpuinfo | grep processor | wc -l"
    status, output = run_command_on_agent(hostname, count_cmd, noisy=False)
    return int(output)
def save_iptables(host):
    """Save the host's iptables firewall rules so they can be restored later.

    No-op if a backup file already exists, so an earlier snapshot is never
    overwritten.
    """
    backup_cmd = 'if [ ! -e iptables.rules ] ; then sudo iptables -L > /dev/null && sudo iptables-save > iptables.rules ; fi'
    run_command_on_agent(host, backup_cmd)
def allow_all_traffic(host):
    """Open up iptables on the host by setting ACCEPT policies on the
    INPUT, OUTPUT and FORWARD chains.
    """
    accept_cmd = 'sudo iptables --policy INPUT ACCEPT && sudo iptables --policy OUTPUT ACCEPT && sudo iptables --policy FORWARD ACCEPT'  # NOQA E501
    run_command_on_agent(host, accept_cmd)
def flush_all_rules(host):
    """Flush the iptables rules on the INPUT chain.

    NOTE(review): despite the function name, only the INPUT chain is flushed
    (``iptables -F INPUT``), not all chains — the docstring previously claimed
    "all the iptables rules". Renaming would break callers, so only the
    documentation is corrected.
    """
    run_command_on_agent(host, 'sudo iptables -F INPUT')
def run_iptables(host, rule):
    """Run an arbitrary iptables rule on the agent.

    iptables is challenging to abstract, so this simply takes a rule such as
    '-I INPUT -p tcp --dport 22 -j ACCEPT' and executes it with sudo.
    """
    run_command_on_agent(host, 'sudo iptables {}'.format(rule))
def cpus_on_agent(hostname):
    """Return the number of CPU cores the agent reports in /proc/cpuinfo."""
    status, output = run_command_on_agent(
        hostname,
        "cat /proc/cpuinfo | grep processor | wc -l",
        noisy=False)
    return int(output)
def restart_agent_node(hostname):
    """Reboot the agent node immediately."""
    reboot_cmd = "sudo /sbin/shutdown -r now"
    run_command_on_agent(hostname, reboot_cmd)