Ejemplo n.º 1
0
def start_agent(hostname):
    """ Starts an agent process at the host.

    :param hostname: host or IP of the machine to start the agent process.
    """
    start_cmd = "sudo systemctl start dcos-mesos-slave"
    run_command_on_agent(hostname, start_cmd)
Ejemplo n.º 2
0
def restore_iptables(host):
    """ Reconnect a previously partitioned node to the network.

        Restores the iptables rules saved earlier by ``save_iptables`` (in a
        file named ``iptables.rules``) and removes the backup file; a no-op
        when no backup file exists.

        :param host: host or IP of the machine to reconnect to the cluster
    """

    cmd = 'if [ -e iptables.rules ]; then sudo iptables-restore < iptables.rules && rm iptables.rules ; fi'
    run_command_on_agent(host, cmd)
Ejemplo n.º 3
0
def docker_ipv6_network_fixture():
    """ Fixture creating an IPv6 Docker bridge network on every agent.

    The network is created before the test (on yield) and removed again
    afterwards.
    """
    agents = get_agents()
    # Plain strings, not f-strings: there are no placeholders (fixes F541).
    network_cmd = "sudo docker network create --driver=bridge --ipv6 --subnet=fd01::/64 mesos-docker-ipv6-test"
    for agent in agents:
        run_command_on_agent(agent, network_cmd)
    yield
    for agent in agents:
        run_command_on_agent(agent, "sudo docker network rm mesos-docker-ipv6-test")
Ejemplo n.º 4
0
def docker_ipv6_network_fixture():
    """ Fixture creating an IPv6 Docker bridge network on every agent.

    The network is created before the test (on yield) and removed again
    afterwards.
    """
    agents = get_agents()
    # Plain strings, not f-strings: there are no placeholders (fixes F541).
    network_cmd = "sudo docker network create --driver=bridge --ipv6 --subnet=fd01::/64 mesos-docker-ipv6-test"
    for agent in agents:
        run_command_on_agent(agent, network_cmd)
    yield
    for agent in agents:
        run_command_on_agent(agent,
                             "sudo docker network rm mesos-docker-ipv6-test")
Ejemplo n.º 5
0
def delete_agent_log(hostname):
    """ Deletes the agent log at the host.  This is necessary if any changes
    occurred to the agent resources and the agent is restarted.

    :param hostname: host or IP of the machine to delete the agent log.
    """
    delete_cmd = "sudo rm -f /var/lib/mesos/slave/meta/slaves/latest"
    run_command_on_agent(hostname, delete_cmd)
Ejemplo n.º 6
0
def block_iptable_rules_for_seconds(host, port_number, sleep_seconds, block_input=True, block_output=True):
    """ For testing network partitions we alter iptables rules to block ports for some time.
        We do that as a single SSH command because otherwise it makes it hard to ensure that iptable rules are restored.
    """
    # Unique backup filename so concurrent invocations don't clobber each other.
    backup_file = 'iptables-{}.rules'.format(uuid.uuid4().hex)
    block_rules = iptables_block_string(block_input, block_output, port_number)
    # Save -> block -> sleep -> restore, all in one remote shell invocation.
    cmd = """
          if [ ! -e {backup} ] ; then sudo iptables-save > {backup} ; fi;
          {block}
          sleep {seconds};
          if [ -e {backup} ]; then sudo iptables-restore < {backup} && sudo rm {backup} ; fi
        """.format(backup=backup_file, seconds=sleep_seconds, block=block_rules)

    run_command_on_agent(host, cmd)
Ejemplo n.º 7
0
def kill_process_from_pid_file_on_host(hostname, pid_file='app.pid'):
    """ Retrieves the PID of a process from a pid file on host and kills it.

    :param hostname: the hostname or ip address of the host on which the process will be killed
    :param pid_file: pid file to use holding the pid number to kill
    """
    # The first command's status is intentionally ignored: a missing/empty pid
    # file simply makes the kill below fail, which is reported.
    _, pid = run_command_on_agent(hostname, 'cat {}'.format(pid_file))
    status, stdout = run_command_on_agent(hostname,
                                          "sudo kill -9 {}".format(pid))
    if status:
        print("Killed pid: {}".format(pid))
        # Only remove the pid file after a successful kill.
        run_command_on_agent(hostname, 'rm {}'.format(pid_file))
    else:
        # Fixed grammar: message previously read "Unable to killed pid".
        print("Unable to kill pid: {}".format(pid))
Ejemplo n.º 8
0
def test_external_volume():
    """ Deploys an app with a uniquely-named external volume, scales it to 0
    and back to 1 to verify the volume is reused, then removes the app and
    cleans up the volume on every private agent.
    """
    volume_name = "marathon-si-test-vol-{}".format(uuid.uuid4().hex)
    app_def = apps.external_volume_mesos_app()
    app_def["container"]["volumes"][0]["external"]["name"] = volume_name
    app_id = app_def['id']

    # Tested with root marathon since MoM doesn't have
    # --enable_features external_volumes option activated.
    # First deployment should create the volume since it has a unique name
    try:
        print('INFO: Deploying {} with external volume {}'.format(app_id, volume_name))
        client = marathon.create_client()
        client.add_app(app_def)
        deployment_wait(service_id=app_id)

        # Create the app: the volume should be successfully created
        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Scale down to 0
        print('INFO: Scaling {} to 0 instances'.format(app_id))
        client.stop_app(app_id)
        deployment_wait(service_id=app_id)

        # Scale up again: the volume should be successfully reused
        print('INFO: Scaling {} back to 1 instance'.format(app_id))
        client.scale_app(app_id, 1)
        deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Remove the app to be able to remove the volume
        print('INFO: Finally removing {}'.format(app_id))
        client.remove_app(app_id)
        deployment_wait(service_id=app_id)
    except Exception as e:
        print('Fail to test external volumes: {}'.format(e))
        raise  # bare raise preserves the original traceback (was `raise e`)
    finally:
        # Clean up after the test: external volumes are not destroyed by marathon or dcos
        # and have to be cleaned manually.
        cmd = 'sudo /opt/mesosphere/bin/dvdcli remove --volumedriver=rexray --volumename={}'.format(volume_name)
        removed = False
        for agent in get_private_agents():
            status, output = run_command_on_agent(agent, cmd)  # NOQA
            if status:
                removed = True
            else:
                # Bug fix: this failure message used to be printed
                # unconditionally, even when the removal succeeded.
                print('DEBUG: Failed to remove external volume with name={} on agent={}: {}'.format(
                    volume_name, agent, output))
        # Note: Removing the volume might fail sometimes because EC2 takes some time (~10min) to recognize that
        # the volume is not in use anymore hence preventing it's removal. This is a known pitfall: we log the error
        # and the volume should be cleaned up manually later.
        if not removed:
            print('WARNING: Failed to remove external volume with name={}'.format(volume_name))
        else:
            print('DEBUG: External volume with name={} successfully removed'.format(volume_name))
Ejemplo n.º 9
0
def test_external_volume():
    """ Deploys an app with a uniquely-named external volume, scales it to 0
    and back to 1 to verify the volume is reused, then removes the app and
    cleans up the volume on every private agent.
    """
    volume_name = "marathon-si-test-vol-{}".format(uuid.uuid4().hex)
    app_def = apps.external_volume_mesos_app()
    app_def["container"]["volumes"][0]["external"]["name"] = volume_name
    app_id = app_def['id']

    # Tested with root marathon since MoM doesn't have
    # --enable_features external_volumes option activated.
    # First deployment should create the volume since it has a unique name
    try:
        print('INFO: Deploying {} with external volume {}'.format(app_id, volume_name))
        client = marathon.create_client()
        client.add_app(app_def)
        deployment_wait(service_id=app_id)

        # Create the app: the volume should be successfully created
        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Scale down to 0
        print('INFO: Scaling {} to 0 instances'.format(app_id))
        client.stop_app(app_id)
        deployment_wait(service_id=app_id)

        # Scale up again: the volume should be successfully reused
        print('INFO: Scaling {} back to 1 instance'.format(app_id))
        client.scale_app(app_id, 1)
        deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Remove the app to be able to remove the volume
        print('INFO: Finally removing {}'.format(app_id))
        client.remove_app(app_id)
        deployment_wait(service_id=app_id)
    except Exception as e:
        print('Fail to test external volumes: {}'.format(e))
        raise  # bare raise preserves the original traceback (was `raise e`)
    finally:
        # Clean up after the test: external volumes are not destroyed by marathon or dcos
        # and have to be cleaned manually.
        cmd = 'sudo /opt/mesosphere/bin/dvdcli remove --volumedriver=rexray --volumename={}'.format(volume_name)
        removed = False
        for agent in get_private_agents():
            status, output = run_command_on_agent(agent, cmd)  # NOQA
            if status:
                removed = True
            else:
                # Bug fix: this failure message used to be printed
                # unconditionally, even when the removal succeeded.
                print('DEBUG: Failed to remove external volume with name={} on agent={}: {}'.format(
                    volume_name, agent, output))
        # Note: Removing the volume might fail sometimes because EC2 takes some time (~10min) to recognize that
        # the volume is not in use anymore hence preventing it's removal. This is a known pitfall: we log the error
        # and the volume should be cleaned up manually later.
        if not removed:
            print('WARNING: Failed to remove external volume with name={}'.format(volume_name))
        else:
            print('DEBUG: External volume with name={} successfully removed'.format(volume_name))
Ejemplo n.º 10
0
def kill_process_on_host(hostname, pattern):
    """ Kill the process matching pattern at ip

        :param hostname: the hostname or ip address of the host on which the process will be killed
        :param pattern: a regular expression matching the name of the process to kill
    """
    # grep -v grep excludes the grep process itself from the match.
    _, stdout = run_command_on_agent(
        hostname, "ps aux | grep -v grep | grep '{}'".format(pattern))
    # PID is the second whitespace-separated column of `ps aux` output.
    pids = [p.strip().split()[1] for p in stdout.splitlines()]

    for pid in pids:
        status, stdout = run_command_on_agent(hostname,
                                              "sudo kill -9 {}".format(pid))
        if status:
            print("Killed pid: {}".format(pid))
        else:
            # Fixed grammar: message previously read "Unable to killed pid".
            print("Unable to kill pid: {}".format(pid))
Ejemplo n.º 11
0
def block_iptable_rules_for_seconds(host,
                                    port_number,
                                    sleep_seconds,
                                    block_input=True,
                                    block_output=True):
    """ For testing network partitions we alter iptables rules to block ports for some time.
        We do that as a single SSH command because otherwise it makes it hard to ensure that iptable rules are restored.
    """
    # Random suffix keeps concurrent backups from colliding.
    backup = 'iptables-{}.rules'.format(uuid.uuid4().hex)
    rules = iptables_block_string(block_input, block_output, port_number)
    cmd = """
          if [ ! -e {backup} ] ; then sudo iptables-save > {backup} ; fi;
          {block}
          sleep {seconds};
          if [ -e {backup} ]; then sudo iptables-restore < {backup} && sudo rm {backup} ; fi
        """.format(backup=backup, seconds=sleep_seconds, block=rules)

    run_command_on_agent(host, cmd)
Ejemplo n.º 12
0
def archive_sandboxes():
    """ Fixture that archives the Mesos sandboxes of all private agents
    after the test run and downloads the tarballs.
    """
    # Nothing to setup
    yield
    logger.info('>>> Archiving Mesos sandboxes')
    # We tarball the sandboxes from all the agents first and download them afterwards
    for agent in get_private_agents():
        archive = 'sandbox_{}.tar.gz'.format(agent.replace(".", "_"))
        tar_cmd = 'sudo tar --exclude=provisioner -zcf {} /var/lib/mesos/slave'.format(archive)
        status, output = run_command_on_agent(agent, tar_cmd)  # NOQA
        if status:
            copy_file_from_agent(agent, archive)
        else:
            logger.warning('Failed to tarball the sandbox from the agent={}, output={}'.format(agent, output))
Ejemplo n.º 13
0
def kill_process_on_host(hostname, pattern):
    """ Kill the process matching pattern at ip

        :param hostname: the hostname or ip address of the host on which the process will be killed
        :param pattern: a regular expression matching the name of the process to kill
        :return: IDs of processes that got either killed or terminated on their own
    """
    # Single remote pipeline: list matching PIDs, echo them via tee and kill
    # them in one shot so the returned list matches what was targeted.
    kill_cmd = "ps aux | grep -v grep | grep '{}' | awk '{{ print $2 }}' | tee >(xargs sudo kill -9)".format(pattern)
    status, stdout = run_command_on_agent(hostname, kill_cmd)
    pids = [line.strip() for line in stdout.splitlines()]
    if not pids:
        logger.info("Killed no pids")
    else:
        logger.info("Killed pids: {}".format(", ".join(pids)))
    return pids
Ejemplo n.º 14
0
def kill_process_on_host(hostname, pattern):
    """ Kill the process matching pattern at ip

        :param hostname: the hostname or ip address of the host on which the process will be killed
        :param pattern: a regular expression matching the name of the process to kill
        :return: IDs of processes that got either killed or terminated on their own
    """
    # One remote pipeline: extract PIDs with awk, report them via tee while
    # xargs kills them, so the output reflects exactly what was targeted.
    pipeline = ("ps aux | grep -v grep | grep '{}' | awk '{{ print $2 }}' "
                "| tee >(xargs sudo kill -9)").format(pattern)
    status, stdout = run_command_on_agent(hostname, pipeline)
    pids = [entry.strip() for entry in stdout.splitlines()]
    if pids:
        logger.info("Killed pids: {}".format(", ".join(pids)))
    else:
        logger.info("Killed no pids")
    return pids
Ejemplo n.º 15
0
def archive_sandboxes():
    """ Fixture that archives the Mesos sandboxes of all private agents
    after the test run and downloads the tarballs.
    """
    # Nothing to setup
    yield
    logger.info('>>> Archiving Mesos sandboxes')
    # We tarball the sandboxes from all the agents first and download them afterwards
    for agent in get_private_agents():
        tarball = 'sandbox_{}.tar.gz'.format(agent.replace(".", "_"))
        cmd = 'sudo tar --exclude=provisioner -zcf {} /var/lib/mesos/slave'.format(tarball)
        status, output = run_command_on_agent(agent, cmd)  # NOQA
        if not status:
            logger.warning('Failed to tarball the sandbox from the agent={}, output={}'.format(agent, output))
        else:
            copy_file_from_agent(agent, tarball)
Ejemplo n.º 16
0
def test_docker_port_mappings():
    """Tests that Docker ports are mapped and are accessible from the host."""

    app_def = apps.docker_http_server(app_id='/docker-port-mapping-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    task = client.get_tasks(app_id)[0]
    host = task['host']
    port = task['ports'][0]
    # Fetch /.dockerenv through the mapped host port, capturing only the
    # HTTP status code.
    cmd = r'curl -s -w "%{http_code}"' + ' {}:{}/.dockerenv'.format(host, port)
    status, output = run_command_on_agent(host, cmd)

    assert status and output == "200", "HTTP status code is {}, but 200 was expected".format(output)
Ejemplo n.º 17
0
def test_docker_port_mappings():
    """Tests that Docker ports are mapped and are accessible from the host."""

    app_def = apps.docker_http_server(app_id='/docker-port-mapping-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    first_task = client.get_tasks(app_id)[0]
    host = first_task['host']
    port = first_task['ports'][0]
    # Request /.dockerenv through the mapped host port; -w prints only the
    # HTTP status code.
    curl_cmd = r'curl -s -w "%{http_code}"' + ' {}:{}/.dockerenv'.format(host, port)
    status, output = run_command_on_agent(host, curl_cmd)

    assert status and output == "200", "HTTP status code is {}, but 200 was expected".format(output)
Ejemplo n.º 18
0
def test_default_user():
    """Ensures a task is started as root by default."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    # Bug fix: run_command_on_agent returns a (status, output) tuple
    # (see its other call sites in this file); the old code assigned the
    # whole tuple to `success`, making the assert below always pass.
    status, _ = run_command_on_agent(host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert status, "The app is running as non-root"
Ejemplo n.º 19
0
def test_default_user():
    """Ensures a task is started as root by default."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)

    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    user = app.get('user')
    assert user is None, "User is {}, but it should not have been set".format(user)

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    # Bug fix: run_command_on_agent returns a (status, output) tuple
    # (see its other call sites in this file); the old code assigned the
    # whole tuple to `success`, making the assert below always pass.
    status, _ = run_command_on_agent(host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")
    assert status, "The app is running as non-root"
Ejemplo n.º 20
0
def cpus_on_agent(hostname):
    """Detects number of cores on an agent.

    :param hostname: host or IP of the agent
    :return: processor count parsed from /proc/cpuinfo
    """
    cpu_count_cmd = "cat /proc/cpuinfo | grep processor | wc -l"
    status, output = run_command_on_agent(hostname, cpu_count_cmd, noisy=False)
    return int(output)
Ejemplo n.º 21
0
def save_iptables(host):
    """ Saves iptables firewall rules such they can be restored.

    :param host: host or IP of the machine whose rules are saved
    """
    # Only save once (skip if a backup already exists); `iptables -L` first
    # verifies iptables is usable before writing the backup.
    save_cmd = 'if [ ! -e iptables.rules ] ; then sudo iptables -L > /dev/null && sudo iptables-save > iptables.rules ; fi'
    run_command_on_agent(host, save_cmd)
Ejemplo n.º 22
0
def allow_all_traffic(host):
    """ Opens up iptables on host to allow all traffic.

    Sets the default policy of the INPUT, OUTPUT and FORWARD chains to ACCEPT.

    :param host: host or IP of the machine to open up
    """
    accept_cmd = 'sudo iptables --policy INPUT ACCEPT && sudo iptables --policy OUTPUT ACCEPT && sudo iptables --policy FORWARD ACCEPT'  # NOQA E501
    run_command_on_agent(host, accept_cmd)
Ejemplo n.º 23
0
def flush_all_rules(host):
    """ Flushes the iptables rules of the INPUT chain.

        NOTE(review): despite the name, only the INPUT chain is flushed
        (``iptables -F INPUT``), not all chains.

        :param host: host or IP of the machine whose rules are flushed
    """
    run_command_on_agent(host, 'sudo iptables -F INPUT')
Ejemplo n.º 24
0
def run_iptables(host, rule):
    """ iptables is challenging to abstract.  This function takes a rule
        '-I INPUT -p tcp --dport 22 -j ACCEPT' and runs it on the agent.

    :param host: host or IP of the machine to run the rule on
    :param rule: iptables arguments to append after `sudo iptables `
    """
    run_command_on_agent(host, 'sudo iptables {}'.format(rule))
Ejemplo n.º 25
0
def cpus_on_agent(hostname):
    """Detects number of cores on an agent.

    :param hostname: host or IP of the agent
    :return: processor count parsed from /proc/cpuinfo
    """
    status, output = run_command_on_agent(
        hostname, "cat /proc/cpuinfo | grep processor | wc -l", noisy=False)
    return int(output)
Ejemplo n.º 26
0
def restart_agent_node(hostname):
    """ Restarts the agent node.

    :param hostname: host or IP of the machine to reboot
    """
    reboot_cmd = "sudo /sbin/shutdown -r now"
    run_command_on_agent(hostname, reboot_cmd)