Example #1
0
def test_memory_limit_exceeded(kind, client):
    """An application fails when a container outgrows its memory allocation.

    ``kind`` selects where the memory-hungry script runs: ``'master'`` puts
    it on the application master, any other value puts it in a service
    container. The logs must mention the memory violation in both cases; for
    the master case the application report diagnostics must mention it too.
    """
    on_master = kind == 'master'

    # The script grabs ~256 MB, well beyond the 128 MB granted below.
    hungry_script = ('python -c "b = bytearray(int(256e6)); '
                     'import time; time.sleep(10)"')
    container_kwargs = {
        'resources': skein.Resources(memory=128, vcores=1),
        'script': hungry_script,
    }

    if on_master:
        master = skein.Master(**container_kwargs)
        services = None
        expected_text = "memory limit"
    else:
        master = None
        services = {'service': skein.Service(**container_kwargs)}
        expected_text = "memory used"

    spec = skein.ApplicationSpec(
        name="test_memory_limit_exceeded_%s" % kind,
        queue="default",
        master=master,
        services=services,
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        final_state = wait_for_completion(client, app_id)
        assert final_state == "FAILED"

    assert expected_text in get_logs(app_id)

    if on_master:
        diagnostics = client.application_report(app_id).diagnostics
        assert 'memory limit' in diagnostics
Example #2
0
def test_security_specified(client):
    """Credentials given on the master are used for the application.

    Checks that the running application exposes the very same ``Security``
    object, that connecting with it works, that connecting without it is
    rejected, and that the specification fetched from the application
    references the credentials as ``hdfs`` files rather than raw bytes.
    """
    credentials = skein.Security.new_credentials()
    driver = skein.Master(security=credentials, script='sleep infinity')
    spec = skein.ApplicationSpec(name="test_security_specified",
                                 master=driver)

    with run_application(client, spec=spec) as app:
        assert app.security is credentials
        assert app.security != client.security

        remote_spec = app.get_specification()

        # Smoketest: a connection using the custom credentials can talk
        trusted = client.connect(app.id, security=credentials)
        trusted.get_specification()

        # A connection without the custom credentials must be refused
        untrusted = client.connect(app.id)
        with pytest.raises(skein.ConnectionError):
            untrusted.get_specification()

        app.shutdown()

    uploaded = remote_spec.master.security
    assert uploaded.cert_bytes is None
    assert uploaded.key_bytes is None
    for credential_file in (uploaded.cert_file, uploaded.key_file):
        assert credential_file.source.startswith('hdfs')
Example #3
0
    def _build_specification(self):
        """Assemble the ``skein.ApplicationSpec`` describing this application.

        The master script is rendered from ``self.script_template``, resources
        come from ``self.mem_limit`` / ``self.cpu_limit``, and a fresh set of
        security credentials is generated on every call.

        Returns:
            skein.ApplicationSpec: spec named ``'jupyterhub'`` with only a
            master (no services) defined.
        """
        template_fields = {
            'prologue': self.prologue,
            'singleuser_command': self.singleuser_command,
            'epilogue': self.epilogue,
        }
        launch_script = self.script_template.format(**template_fields)

        # Memory is passed as a string with an explicit "b" unit suffix
        limits = skein.Resources(memory='%d b' % self.mem_limit,
                                 vcores=self.cpu_limit)

        credentials = skein.Security.new_credentials()

        # Entries may be plain dicts or ready-made File objects
        localized = {}
        for name, entry in self.localize_files.items():
            if isinstance(entry, dict):
                entry = skein.File.from_dict(entry)
            localized[name] = entry

        driver = skein.Master(
            resources=limits,
            files=localized,
            env=self.get_env(),
            script=launch_script,
            security=credentials,
        )

        return skein.ApplicationSpec(
            name='jupyterhub',
            queue=self.queue,
            user=self.user.name,
            master=driver,
            delegation_token_providers=self.delegation_token_providers,
        )
Example #4
0
def test_master_driver_shutdown_sequence(kind, master_cmd, service_cmd,
                                         client, tmpdir):
    """Final application state follows the driver/service outcome.

    ``kind`` names the scenario; names ending in ``'succeeds'`` expect a
    ``SUCCEEDED`` application, all others expect ``FAILED``. Only the
    ``'service_succeeds'`` scenario needs a live connection, because there the
    application keeps running until it is shut down explicitly.
    """
    driver = skein.Master(script=master_cmd)
    worker = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        script=service_cmd,
    )
    spec = skein.ApplicationSpec(
        name="test_master_driver_shutdown_sequence_%s" % kind,
        master=driver,
        services={'service': worker},
    )

    if kind.endswith('succeeds'):
        expected_state = 'SUCCEEDED'
    else:
        expected_state = 'FAILED'

    if kind != 'service_succeeds':
        # service_fails results in immediate failure
        # driver_succeeds results in immediate success
        # driver_fails results in immediate failure
        with run_application(client, spec=spec, connect=False) as app_id:
            assert wait_for_completion(client, app_id) == expected_state
        return

    with run_application(client, spec=spec) as app:
        wait_for_containers(app, 1, states=['SUCCEEDED'])
        assert len(app.get_containers()) == 0
        # App hangs around until driver completes
        app.shutdown()
        assert wait_for_completion(client, app.id) == expected_state
Example #5
0
def test_allow_failures_max_restarts(client, allow_failures):
    """Exhausting ``max_restarts`` only fails the app without ``allow_failures``.

    The service always exits with status 1 and may be restarted twice. With
    ``allow_failures`` set the application must stay responsive after the
    third failure and succeed on manual shutdown; otherwise the application
    itself must end up ``FAILED``.
    """
    app_name = ("test_max_restarts_allow_failures_%s"
                % str(allow_failures).lower())
    driver = skein.Master(script="sleep infinity")
    failing_service = skein.Service(
        instances=1,
        max_restarts=2,
        allow_failures=allow_failures,
        resources=skein.Resources(memory=128, vcores=1),
        script="exit 1",
    )
    spec = skein.ApplicationSpec(name=app_name,
                                 master=driver,
                                 services={'myservice': failing_service})

    with run_application(client, spec=spec) as app:
        if not allow_failures:
            # Service failed 3 times and then terminates
            assert wait_for_completion(client, app.id) == 'FAILED'
        else:
            # Service failed 3 times, doesn't keep trying to run more
            wait_for_containers(app, 3, states=['FAILED'])
            # Give it a moment, then check the app still answers requests
            time.sleep(0.5)
            app.get_specification()
            # Shutdown manually
            app.shutdown()
            assert wait_for_completion(client, app.id) == 'SUCCEEDED'
Example #6
0
def test_hadoop3_resource(client):
    """Requesting a GPU resource is rejected at submission with ``ValueError``.

    The expected error text depends on the module-level ``HADOOP3`` flag:
    when it is set the message names the ``yarn.io/gpu`` resource, otherwise
    it states that custom resources are not supported.
    """
    gpu_resources = skein.Resources(memory='32 MiB', vcores=1, gpus=1)
    spec = skein.ApplicationSpec(
        name="test_hadoop3_resources",
        master=skein.Master(resources=gpu_resources,
                            script="sleep infinity"),
    )

    with pytest.raises(ValueError) as exc:
        client.submit(spec)

    message = str(exc.value)
    if HADOOP3:
        assert "Resource 'yarn.io/gpu'" in message
    else:
        # Previously a bare string literal was asserted here, which is always
        # truthy and therefore verified nothing about the error message.
        assert "Custom resources not supported" in message
Example #7
0
def test_add_container(client):
    """Containers can be added on demand, optionally overriding env vars.

    The service script echoes ``MYENV``/``MYENV2`` prefixed with the container
    id and exits non-zero only when ``MYENV`` is ``"bar"``. A container added
    with that override therefore fails, is restarted once (``max_restarts=1``)
    and then takes the whole application down.
    """
    script = '\n'.join([
        'echo "$SKEIN_CONTAINER_ID - MYENV=$MYENV"',
        'echo "$SKEIN_CONTAINER_ID - MYENV2=$MYENV2"',
        'if [[ "$MYENV" == "bar" ]]; then',
        '  exit 1',
        'else',
        '  exit 0',
        'fi',
    ])

    test_service = skein.Service(
        instances=0,
        resources=skein.Resources(memory=32, vcores=1),
        env={'MYENV': 'foo', 'MYENV2': 'baz'},
        max_restarts=1,
        script=script,
    )
    spec = skein.ApplicationSpec(
        name="test_add_container",
        master=skein.Master(script="sleep infinity"),
        services={'test': test_service},
    )

    with run_application(client, spec=spec) as app:
        # Add a container using the service's default environment
        first = app.add_container('test')
        assert first.instance == 0
        wait_for_containers(app, 1, states=['RUNNING', 'SUCCEEDED'])

        # Non-existent service
        with pytest.raises(ValueError):
            app.add_container('foobar')

        # Add container with override for MYENV
        second = app.add_container('test', {'MYENV': 'bar'})
        assert second.instance == 1

        # The new env var triggers a failure, should fail twice,
        # then fail the whole application
        assert wait_for_completion(client, app.id) == 'FAILED'

    logs = get_logs(app.id)
    expected_lines = (
        "test_0 - MYENV=foo",
        "test_0 - MYENV2=baz",
        "test_1 - MYENV=bar",
        "test_1 - MYENV2=baz",
        "test_2 - MYENV=bar",
        "test_2 - MYENV2=baz",
    )
    for line in expected_lines:
        assert line in logs

    # No further restart happened after the second failure
    assert "test_3" not in logs
Example #8
0
    def _build_specification(self, cluster, cert_path, key_path):
        """Assemble the ``skein.ApplicationSpec`` for one cluster.

        The scheduler runs as the application master; workers are a service
        named ``"dask.worker"`` that starts with zero instances. Both receive
        the same localized files, including the certificate and key.

        Args:
            cluster: object providing ``config`` and ``username``.
            cert_path: value stored under the ``"dask.crt"`` file key.
            key_path: value stored under the ``"dask.pem"`` file key.

        Returns:
            skein.ApplicationSpec: spec named ``"dask-gateway"``.
        """
        config = cluster.config

        # Entries may be plain dicts or ready-made File objects
        files = {}
        for name, entry in config.localize_files.items():
            if isinstance(entry, dict):
                entry = skein.File.from_dict(entry)
            files[name] = entry
        files.update({"dask.crt": cert_path, "dask.pem": key_path})

        scheduler_cmd = " ".join(self.get_scheduler_command(cluster))
        worker_args = self.get_worker_command(
            cluster,
            worker_name="$DASK_GATEWAY_WORKER_NAME",
            scheduler_address="$DASK_GATEWAY_SCHEDULER_ADDRESS",
        )
        worker_cmd = " ".join(worker_args)

        # Each script is the configured setup snippet followed by the command
        scheduler_script = f"{config.scheduler_setup}\n{scheduler_cmd}"
        worker_script = f"{config.worker_setup}\n{worker_cmd}"

        security = self._get_security(cluster)
        scheduler_resources = skein.Resources(
            memory="%d b" % config.scheduler_memory,
            vcores=config.scheduler_cores,
        )
        master = skein.Master(
            security=security,
            resources=scheduler_resources,
            files=files,
            env=self.get_scheduler_env(cluster),
            script=scheduler_script,
        )

        worker_resources = skein.Resources(
            memory="%d b" % config.worker_memory,
            vcores=config.worker_cores,
        )
        worker_service = skein.Service(
            resources=worker_resources,
            instances=0,
            max_restarts=0,
            allow_failures=True,
            files=files,
            env=self.get_worker_env(cluster),
            script=worker_script,
        )

        return skein.ApplicationSpec(
            name="dask-gateway",
            queue=config.queue,
            user=cluster.username,
            master=master,
            services={"dask.worker": worker_service},
        )
Example #9
0
def test_set_log_level(client):
    """Setting ``log_level='debug'`` on the master yields DEBUG log output."""
    service = skein.Service(resources=skein.Resources(memory=128, vcores=1),
                            script='ls')
    # The application is named after this test; it previously reused the
    # name "test_custom_log4j_properties" from a different test, which made
    # the two applications indistinguishable by name.
    spec = skein.ApplicationSpec(name="test_set_log_level",
                                 queue="default",
                                 master=skein.Master(log_level='debug'),
                                 services={'service': service})

    with run_application(client, spec=spec) as app:
        assert wait_for_completion(client, app.id) == 'SUCCEEDED'

    logs = get_logs(app.id)
    assert 'DEBUG' in logs
Example #10
0
def test_move_application(client):
    """Moving an application between queues works and fails cleanly.

    Covers a successful move plus the failure modes: non-leaf queue, unknown
    queue, finished application, unknown application id and malformed
    application id. Each failure must surface as ``ValueError``.
    """
    spec = skein.ApplicationSpec(
        name="test_move_application",
        queue="default",
        master=skein.Master(script="sleep infinity"),
    )

    def assert_good_message(msg):
        # Ensure message doesn't contain traceback
        assert 'org.apache.hadoop' not in str(msg)

    def failed_move(app_id, queue):
        # Attempt the move, require a ValueError, and hand back the exception
        with pytest.raises(ValueError) as exc:
            client.move_application(app_id, queue)
        return exc.value

    with run_application(client, spec=spec) as app:
        assert client.application_report(app.id).queue == "default"

        # Successful move
        client.move_application(app.id, "apples")
        assert client.application_report(app.id).queue == "apples"

        # Not a leaf queue
        error = failed_move(app.id, "fruit")
        assert 'Leaf' in str(error)
        assert_good_message(error)

        # Queue doesn't exist
        error = failed_move(app.id, "missing")
        assert "doesn't exist" in str(error)
        assert_good_message(error)

        app.shutdown()

    # App already shutdown
    error = failed_move(app.id, "default")
    assert "cannot be moved" in str(error)
    assert_good_message(error)

    # App doesn't exist
    missing_appid = 'application_1526134340424_0012'
    error = failed_move(missing_appid, "default")
    # This error message is different in Hadoop 3
    text = str(error)
    assert "absent" in text or "doesn't exist" in text
    assert_good_message(error)

    # Invalid application id
    error = failed_move("oh no", "default")
    assert "Invalid" in str(error)
Example #11
0
def test_custom_log4j_properties(client, tmpdir):
    """A log4j config passed via ``log_config`` is picked up by the master.

    The config text comes from the module-level ``custom_log4j_properties``;
    the test passes when its marker string shows up in the application logs.
    """
    config_file = str(tmpdir.join("log4j.properties"))

    spec = skein.ApplicationSpec(
        name="test_custom_log4j_properties",
        queue="default",
        master=skein.Master(log_config=config_file),
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=128, vcores=1),
                script='ls',
            ),
        },
    )

    # The file only has to exist by the time the application is submitted
    with open(config_file, 'w') as handle:
        handle.write(custom_log4j_properties)

    with run_application(client, spec=spec) as app:
        final_state = wait_for_completion(client, app.id)
        assert final_state == 'SUCCEEDED'

    assert 'CUSTOM-LOG4J-SUCCEEDED' in get_logs(app.id)
Example #12
0
def test_master_driver_foo(client, tmpdir):
    """A master-only application sees its configured env vars and files.

    The master script runs ``ls`` and ``env``; the logs must then contain the
    localized file name and the ``FOO=BAR`` variable.
    """
    local_file = str(tmpdir.join("dummy-file"))
    with open(local_file, 'w') as handle:
        handle.write('foobar')

    driver = skein.Master(
        script='ls\nenv',
        env={'FOO': 'BAR'},
        files={'myfile': local_file},
    )
    spec = skein.ApplicationSpec(name="test_master_driver", master=driver)

    with run_application(client, spec=spec, connect=False) as app_id:
        final_state = wait_for_completion(client, app_id)
        assert final_state == 'SUCCEEDED'

    logs = get_logs(app_id)
    for expected in ('FOO=BAR', 'myfile'):
        assert expected in logs
Example #13
0
def test_retries_succeeds(client):
    """An application that succeeds on its second attempt ends ``SUCCEEDED``.

    Skipped when ``pyarrow.hdfs`` is unavailable. Afterwards the
    ``.skein/<app id>`` directory under the test user's HDFS home must be gone.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')

    retry_script = test_retries_script_template.format(succeed_on='02')
    spec = skein.ApplicationSpec(
        name="test_application_retries_succeeds",
        max_attempts=2,
        master=skein.Master(script=retry_script),
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        final_state = wait_for_completion(client, app_id)
        assert final_state == 'SUCCEEDED'

    logs = get_logs(app_id)
    expected_messages = (
        'Failing on other attempts',
        'Application attempt 1 out of 2 failed, will retry',
        'Succeeding on attempt 02',
    )
    for message in expected_messages:
        assert message in logs

    app_dir = "/user/testuser/.skein/%s" % app_id
    assert not hdfs.connect().exists(app_dir)
Example #14
0
    def _build_specification(self, cluster_info, cert_path, key_path):
        """Assemble the ``skein.ApplicationSpec`` for one cluster.

        The scheduler runs as the application master; workers are a service
        named ``"dask.worker"`` that starts with zero instances. Both share
        the same localized files and the same environment.

        Args:
            cluster_info: object passed to ``get_env`` / ``_get_security`` and
                providing ``username``.
            cert_path: value stored under the ``"dask.crt"`` file key.
            key_path: value stored under the ``"dask.pem"`` file key.

        Returns:
            skein.ApplicationSpec: spec named ``"dask-gateway"``.
        """
        # Entries may be plain dicts or ready-made File objects
        files = {}
        for name, entry in self.localize_files.items():
            if isinstance(entry, dict):
                entry = skein.File.from_dict(entry)
            files[name] = entry
        files.update({"dask.crt": cert_path, "dask.pem": key_path})

        env = self.get_env(cluster_info)

        # Each script is the setup snippet followed by the actual command
        scheduler_script = "\n".join(
            (self.scheduler_setup, self.scheduler_command))
        worker_script = "\n".join((self.worker_setup, self.worker_command))

        security = self._get_security(cluster_info)
        scheduler_resources = skein.Resources(
            memory="%d b" % self.scheduler_memory,
            vcores=self.scheduler_cores,
        )
        master = skein.Master(
            security=security,
            resources=scheduler_resources,
            files=files,
            env=env,
            script=scheduler_script,
        )

        worker_resources = skein.Resources(
            memory="%d b" % self.worker_memory,
            vcores=self.worker_cores,
        )
        worker_service = skein.Service(
            resources=worker_resources,
            instances=0,
            max_restarts=0,
            allow_failures=True,
            files=files,
            env=env,
            script=worker_script,
        )

        return skein.ApplicationSpec(
            name="dask-gateway",
            queue=self.queue,
            user=cluster_info.username,
            master=master,
            services={"dask.worker": worker_service},
        )
Example #15
0
def test_retries_fails(client):
    """An application that never reaches its succeeding attempt ends ``FAILED``.

    Skipped when ``pyarrow.hdfs`` is unavailable. Afterwards the
    ``.skein/<app id>`` directory under the test user's HDFS home must be gone.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')

    # Global maximum is 2, checks that appmaster uses 2 instead of 10
    requested_attempts = 10

    retry_script = test_retries_script_template.format(succeed_on='03')
    spec = skein.ApplicationSpec(
        name="test_application_retries_fails",
        max_attempts=requested_attempts,
        master=skein.Master(script=retry_script),
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        final_state = wait_for_completion(client, app_id)
        assert final_state == 'FAILED'

    logs = get_logs(app_id)
    failure_count = logs.count('Failing on other attempts')
    assert failure_count == 2
    assert 'Application attempt 1 out of 2 failed' in logs

    app_dir = "/user/testuser/.skein/%s" % app_id
    assert not hdfs.connect().exists(app_dir)
Example #16
0
    async def post(self):
        """Validate an uploaded script, submit it as an application, redirect.

        Reads the ``queue``, ``memory`` (GiB) and ``vcores`` request arguments
        and the uploaded ``script`` file, builds a ``skein.ApplicationSpec``
        that runs the script with ``python`` on the application master,
        submits it in an executor thread, and finally redirects to the
        application's tracking URL.

        Raises:
            web.HTTPError: with status 400 when the script is missing, when
                ``memory``/``vcores`` are not numeric, or when they fall
                outside ``0.5 <= memory <= 8`` / ``1 <= vcores <= 4``.
        """
        # Extract request parameters
        queue = self.get_argument('queue') or 'default'
        try:
            memory = float(self.get_argument('memory'))
            vcores = int(self.get_argument('vcores'))
        except ValueError:
            # Malformed numbers are a client error, not a server failure
            raise web.HTTPError(
                400, reason="memory and vcores must be numeric") from None
        try:
            script = self.request.files['script'][0]
        except (IndexError, KeyError):
            raise web.HTTPError(400, reason="Missing script") from None

        # Check memory and vcores are in bounds. The chained comparison also
        # rejects NaN, which would slip through separate `<` / `>` checks.
        # HTTPError takes the status code first; the message goes in `reason`.
        if not 0.5 <= memory <= 8:
            raise web.HTTPError(400, reason="0.5 <= memory <= 8 required")
        if not 1 <= vcores <= 4:
            raise web.HTTPError(400, reason="1 <= vcores <= 4 required")

        # We need to write the script temporarily to disk so Skein can upload it
        with tempfile.NamedTemporaryFile() as f:
            f.write(script['body'])
            # Make sure the bytes are on disk before the path is read
            f.flush()

            # ** Construct the application specification **
            # Note that we specify the user as user logged in to the web page.
            # If kerberos authentication was used, this would match the user's
            # principal.
            resources = skein.Resources(memory="%f GiB" % memory,
                                        vcores=vcores)
            spec = skein.ApplicationSpec(
                name="pyscript",
                queue=queue,
                user=self.current_user,
                master=skein.Master(resources=resources,
                                    files={script['filename']: f.name},
                                    script="python %s" % script['filename']))

            # Submit the application and get a report. This must finish
            # inside the `with` block: the temporary file is deleted on exit.
            report = await ioloop.IOLoop.current().run_in_executor(
                None, self.submit_and_report, spec)

        # Redirect the user to the application's tracking url
        self.redirect(report.tracking_url)
Example #17
0
def test_submit_failure_removes_appdir(client):
    """A rejected submission leaves no new directory under ``.skein``.

    Skipped when ``pyarrow.hdfs`` is unavailable. The spec asks for 1000
    vcores so that submission raises ``skein.DriverError``.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')
    skein_dir = "/user/testuser/.skein"

    # Application with vcores > max
    oversized = skein.Resources(vcores=1000, memory=32)
    spec = skein.ApplicationSpec(
        name="test_submit_failure_removes_appdir",
        queue="default",
        master=skein.Master(resources=oversized,
                            script="echo 'should never run'"),
    )

    fs = hdfs.connect()
    listing_before = set(fs.ls(skein_dir))

    with pytest.raises(skein.DriverError):
        client.submit(spec)

    listing_after = set(fs.ls(skein_dir))

    # Application directory is cleaned up on failure
    assert listing_before == listing_after
Example #18
0
def test_container_environment(runon, client, has_kerberos_enabled):
    """Containers get the expected environment and can reach HDFS/YARN.

    ``runon`` selects whether the probe script runs in a service container
    (``'service'``) or on the application master (anything else). The
    expected login identity depends on ``has_kerberos_enabled``.
    """
    in_service = runon == 'service'

    # `set -e` makes the hdfs/yarn commands fail the container on error
    script = '\n'.join([
        'set -e',
        'env',
        'echo "LOGIN_ID=[$(whoami)]"',
        'hdfs dfs -touchz /user/testuser/test_container_permissions',
        'yarn application -list',
    ])
    container_kwargs = {
        'resources': skein.Resources(memory=512, vcores=1),
        'script': script,
    }

    if in_service:
        services = {'service': skein.Service(**container_kwargs)}
        master = None
    else:
        services = None
        master = skein.Master(**container_kwargs)

    spec = skein.ApplicationSpec(
        name="test_container_permissions_%s" % runon,
        queue="default",
        services=services,
        master=master,
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        final_state = wait_for_completion(client, app_id)
        assert final_state == 'SUCCEEDED'

    logs = get_logs(app_id)
    assert "USER=testuser" in logs
    assert 'SKEIN_APPMASTER_ADDRESS=' in logs
    assert 'SKEIN_APPLICATION_ID=%s' % app_id in logs
    if in_service:
        assert 'SKEIN_CONTAINER_ID=service_0' in logs
    assert 'SKEIN_RESOURCE_MEMORY=512' in logs
    assert 'SKEIN_RESOURCE_VCORES=1' in logs
    assert 'CLASSPATH' not in logs

    if not has_kerberos_enabled:
        assert "LOGIN_ID=[yarn]" in logs
        assert "HADOOP_USER_NAME" in logs
    else:
        assert "LOGIN_ID=[testuser]" in logs
        assert "HADOOP_USER_NAME" not in logs
Example #19
0
import pytest

import skein
from skein.exceptions import ConnectionError
from skein.test.conftest import run_application, wait_for_containers

requests = pytest.importorskip('requests')

# Service container that runs Python's SimpleHTTPServer module on port 8888.
simplehttp = skein.Service(
    resources=skein.Resources(memory=32, vcores=1),
    script='/usr/bin/python -m SimpleHTTPServer 8888',
)

# Application master that just idles.
master = skein.Master(
    resources=skein.Resources(memory=256, vcores=1),
    script="sleep infinity",
)

# Application specification shared by this module.
spec = skein.ApplicationSpec(
    name="test_webui",
    queue="default",
    master=master,
    services={'simplehttp': simplehttp},
)

LOGIN = '******'

# NOTE(review): entries look like (route, target URL, link name), with None
# meaning "no link name" - confirm against the code that consumes PAGES.
PAGES = [
    ('route-1', 'http://worker.example.com:8888', 'link 1'),
    ('route-2', 'http://worker.example.com:8888/tmp/', 'link 2'),
    ('route-3', 'http://worker.example.com:8888/container_tokens', 'link 3'),
    ('route-4', 'http://worker.example.com:8888', None),
]


def get_page(address, **kwargs):
    """Issue ``requests.get(address, **kwargs)`` with a default timeout.

    A ``timeout`` of 5 is applied unless the caller passes one, so that tests
    don't hang on error.
    """
    if 'timeout' not in kwargs:
        kwargs['timeout'] = 5
    return requests.get(address, **kwargs)


@pytest.fixture
Example #20
0
def test_dynamic_containers(client):
    """Exercise scaling and killing containers of a running application.

    Starts an application with one ``sleeper`` service instance and then
    walks through ``app.scale`` (by absolute count and by ``delta``),
    ``app.kill_container`` and ``app.get_containers``, checking the returned
    containers after each step. The steps build on each other (instance
    numbers keep increasing), so their order matters. The second half checks
    that invalid arguments raise ``ValueError``.
    """
    spec = skein.ApplicationSpec(name="test_dynamic_containers",
                                 services={
                                     'sleeper':
                                     skein.Service(instances=1,
                                                   resources=skein.Resources(
                                                       memory=32, vcores=1),
                                                   script='sleep infinity')
                                 },
                                 master=skein.Master(script='sleep infinity'))
    with run_application(client, spec=spec) as app:
        # The single initial instance comes up as part of `sleeper`
        initial = wait_for_containers(app, 1, states=['RUNNING'])
        assert initial[0].state == 'RUNNING'
        assert initial[0].service_name == 'sleeper'

        # Scale sleepers up to 3 containers
        new = app.scale('sleeper', 3)
        assert len(new) == 2
        for c in new:
            assert c.state == 'REQUESTED'
        wait_for_containers(app, 3, services=['sleeper'], states=['RUNNING'])

        # Scale down to 1 container
        stopped = app.scale('sleeper', 1)
        assert len(stopped) == 2
        # Stopped oldest 2 instances
        assert stopped[0].instance == 0
        assert stopped[1].instance == 1

        # Scale up to 2 containers
        new = app.scale('sleeper', 2)
        # Calling twice is no-op
        new2 = app.scale('sleeper', 2)
        assert len(new2) == 0
        # Instance numbers are not reused: 0-2 were taken, so the new one is 3
        assert new[0].instance == 3
        current = wait_for_containers(app,
                                      2,
                                      services=['sleeper'],
                                      states=['RUNNING'])
        assert current[0].instance == 2
        assert current[1].instance == 3

        # Manually kill instance 3
        app.kill_container('sleeper_3')
        current = app.get_containers()
        assert len(current) == 1
        assert current[0].instance == 2

        # Fine to kill already killed container
        app.kill_container('sleeper_1')

        # All killed containers
        killed = app.get_containers(states=['killed'])
        assert len(killed) == 3
        assert [c.instance for c in killed] == [0, 1, 3]
        # All completed containers have an exit message
        assert all(c.exit_message for c in killed)

        # Add containers by delta
        ncurrent = len(app.get_containers())
        new = app.scale('sleeper', delta=2)
        assert len(new) == 2
        assert len(app.get_containers()) == ncurrent + 2

        # Remove containers by delta
        ncurrent = len(app.get_containers())
        assert ncurrent >= 1
        res = app.scale('sleeper', delta=-1)
        assert len(res) == 1
        assert len(app.get_containers()) == ncurrent - 1

        # Removing more containers than active removes all containers
        ncurrent = len(app.get_containers())
        res = app.scale('sleeper', delta=-(ncurrent + 2))
        assert len(res) == ncurrent
        assert len(app.get_containers()) == 0

        # Can't specify both count and delta
        with pytest.raises(ValueError):
            app.scale('sleeper', count=2, delta=2)

        # Must specify either count or delta
        with pytest.raises(ValueError):
            app.scale('sleeper')

        # Can't scale non-existent service
        with pytest.raises(ValueError):
            app.scale('foobar', 2)

        # Can't scale negative
        with pytest.raises(ValueError):
            app.scale('sleeper', -5)

        # Can't kill non-existent container (unknown service name)
        with pytest.raises(ValueError):
            app.kill_container('foobar_1')

        # Can't kill non-existent container (unknown instance number)
        with pytest.raises(ValueError):
            app.kill_container('sleeper_500')

        # Invalid container id
        with pytest.raises(ValueError):
            app.kill_container('fooooooo')

        # Can't get containers for non-existent service
        with pytest.raises(ValueError):
            app.get_containers(services=['sleeper', 'missing'])

        app.shutdown()