def test_memory_limit_exceeded(kind, client):
    """Check that an application fails when a container exceeds its memory.

    ``kind`` selects where the over-allocating script runs: ``'master'``
    runs it in the application master, any other value runs it in a
    service container.
    """
    limits = skein.Resources(memory=128, vcores=1)
    # Allocate noticeably more memory than the 128 MB limit
    hog = 'python -c "b = bytearray(int(256e6)); import time; time.sleep(10)"'

    on_master = kind == 'master'
    if on_master:
        master = skein.Master(resources=limits, script=hog)
        services = None
        expected_text = "memory limit"
    else:
        master = None
        services = {'service': skein.Service(resources=limits, script=hog)}
        expected_text = "memory used"

    spec = skein.ApplicationSpec(
        name="test_memory_limit_exceeded_%s" % kind,
        queue="default",
        master=master,
        services=services,
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == "FAILED"

    assert expected_text in get_logs(app_id)

    if on_master:
        # The failure reason is also surfaced in the application report
        diagnostics = client.application_report(app_id).diagnostics
        assert 'memory limit' in diagnostics
def test_security_specified(client):
    """Check that explicitly supplied credentials are used by the application.

    Connections made with the supplied credentials must work, connections
    made with the client's default credentials must be rejected, and the
    specification reported by the running application must reference the
    credentials as remote files rather than inline bytes.
    """
    credentials = skein.Security.new_credentials()
    master = skein.Master(security=credentials, script='sleep infinity')
    spec = skein.ApplicationSpec(name="test_security_specified", master=master)

    with run_application(client, spec=spec) as app:
        assert app.security is credentials
        assert app.security != client.security

        remote_spec = app.get_specification()

        # Smoketest, can communicate
        authorized = client.connect(app.id, security=credentials)
        authorized.get_specification()

        # Improper security credentials
        unauthorized = client.connect(app.id)
        with pytest.raises(skein.ConnectionError):
            unauthorized.get_specification()

        app.shutdown()

    remote_security = remote_spec.master.security
    assert remote_security.cert_bytes is None
    assert remote_security.key_bytes is None
    for remote_file in (remote_security.cert_file, remote_security.key_file):
        assert remote_file.source.startswith('hdfs')
def _build_specification(self):
    """Build the skein ``ApplicationSpec`` used to launch the single-user server.

    The master script is rendered from ``self.script_template`` and runs
    with freshly generated security credentials.

    Returns:
        skein.ApplicationSpec: a spec named ``'jupyterhub'`` with a single
        master and no services.
    """
    rendered_script = self.script_template.format(
        prologue=self.prologue,
        singleuser_command=self.singleuser_command,
        epilogue=self.epilogue,
    )

    limits = skein.Resources(
        memory='%d b' % self.mem_limit,
        vcores=self.cpu_limit,
    )

    credentials = skein.Security.new_credentials()

    # Support dicts as well as File objects
    localized = {}
    for name, entry in self.localize_files.items():
        if isinstance(entry, dict):
            entry = skein.File.from_dict(entry)
        localized[name] = entry

    master = skein.Master(
        resources=limits,
        files=localized,
        env=self.get_env(),
        script=rendered_script,
        security=credentials,
    )

    return skein.ApplicationSpec(
        name='jupyterhub',
        queue=self.queue,
        user=self.user.name,
        master=master,
        delegation_token_providers=self.delegation_token_providers,
    )
def test_master_driver_shutdown_sequence(kind, master_cmd, service_cmd,
                                         client, tmpdir):
    """Check the final application state for each driver/service outcome.

    ``kind`` names the scenario; scenarios whose name ends in ``succeeds``
    are expected to finish ``SUCCEEDED``, all others ``FAILED``.
    ``tmpdir`` is accepted but not used in this body.
    """
    service = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        script=service_cmd,
    )
    spec = skein.ApplicationSpec(
        name="test_master_driver_shutdown_sequence_%s" % kind,
        master=skein.Master(script=master_cmd),
        services={'service': service},
    )

    if kind.endswith('succeeds'):
        expected_state = 'SUCCEEDED'
    else:
        expected_state = 'FAILED'

    if kind != 'service_succeeds':
        with run_application(client, spec=spec, connect=False) as app_id:
            # service_fails results in immediate failure
            # driver_succeeds results in immediate success
            # driver_fails results in immediate failure
            assert wait_for_completion(client, app_id) == expected_state
        return

    with run_application(client, spec=spec) as app:
        wait_for_containers(app, 1, states=['SUCCEEDED'])
        assert len(app.get_containers()) == 0
        # App hangs around until driver completes
        app.shutdown()
        assert wait_for_completion(client, app.id) == expected_state
def test_allow_failures_max_restarts(client, allow_failures):
    """Check how exhausting ``max_restarts`` interacts with ``allow_failures``.

    The service always exits with status 1. With ``allow_failures`` set
    the application keeps running after the restarts are used up;
    otherwise the whole application fails.
    """
    failing_service = skein.Service(
        instances=1,
        max_restarts=2,
        allow_failures=allow_failures,
        resources=skein.Resources(memory=128, vcores=1),
        script="exit 1",
    )
    spec = skein.ApplicationSpec(
        name="test_max_restarts_allow_failures_%s" % str(allow_failures).lower(),
        master=skein.Master(script="sleep infinity"),
        services={'myservice': failing_service},
    )

    with run_application(client, spec=spec) as app:
        if not allow_failures:
            # Service failed 3 times and then terminates
            assert wait_for_completion(client, app.id) == 'FAILED'
        else:
            # Service failed 3 times, doesn't keep trying to run more
            wait_for_containers(app, 3, states=['FAILED'])
            # Check still running fine after 3 failures
            time.sleep(0.5)
            app.get_specification()
            # Shutdown manually
            app.shutdown()
            assert wait_for_completion(client, app.id) == 'SUCCEEDED'
def test_hadoop3_resource(client):
    """Check that requesting a GPU resource is rejected at submission.

    Submission must raise ``ValueError``. The expected message depends on
    the module-level ``HADOOP3`` flag.
    """
    resources = skein.Resources(memory='32 MiB', vcores=1, gpus=1)
    spec = skein.ApplicationSpec(
        name="test_hadoop3_resources",
        master=skein.Master(resources=resources, script="sleep infinity"),
    )

    with pytest.raises(ValueError) as exc:
        client.submit(spec)

    message = str(exc.value)
    if HADOOP3:
        assert "Resource 'yarn.io/gpu'" in message
    else:
        # Previously this asserted a bare string literal, which is always
        # truthy, so the non-Hadoop-3 branch could never fail.
        assert "Custom resources not supported" in message
def test_add_container(client):
    """Check adding containers to a service, with and without env overrides.

    The service script echoes ``MYENV``/``MYENV2`` and exits non-zero when
    ``MYENV`` is ``bar``, so a container added with that override fails.
    """
    script = '\n'.join([
        'echo "$SKEIN_CONTAINER_ID - MYENV=$MYENV"',
        'echo "$SKEIN_CONTAINER_ID - MYENV2=$MYENV2"',
        'if [[ "$MYENV" == "bar" ]]; then',
        ' exit 1',
        'else',
        ' exit 0',
        'fi',
    ])
    service = skein.Service(
        instances=0,
        resources=skein.Resources(memory=32, vcores=1),
        env={'MYENV': 'foo', 'MYENV2': 'baz'},
        max_restarts=1,
        script=script,
    )
    spec = skein.ApplicationSpec(
        name="test_add_container",
        master=skein.Master(script="sleep infinity"),
        services={'test': service},
    )

    with run_application(client, spec=spec) as app:
        # Add a container using the service's own environment
        container = app.add_container('test')
        assert container.instance == 0
        wait_for_containers(app, 1, states=['RUNNING', 'SUCCEEDED'])

        # Non-existent service
        with pytest.raises(ValueError):
            app.add_container('foobar')

        # Add container with override for MYENV
        container = app.add_container('test', {'MYENV': 'bar'})
        assert container.instance == 1

        # The new env var triggers a failure, should fail twice,
        # then fail the whole application
        assert wait_for_completion(client, app.id) == 'FAILED'

    logs = get_logs(app.id)
    expected_lines = (
        "test_0 - MYENV=foo",
        "test_0 - MYENV2=baz",
        "test_1 - MYENV=bar",
        "test_1 - MYENV2=baz",
        "test_2 - MYENV=bar",
        "test_2 - MYENV2=baz",
    )
    for line in expected_lines:
        assert line in logs
    assert "test_3" not in logs
def _build_specification(self, cluster, cert_path, key_path):
    """Build the skein ``ApplicationSpec`` for a dask-gateway cluster.

    The scheduler runs as the application master; workers are declared as
    the ``dask.worker`` service with zero initial instances.

    Args:
        cluster: object exposing ``config`` and ``username``.
        cert_path: localized into the containers as ``dask.crt``.
        key_path: localized into the containers as ``dask.pem``.

    Returns:
        skein.ApplicationSpec: the spec named ``"dask-gateway"``.
    """
    # Support dicts as well as File objects
    files = {}
    for name, entry in cluster.config.localize_files.items():
        if isinstance(entry, dict):
            entry = skein.File.from_dict(entry)
        files[name] = entry
    files["dask.crt"] = cert_path
    files["dask.pem"] = key_path

    scheduler_cmd = " ".join(self.get_scheduler_command(cluster))
    worker_args = self.get_worker_command(
        cluster,
        worker_name="$DASK_GATEWAY_WORKER_NAME",
        scheduler_address="$DASK_GATEWAY_SCHEDULER_ADDRESS",
    )
    worker_cmd = " ".join(worker_args)

    # Each script is the configured setup snippet followed by the command
    scheduler_script = f"{cluster.config.scheduler_setup}\n{scheduler_cmd}"
    worker_script = f"{cluster.config.worker_setup}\n{worker_cmd}"

    master = skein.Master(
        security=self._get_security(cluster),
        resources=skein.Resources(
            memory="%d b" % cluster.config.scheduler_memory,
            vcores=cluster.config.scheduler_cores,
        ),
        files=files,
        env=self.get_scheduler_env(cluster),
        script=scheduler_script,
    )

    worker_service = skein.Service(
        resources=skein.Resources(
            memory="%d b" % cluster.config.worker_memory,
            vcores=cluster.config.worker_cores,
        ),
        instances=0,
        max_restarts=0,
        allow_failures=True,
        files=files,
        env=self.get_worker_env(cluster),
        script=worker_script,
    )

    return skein.ApplicationSpec(
        name="dask-gateway",
        queue=cluster.config.queue,
        user=cluster.username,
        master=master,
        services={"dask.worker": worker_service},
    )
def test_set_log_level(client):
    """Check that ``log_level='debug'`` on the master yields DEBUG output.

    Runs a trivial ``ls`` service to completion and looks for ``DEBUG``
    in the application logs.
    """
    service = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        script='ls',
    )
    # The application name used to be copy-pasted from
    # test_custom_log4j_properties; use this test's own name so the two
    # applications can be told apart in YARN.
    spec = skein.ApplicationSpec(
        name="test_set_log_level",
        queue="default",
        master=skein.Master(log_level='debug'),
        services={'service': service},
    )

    with run_application(client, spec=spec) as app:
        assert wait_for_completion(client, app.id) == 'SUCCEEDED'

    logs = get_logs(app.id)
    assert 'DEBUG' in logs
def test_move_application(client):
    """Check moving an application between queues and the error cases.

    Covers a successful move, a non-leaf queue, a missing queue, an
    already shut down application, an unknown application id and a
    malformed application id.
    """
    spec = skein.ApplicationSpec(
        name="test_move_application",
        queue="default",
        master=skein.Master(script="sleep infinity"),
    )

    def failed_move(app_id, queue):
        """Attempt the move, require a ValueError, and return it."""
        with pytest.raises(ValueError) as exc:
            client.move_application(app_id, queue)
        return exc.value

    def assert_no_traceback(err):
        # Ensure message doesn't contain traceback
        assert 'org.apache.hadoop' not in str(err)

    with run_application(client, spec=spec) as app:
        assert client.application_report(app.id).queue == "default"

        # Successful move
        client.move_application(app.id, "apples")
        assert client.application_report(app.id).queue == "apples"

        # Not a leaf queue
        err = failed_move(app.id, "fruit")
        assert 'Leaf' in str(err)
        assert_no_traceback(err)

        # Queue doesn't exist
        err = failed_move(app.id, "missing")
        assert "doesn't exist" in str(err)
        assert_no_traceback(err)

        app.shutdown()

    # App already shutdown
    err = failed_move(app.id, "default")
    assert "cannot be moved" in str(err)
    assert_no_traceback(err)

    # App doesn't exist
    err = failed_move('application_1526134340424_0012', "default")
    # This error message is different in Hadoop 3
    text = str(err)
    assert "absent" in text or "doesn't exist" in text
    assert_no_traceback(err)

    # Invalid application id
    err = failed_move("oh no", "default")
    assert "Invalid" in str(err)
def test_custom_log4j_properties(client, tmpdir):
    """Check that a user-supplied log4j config is used by the master.

    Writes the module-level ``custom_log4j_properties`` text to a
    temporary file, passes it as ``log_config`` and looks for the marker
    ``CUSTOM-LOG4J-SUCCEEDED`` in the application logs.
    """
    log_config_path = str(tmpdir.join("log4j.properties"))

    service = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        script='ls',
    )
    spec = skein.ApplicationSpec(
        name="test_custom_log4j_properties",
        queue="default",
        master=skein.Master(log_config=log_config_path),
        services={'service': service},
    )

    # The config file only needs to exist by the time the app is submitted
    with open(log_config_path, 'w') as config_file:
        config_file.write(custom_log4j_properties)

    with run_application(client, spec=spec) as app:
        assert wait_for_completion(client, app.id) == 'SUCCEEDED'

    assert 'CUSTOM-LOG4J-SUCCEEDED' in get_logs(app.id)
def test_master_driver_foo(client, tmpdir):
    """Check that a master-only application sees its env vars and files.

    The master script runs ``ls`` and ``env``; the logs must show both the
    ``FOO=BAR`` variable and the localized file name ``myfile``.
    """
    local_path = str(tmpdir.join("dummy-file"))
    with open(local_path, 'w') as handle:
        handle.write('foobar')

    master = skein.Master(
        script='ls\nenv',
        env={'FOO': 'BAR'},
        files={'myfile': local_path},
    )
    spec = skein.ApplicationSpec(name="test_master_driver", master=master)

    with run_application(client, spec=spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == 'SUCCEEDED'

    logs = get_logs(app_id)
    for expected in ('FOO=BAR', 'myfile'):
        assert expected in logs
def test_retries_succeeds(client):
    """Check that an application succeeding on its second attempt succeeds.

    Skipped when ``pyarrow.hdfs`` is unavailable. Also checks that the
    application's ``.skein`` directory on HDFS no longer exists afterwards.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')

    script = test_retries_script_template.format(succeed_on='02')
    spec = skein.ApplicationSpec(
        name="test_application_retries_succeeds",
        max_attempts=2,
        master=skein.Master(script=script),
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == 'SUCCEEDED'

    logs = get_logs(app_id)
    expected_messages = (
        'Failing on other attempts',
        'Application attempt 1 out of 2 failed, will retry',
        'Succeeding on attempt 02',
    )
    for message in expected_messages:
        assert message in logs

    filesystem = hdfs.connect()
    assert not filesystem.exists("/user/testuser/.skein/%s" % app_id)
def _build_specification(self, cluster_info, cert_path, key_path):
    """Build the skein ``ApplicationSpec`` for a dask-gateway cluster.

    The scheduler runs as the application master; workers are declared as
    the ``dask.worker`` service with zero initial instances. Both share
    the same localized files and environment.

    Args:
        cluster_info: object passed to ``get_env``/``_get_security`` and
            exposing ``username``.
        cert_path: localized into the containers as ``dask.crt``.
        key_path: localized into the containers as ``dask.pem``.

    Returns:
        skein.ApplicationSpec: the spec named ``"dask-gateway"``.
    """
    # Support dicts as well as File objects
    files = {}
    for name, entry in self.localize_files.items():
        if isinstance(entry, dict):
            entry = skein.File.from_dict(entry)
        files[name] = entry
    files["dask.crt"] = cert_path
    files["dask.pem"] = key_path

    env = self.get_env(cluster_info)

    # Each script is the setup snippet followed by the command
    scheduler_script = "\n".join((self.scheduler_setup, self.scheduler_command))
    worker_script = "\n".join((self.worker_setup, self.worker_command))

    scheduler_resources = skein.Resources(
        memory="%d b" % self.scheduler_memory,
        vcores=self.scheduler_cores,
    )
    master = skein.Master(
        security=self._get_security(cluster_info),
        resources=scheduler_resources,
        files=files,
        env=env,
        script=scheduler_script,
    )

    worker_resources = skein.Resources(
        memory="%d b" % self.worker_memory,
        vcores=self.worker_cores,
    )
    worker_service = skein.Service(
        resources=worker_resources,
        instances=0,
        max_restarts=0,
        allow_failures=True,
        files=files,
        env=env,
        script=worker_script,
    )

    return skein.ApplicationSpec(
        name="dask-gateway",
        queue=self.queue,
        user=cluster_info.username,
        master=master,
        services={"dask.worker": worker_service},
    )
def test_retries_fails(client):
    """Check that an application failing on every allowed attempt fails.

    Skipped when ``pyarrow.hdfs`` is unavailable. Also checks that the
    application's ``.skein`` directory on HDFS no longer exists afterwards.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')

    # Global maximum is 2, checks that appmaster uses 2 instead of 10
    requested_attempts = 10
    script = test_retries_script_template.format(succeed_on='03')
    spec = skein.ApplicationSpec(
        name="test_application_retries_fails",
        max_attempts=requested_attempts,
        master=skein.Master(script=script),
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == 'FAILED'

    logs = get_logs(app_id)
    failure_count = logs.count('Failing on other attempts')
    assert failure_count == 2
    assert 'Application attempt 1 out of 2 failed' in logs

    filesystem = hdfs.connect()
    assert not filesystem.exists("/user/testuser/.skein/%s" % app_id)
async def post(self):
    """Submit an uploaded script as a YARN application and redirect to it.

    Reads the ``queue``, ``memory`` (GiB) and ``vcores`` request arguments
    and the uploaded ``script`` file, builds a skein application spec that
    runs the script with ``python`` in the application master, submits it
    via ``self.submit_and_report`` on an executor thread, then redirects
    the client to the application's tracking URL.

    Raises:
        web.HTTPError: 400 if the script upload is missing, if ``memory``
            or ``vcores`` is not numeric, or if either is out of bounds.
    """
    # Extract request parameters
    queue = self.get_argument('queue') or 'default'
    try:
        memory = float(self.get_argument('memory'))
        vcores = int(self.get_argument('vcores'))
    except ValueError:
        # Malformed numbers are a client error, not a server error
        raise web.HTTPError(400, reason="memory and vcores must be numeric")

    try:
        script = self.request.files['script'][0]
    except (IndexError, KeyError):
        raise web.HTTPError(400, reason="Missing script")

    # Check memory and vcores are in bounds. HTTPError takes the status
    # code first, so the message must go in ``reason``. The chained
    # comparison also rejects NaN, which passes separate < / > checks.
    if not 0.5 <= memory <= 8:
        raise web.HTTPError(400, reason="0.5 <= memory <= 8 required")
    if not 1 <= vcores <= 4:
        raise web.HTTPError(400, reason="1 <= vcores <= 4 required")

    # We need to write the script temporarily to disk so Skein can upload it
    with tempfile.NamedTemporaryFile() as f:
        f.write(script['body'])
        f.file.flush()

        # ** Construct the application specification **
        # Note that we specify the user as user logged in to the web page.
        # If kerberos authentication was used, this would match the user's
        # principal.
        spec = skein.ApplicationSpec(
            name="pyscript",
            queue=queue,
            user=self.current_user,
            master=skein.Master(
                resources=skein.Resources(memory="%f GiB" % memory,
                                          vcores=vcores),
                files={script['filename']: f.name},
                script="python %s" % script['filename'],
            ),
        )

        # Submit the application and get a report. This stays inside the
        # ``with`` block so the temporary file still exists while it runs.
        report = await ioloop.IOLoop.current().run_in_executor(
            None, self.submit_and_report, spec)

    # Redirect the user to the application's tracking url
    self.redirect(report.tracking_url)
def test_submit_failure_removes_appdir(client):
    """Check that a failed submission leaves no application directory behind.

    Skipped when ``pyarrow.hdfs`` is unavailable. Compares the listing of
    ``/user/testuser/.skein`` before and after a submission that raises
    ``skein.DriverError``.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')
    skein_dir = "/user/testuser/.skein"

    # Application with vcores > max
    oversized = skein.Resources(vcores=1000, memory=32)
    spec = skein.ApplicationSpec(
        name="test_submit_failure_removes_appdir",
        queue="default",
        master=skein.Master(resources=oversized,
                            script="echo 'should never run'"),
    )

    filesystem = hdfs.connect()
    listing_before = set(filesystem.ls(skein_dir))

    with pytest.raises(skein.DriverError):
        client.submit(spec)

    listing_after = set(filesystem.ls(skein_dir))

    # Application directory is cleaned up on failure
    assert listing_before == listing_after
def test_container_environment(runon, client, has_kerberos_enabled):
    """Check the environment and permissions seen inside a container.

    ``runon`` selects whether the probe script runs in a service
    (``'service'``) or in the application master (any other value). The
    expected login user and ``HADOOP_USER_NAME`` presence depend on
    ``has_kerberos_enabled``.
    """
    script = '\n'.join([
        'set -e',
        'env',
        'echo "LOGIN_ID=[$(whoami)]"',
        'hdfs dfs -touchz /user/testuser/test_container_permissions',
        'yarn application -list',
    ])
    container_kwargs = {
        'resources': skein.Resources(memory=512, vcores=1),
        'script': script,
    }

    on_service = runon == 'service'
    if on_service:
        services = {'service': skein.Service(**container_kwargs)}
        master = None
    else:
        services = None
        master = skein.Master(**container_kwargs)

    spec = skein.ApplicationSpec(
        name="test_container_permissions_%s" % runon,
        queue="default",
        services=services,
        master=master,
    )

    with run_application(client, spec=spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == 'SUCCEEDED'

    logs = get_logs(app_id)
    assert "USER=testuser" in logs
    assert 'SKEIN_APPMASTER_ADDRESS=' in logs
    assert 'SKEIN_APPLICATION_ID=%s' % app_id in logs
    if on_service:
        assert 'SKEIN_CONTAINER_ID=service_0' in logs
    assert 'SKEIN_RESOURCE_MEMORY=512' in logs
    assert 'SKEIN_RESOURCE_VCORES=1' in logs
    assert 'CLASSPATH' not in logs

    if not has_kerberos_enabled:
        assert "LOGIN_ID=[yarn]" in logs
        assert "HADOOP_USER_NAME" in logs
    else:
        assert "LOGIN_ID=[testuser]" in logs
        assert "HADOOP_USER_NAME" not in logs
import pytest
import skein
from skein.exceptions import ConnectionError
from skein.test.conftest import run_application, wait_for_containers

# Skip this module's tests when ``requests`` is not installed
requests = pytest.importorskip('requests')

# Service that starts an HTTP file server on port 8888.
# NOTE(review): ``SimpleHTTPServer`` is the Python 2 module name, so this
# presumably relies on /usr/bin/python being Python 2 - confirm.
simplehttp = skein.Service(resources=skein.Resources(memory=32, vcores=1),
                           script='/usr/bin/python -m SimpleHTTPServer 8888')

# Master that idles so the application stays up
master = skein.Master(resources=skein.Resources(memory=256, vcores=1),
                      script="sleep infinity")

# Application specification shared by the tests in this module
spec = skein.ApplicationSpec(name="test_webui",
                             queue="default",
                             master=master,
                             services={'simplehttp': simplehttp})

LOGIN = '******'

# NOTE(review): presumably (route, target address, link name) triples, with
# None meaning "no link name" - confirm against the code that consumes PAGES.
PAGES = [('route-1', 'http://worker.example.com:8888', 'link 1'),
         ('route-2', 'http://worker.example.com:8888/tmp/', 'link 2'),
         ('route-3', 'http://worker.example.com:8888/container_tokens', 'link 3'),
         ('route-4', 'http://worker.example.com:8888', None)]


def get_page(address, **kwargs):
    """GET ``address`` with ``requests``, defaulting ``timeout`` to 5.

    Extra keyword arguments are forwarded to ``requests.get``; an explicit
    ``timeout`` passed by the caller is left untouched.
    """
    # Set timeout on all requests so tests don't hang on error
    kwargs.setdefault('timeout', 5)
    return requests.get(address, **kwargs)


# Decorates the definition that follows this snippet
@pytest.fixture
def test_dynamic_containers(client):
    """Exercise scaling, killing and listing service containers at runtime.

    Starts one ``sleeper`` container, then scales by absolute count and by
    delta, kills containers by id, and checks that invalid scale, kill and
    list requests raise ``ValueError``.
    """
    sleeper = skein.Service(
        instances=1,
        resources=skein.Resources(memory=32, vcores=1),
        script='sleep infinity',
    )
    spec = skein.ApplicationSpec(
        name="test_dynamic_containers",
        services={'sleeper': sleeper},
        master=skein.Master(script='sleep infinity'),
    )

    with run_application(client, spec=spec) as app:
        started = wait_for_containers(app, 1, states=['RUNNING'])
        first = started[0]
        assert first.state == 'RUNNING'
        assert first.service_name == 'sleeper'

        # Scale sleepers up to 3 containers
        requested = app.scale('sleeper', 3)
        assert len(requested) == 2
        for container in requested:
            assert container.state == 'REQUESTED'
        wait_for_containers(app, 3, services=['sleeper'], states=['RUNNING'])

        # Scale down to 1 container
        stopped = app.scale('sleeper', 1)
        assert len(stopped) == 2
        # Stopped oldest 2 instances
        assert stopped[0].instance == 0
        assert stopped[1].instance == 1

        # Scale up to 2 containers
        added = app.scale('sleeper', 2)
        # Calling twice is no-op
        added_again = app.scale('sleeper', 2)
        assert len(added_again) == 0
        assert added[0].instance == 3
        running = wait_for_containers(app, 2, services=['sleeper'],
                                      states=['RUNNING'])
        assert running[0].instance == 2
        assert running[1].instance == 3

        # Manually kill instance 3
        app.kill_container('sleeper_3')
        running = app.get_containers()
        assert len(running) == 1
        assert running[0].instance == 2

        # Fine to kill already killed container
        app.kill_container('sleeper_1')

        # All killed containers
        killed = app.get_containers(states=['killed'])
        assert len(killed) == 3
        assert [c.instance for c in killed] == [0, 1, 3]
        # All completed containers have an exit message
        assert all(c.exit_message for c in killed)

        # Add containers by delta
        count_before = len(app.get_containers())
        added = app.scale('sleeper', delta=2)
        assert len(added) == 2
        assert len(app.get_containers()) == count_before + 2

        # Remove containers by delta
        count_before = len(app.get_containers())
        assert count_before >= 1
        removed = app.scale('sleeper', delta=-1)
        assert len(removed) == 1
        assert len(app.get_containers()) == count_before - 1

        # Removing more containers than active removes all containers
        count_before = len(app.get_containers())
        removed = app.scale('sleeper', delta=-(count_before + 2))
        assert len(removed) == count_before
        assert len(app.get_containers()) == 0

        # Can't specify both count and delta
        with pytest.raises(ValueError):
            app.scale('sleeper', count=2, delta=2)

        # Must specify either count or delta
        with pytest.raises(ValueError):
            app.scale('sleeper')

        # Can't scale non-existent service
        with pytest.raises(ValueError):
            app.scale('foobar', 2)

        # Can't scale negative
        with pytest.raises(ValueError):
            app.scale('sleeper', -5)

        # Can't kill non-existent container
        with pytest.raises(ValueError):
            app.kill_container('foobar_1')

        with pytest.raises(ValueError):
            app.kill_container('sleeper_500')

        # Invalid container id
        with pytest.raises(ValueError):
            app.kill_container('fooooooo')

        # Can't get containers for non-existent service
        with pytest.raises(ValueError):
            app.get_containers(services=['sleeper', 'missing'])

        app.shutdown()