def test_container_environment(client, has_kerberos_enabled):
    """Check the environment and login identity seen inside a container.

    The service dumps its environment, echoes the ``whoami`` login id and
    touches a file in HDFS. The application logs are then searched for the
    expected variables.
    """
    svc = skein.Service(
        resources=skein.Resources(memory=124, vcores=1),
        commands=[
            'env',
            'echo "LOGIN_ID=[$(whoami)]"',
            'hdfs dfs -touchz /user/testuser/test_container_permissions',
        ],
    )
    app_spec = skein.ApplicationSpec(
        name="test_container_permissions",
        queue="default",
        services={'service': svc},
    )

    with run_application(client, spec=app_spec) as app:
        wait_for_success(client, app.id)

    output = get_logs(app.id)

    assert "USER=testuser" in output
    assert 'SKEIN_APPMASTER_ADDRESS=' in output
    assert 'SKEIN_APPLICATION_ID=%s' % app.id in output
    assert 'SKEIN_CONTAINER_ID=service_0' in output
    # 124 is requested but 128 is expected in the environment; presumably
    # the cluster rounds the request up -- confirm against the test cluster.
    assert 'SKEIN_RESOURCE_MEMORY=128' in output
    assert 'SKEIN_RESOURCE_VCORES=1' in output

    if not has_kerberos_enabled:
        assert "LOGIN_ID=[yarn]" in output
        assert "HADOOP_USER_NAME" in output
    else:
        assert "LOGIN_ID=[testuser]" in output
        assert "HADOOP_USER_NAME" not in output
def test_webui_acls(client, has_kerberos_enabled, ui_users, checks):
    """Verify that web UI ACLs admit exactly the expected users.

    ``ui_users`` is passed to ``skein.ACLs``; ``checks`` yields
    ``(user, ok)`` pairs giving the expected ``resp.ok`` for a request made
    as ``user``. Skipped when kerberos is enabled.
    """
    if has_kerberos_enabled:
        pytest.skip("Testing only implemented for simple authentication")

    sleeper = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        commands=['sleep infinity'],
    )
    app_spec = skein.ApplicationSpec(
        name="test_webui_acls",
        queue="default",
        acls=skein.ACLs(enable=True, ui_users=ui_users),
        services={'sleeper': sleeper},
    )

    with run_application(client, spec=app_spec) as app:
        # A single running container is enough to exercise the UI
        running = wait_for_containers(app, 1, states=['RUNNING'])
        first = running[0]
        assert first.state == 'RUNNING'
        assert first.service_name == 'sleeper'

        # The UI is reached through the resource manager proxy
        proxy_url = 'http://master.example.com:8088/proxy/%s' % app.id

        # Each user must be allowed or denied as listed in ``checks``
        for user, ok in checks:
            page = get_page(proxy_url + "?user.name=%s" % user)
            assert page.ok == ok

        app.shutdown()
def _make_submit_specification(script, args=(), **kwargs):
    """Build the application specification used to submit ``script``.

    Starts from ``_make_specification(**kwargs)``. When that specification
    contains a ``dask.scheduler`` service, a ``dask.client`` service is
    added which ships ``script`` and runs it with ``args``.

    Returns the (possibly extended) ``skein.ApplicationSpec``.
    """
    spec = _make_specification(**kwargs)
    environment = lookup(kwargs, "environment", "yarn.environment")
    files, build_script = _files_and_build_script(environment)

    # Without a scheduler service there is nothing to attach the client to
    if "dask.scheduler" not in spec.services:
        return spec

    # deploy_mode == 'remote'
    vcores = lookup(kwargs, "client_vcores", "yarn.client.vcores")
    raw_memory = lookup(kwargs, "client_memory", "yarn.client.memory")
    env = lookup(kwargs, "client_env", "yarn.client.env")
    memory = parse_memory(raw_memory, "client")

    # Localize the script under its base name next to the environment files
    script_name = os.path.basename(script)
    files[script_name] = script

    command = "services client %s %s" % (script_name, " ".join(args))
    spec.services["dask.client"] = skein.Service(
        instances=1,
        resources=skein.Resources(vcores=vcores, memory=memory),
        max_restarts=0,
        depends=["dask.scheduler"],
        files=files,
        env=env,
        script=build_script(command),
    )
    return spec
def test_proxy_user(client):
    """Run an application as proxy user ``alice`` and check it end to end.

    Verifies that the specification reports ``alice`` as the user, that
    localized files live under ``/user/alice`` in HDFS, that logs can be
    fetched as that user, and that the application directory is gone after
    the application is killed. Skipped when ``pyarrow.hdfs`` is missing.
    """
    hdfs = pytest.importorskip('pyarrow.hdfs')

    # The user literals were masked ("******"); every assertion below
    # requires the proxy user to be 'alice'.
    spec = skein.ApplicationSpec(
        name="test_proxy_user",
        user="alice",
        services={
            "service": skein.Service(
                resources=skein.Resources(memory=32, vcores=1),
                script="sleep infinity",
            )
        },
    )

    with run_application(client, spec=spec) as app:
        spec2 = app.get_specification()
        client.kill_application(app.id, user="alice")

    # Alice used throughout process
    assert spec2.user == 'alice'
    for fil in spec2.services['service'].files.values():
        assert fil.source.startswith(
            'hdfs://master.example.com:9000/user/alice')

    # Can get logs as user
    logs = get_logs(client, app.id, user="alice")
    assert app.id in logs
    assert "application.master.log" in logs

    # Application directory deleted after kill
    fs = hdfs.connect()
    assert not fs.exists("/user/testuser/.skein/%s" % app.id)
def test_fail_on_container_failure(client, with_restarts):
    """Check that a failing container fails the whole application.

    Two instances are started: ``test_0`` sleeps forever and every other
    container exits with status 1. With restarts enabled
    (``max_restarts=2``) containers ``test_2`` and ``test_3`` are expected
    in the logs; without restarts they must be absent. ``test_4`` must
    never appear.
    """
    script = ('if [[ "$SKEIN_CONTAINER_ID" != "test_0" ]]; then\n'
              ' exit 1\n'
              'else\n'
              ' sleep infinity\n'
              'fi')

    spec = skein.ApplicationSpec(
        name="test_fail_on_container_failure",
        services={
            'test': skein.Service(
                instances=2,
                max_restarts=2 if with_restarts else 0,
                resources=skein.Resources(memory=32, vcores=1),
                script=script,
            )
        },
    )

    with run_application(client, spec=spec) as app:
        # This comparison used to be a bare expression, so the final state
        # was never actually checked.
        assert wait_for_completion(client, app.id) == "FAILED"

    logs = get_logs(app.id)
    assert "test_0" in logs
    assert "test_1" in logs
    assert ("test_2" in logs) == with_restarts
    assert ("test_3" in logs) == with_restarts
    assert "test_4" not in logs
def test_node_locality(client, strict):
    """Check that locality settings round-trip through a running app.

    In strict mode a real worker host is requested with locality relaxation
    off; otherwise fake node and rack names are requested with relaxation
    on. The specification read back from the application must report the
    same ``nodes``, ``racks`` and ``relax_locality`` values.
    """
    relax_locality = not strict
    if relax_locality:
        nodes = ['not.a.real.host.name']
        racks = ['not.a.real.rack.name']
    else:
        nodes = ['worker.example.com']
        racks = []

    app_spec = skein.ApplicationSpec(
        name="test_node_locality",
        queue="default",
        services={
            "service": skein.Service(
                resources=skein.Resources(memory=128, vcores=1),
                script='sleep infinity',
                nodes=nodes,
                racks=racks,
                relax_locality=relax_locality,
            ),
        },
    )

    with run_application(client, spec=app_spec) as app:
        wait_for_containers(app, 1, states=['RUNNING'])
        remote_spec = app.get_specification()
        app.shutdown()

    remote_service = remote_spec.services['service']
    assert remote_service.nodes == nodes
    assert remote_service.racks == racks
    assert remote_service.relax_locality == relax_locality
def test_security_specified(client):
    """Run an application with its own, freshly generated credentials.

    Connecting with the matching credentials must work, connecting with the
    client's default credentials must raise ``skein.ConnectionError``, and
    the specification read back from the application must reference the
    credentials as HDFS files rather than inline bytes.
    """
    security = skein.Security.new_credentials()
    sleeper = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        commands=['sleep infinity'],
    )
    app_spec = skein.ApplicationSpec(
        name="test_security_specified",
        master=skein.Master(security=security),
        services={'sleeper': sleeper},
    )

    with run_application(client, spec=app_spec) as app:
        assert app.security is security
        assert app.security != client.security

        remote_spec = app.get_specification()

        # Matching credentials: a simple round trip must succeed
        good_conn = client.connect(app.id, security=security)
        good_conn.get_specification()

        # Default client credentials: communication must be refused
        bad_conn = client.connect(app.id)
        with pytest.raises(skein.ConnectionError):
            bad_conn.get_specification()

        app.shutdown()

    remote_security = remote_spec.master.security
    assert remote_security.cert_bytes is None
    assert remote_security.key_bytes is None
    assert remote_security.cert_file.source.startswith('hdfs')
    assert remote_security.key_file.source.startswith('hdfs')
def test_master_driver_shutdown_sequence(kind, master_cmd, service_cmd,
                                         client, tmpdir):
    """Check the final application state for each driver/service outcome.

    ``kind`` names the scenario. Scenarios whose name ends in ``succeeds``
    must finish as ``SUCCEEDED``, all others as ``FAILED``.
    """
    app_spec = skein.ApplicationSpec(
        name="test_master_driver_shutdown_sequence_%s" % kind,
        master=skein.Master(script=master_cmd),
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=128, vcores=1),
                script=service_cmd,
            ),
        },
    )

    if kind.endswith('succeeds'):
        expected = 'SUCCEEDED'
    else:
        expected = 'FAILED'

    if kind != 'service_succeeds':
        # service_fails results in immediate failure
        # driver_succeeds results in immediate success
        # driver_fails results in immediate failure
        with run_application(client, spec=app_spec, connect=False) as app_id:
            assert wait_for_completion(client, app_id) == expected
        return

    with run_application(client, spec=app_spec) as app:
        wait_for_containers(app, 1, states=['SUCCEEDED'])
        assert len(app.get_containers()) == 0
        # The application stays up until the driver completes, so it has
        # to be shut down explicitly
        app.shutdown()
        assert wait_for_completion(client, app.id) == expected
def test_client_errors_nicely_if_not_logged_in(security, not_logged_in):
    """Every client call must fail with a hint to run ``kinit``.

    Each listed client method is invoked and must raise
    ``skein.DriverError`` whose message contains ``kinit``.
    """
    appid = 'application_1526134340424_0012'

    app_spec = skein.ApplicationSpec(
        name="should_never_get_to_run",
        queue="default",
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=32, vcores=1),
                script='env',
            ),
        },
    )

    # (method name, positional arguments) for each call under test
    calls = [
        ('get_applications', ()),
        ('get_nodes', ()),
        ('get_queue', ('default', )),
        ('get_child_queues', ('default', )),
        ('get_all_queues', ()),
        ('application_report', (appid, )),
        ('connect', (appid, )),
        ('move_application', (appid, 'default')),
        ('kill_application', (appid, )),
        ('submit', (app_spec, )),
    ]

    with skein.Client(security=security) as client:
        for method_name, call_args in calls:
            method = getattr(client, method_name)
            with pytest.raises(skein.DriverError) as exc:
                method(*call_args)
            assert 'kinit' in str(exc.value)
def test_allow_failures_max_restarts(client, allow_failures):
    """Check how ``allow_failures`` interacts with ``max_restarts``.

    The service always exits with status 1 and may restart twice. With
    ``allow_failures`` the application must keep running after three failed
    containers and succeed once shut down manually; without it the
    application must end as ``FAILED``.
    """
    app_spec = skein.ApplicationSpec(
        name="test_max_restarts_allow_failures_%s" % str(allow_failures).lower(),
        master=skein.Master(script="sleep infinity"),
        services={
            'myservice': skein.Service(
                instances=1,
                max_restarts=2,
                allow_failures=allow_failures,
                resources=skein.Resources(memory=128, vcores=1),
                script="exit 1",
            ),
        },
    )

    with run_application(client, spec=app_spec) as app:
        if not allow_failures:
            # Service failed 3 times and then terminates
            assert wait_for_completion(client, app.id) == 'FAILED'
        else:
            # Three failures are recorded and no further containers start
            wait_for_containers(app, 3, states=['FAILED'])
            # The application must still answer requests shortly afterwards
            time.sleep(0.5)
            app.get_specification()
            # A manual shutdown ends the application successfully
            app.shutdown()
            assert wait_for_completion(client, app.id) == 'SUCCEEDED'
def launch_remote_check(file: str) -> Tuple[bool, str]:
    """Run ``check_hadoop_env.py`` in a skein application and report back.

    The current environment is uploaded as a PEX archive, this module is
    shipped as ``check_hadoop_env.py`` and, when ``tf_yarn`` is installed in
    editable mode, a zip of it is shipped as well. The function then waits
    for the ``result`` key in the application's key-value store.

    Args:
        file: value passed to the remote script through ``--file``.

    Returns:
        ``(ok, app_id)`` where ``ok`` is True when the remote result decodes
        to the string ``"True"`` and ``app_id`` is the application id.
    """
    import uuid

    logging.info('Launching remote check')
    zip_hdfs, _ = cluster_pack.upload_env(packer=cluster_pack.PEX_PACKER)
    archive_name = os.path.basename(zip_hdfs)
    with skein.Client() as client:
        files = {
            archive_name: zip_hdfs,
            'check_hadoop_env.py': __file__,
        }
        editable_packages = cluster_pack.get_editable_requirements()
        if 'tf_yarn' in editable_packages:
            tf_yarn_zip = cluster_pack.zip_path(editable_packages['tf_yarn'], False)
            logger.info(f"zip path for editable tf_yarn is {tf_yarn_zip}")
            files.update({'tf_yarn': tf_yarn_zip})
        service = skein.Service(
            script=f'./{archive_name} check_hadoop_env.py --file {file}',
            resources=skein.Resources(2 * 1024, 1),
            env={
                # Needs the f-prefix: without it the placeholder text was
                # used verbatim instead of a unique directory per run.
                'PEX_ROOT': f'/tmp/{uuid.uuid4()}/',
                'PYTHONPATH': '.:',
            },
            files=files,
            instances=1)
        spec = skein.ApplicationSpec(
            {'HADOOP_ENV_CHECKER': service},
            acls=skein.model.ACLs(enable=True, view_users=['*']),
        )
        app = client.submit_and_connect(spec)

        logging.info('Remote check started')
        # The value is compared as text; "True" means success
        result = app.kv.wait('result').decode()
        app_id = app.id
        app.shutdown()
        return result == "True", app_id
def test_memory_limit_exceeded(kind, client):
    """An over-allocating master or service must fail the application.

    ``kind`` selects whether the memory-hungry script runs as the
    application master (``'master'``) or as a service. The logs must
    contain the matching diagnostic text, and for the master case the
    application report diagnostics must mention the memory limit too.
    """
    limits = skein.Resources(memory=128, vcores=1)
    # Allocate noticeably more memory than the 128 MB limit
    hog = 'python -c "b = bytearray(int(256e6)); import time; time.sleep(10)"'

    on_master = kind == 'master'
    if on_master:
        master = skein.Master(resources=limits, script=hog)
        services = None
        search_txt = "memory limit"
    else:
        master = None
        services = {'service': skein.Service(resources=limits, script=hog)}
        search_txt = "memory used"

    app_spec = skein.ApplicationSpec(
        name="test_memory_limit_exceeded_%s" % kind,
        queue="default",
        master=master,
        services=services,
    )

    with run_application(client, spec=app_spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == "FAILED"

    assert search_txt in get_logs(app_id)

    if on_master:
        report = client.application_report(app_id)
        assert 'memory limit' in report.diagnostics
def _make_submit_specification(script, args=(), **kwargs):
    """Create the specification for submitting ``script`` with ``args``.

    The base specification comes from ``_make_specification(**kwargs)``.
    A ``dask.client`` service running the script is added only when the
    base specification has a ``dask.scheduler`` service.
    """
    spec = _make_specification(**kwargs)
    environment = lookup(kwargs, 'environment', 'yarn.environment')
    files, build_script = _files_and_build_script(environment)

    has_scheduler = 'dask.scheduler' in spec.services
    if has_scheduler:
        # deploy_mode == 'remote'
        client_vcores = lookup(kwargs, 'client_vcores', 'yarn.client.vcores')
        memory_setting = lookup(kwargs, 'client_memory', 'yarn.client.memory')
        client_env = lookup(kwargs, 'client_env', 'yarn.client.env')
        client_resources = skein.Resources(
            vcores=client_vcores,
            memory=parse_memory(memory_setting, 'client'),
        )

        # The script is shipped to the container under its base name
        script_name = os.path.basename(script)
        files[script_name] = script

        joined_args = ' '.join(args)
        client_script = build_script(
            'services client %s %s' % (script_name, joined_args)
        )

        spec.services['dask.client'] = skein.Service(
            instances=1,
            resources=client_resources,
            max_restarts=0,
            depends=['dask.scheduler'],
            files=files,
            env=client_env,
            script=client_script,
        )

    return spec
def _make_submit_specification(script, args=(), **kwargs):
    """Create the specification for submitting ``script`` with ``args``.

    A ``dask.client`` service is always added. It reuses the
    ``environment`` file of the ``dask.worker`` service, activates it and
    runs the script through ``dask-yarn services client``.
    """
    vcores = lookup(kwargs, 'client_vcores', 'yarn.client.vcores')
    memory_setting = lookup(kwargs, 'client_memory', 'yarn.client.memory')
    env = lookup(kwargs, 'client_env', 'yarn.client.env')
    memory = parse_memory(memory_setting, 'client')

    spec = _make_specification(**kwargs)
    # Share the environment archive already configured for the workers
    worker_environment = spec.services['dask.worker'].files['environment']

    script_name = os.path.basename(script)
    client_files = {'environment': worker_environment, script_name: script}
    run_script = 'dask-yarn services client %s %s' % (script_name,
                                                      ' '.join(args))

    spec.services['dask.client'] = skein.Service(
        instances=1,
        resources=skein.Resources(vcores=vcores, memory=memory),
        max_restarts=0,
        depends=['dask.scheduler'],
        files=client_files,
        env=env,
        commands=['source environment/bin/activate', run_script],
    )
    return spec
def _setup_skein_cluster(
        pyenvs: Dict[NodeLabel, PythonEnvDescription],
        task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
        *,
        skein_client: skein.Client = None,
        files: Dict[str, str] = None,
        env: Dict[str, str] = None,
        queue: str = "default",
        acls: ACLs = None,
        file_systems: List[str] = None,
        log_conf_file: str = None,
        standalone_client_mode: bool = False
) -> SkeinCluster:
    """Submit a skein application with one service per task type.

    Args:
        pyenvs: Python environment description for each node label.
        task_specs: specification (memory, vcores, instances, label,
            termination timeout) for each task type.
        skein_client: client used to submit; a new ``skein.Client`` is
            created when omitted.
        files: extra files, forwarded to ``_setup_task_env``.
        env: extra environment variables, forwarded to ``_setup_task_env``.
            Defaults to a fresh empty dict on every call.
        queue: queue the application is submitted to.
        acls: ACLs passed to the application specification.
        file_systems: file systems passed to the application specification.
        log_conf_file: forwarded to ``gen_task_cmd``.
        standalone_client_mode: forwarded to ``_setup_cluster_tasks``.

    Returns:
        A ``SkeinCluster`` bundling the client, the connected application,
        the task instances, the cluster spec, the event listener thread and
        the shared events dictionary.
    """
    # A `{}` default would be one dict shared by every call; create it here.
    if env is None:
        env = {}

    # Prepend JVM options for this process, keeping any already set
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env)
        services = {}
        for task_type, task_spec in task_specs.items():
            pyenv = pyenvs[task_spec.label]
            # Each service gets its own copy so per-task additions don't leak
            service_env = task_env.copy()
            if task_spec.termination_timeout_seconds >= 0:
                _add_to_env(service_env, "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                            str(task_spec.termination_timeout_seconds))
            services[task_type] = skein.Service(
                script=gen_task_cmd(pyenv, log_conf_file),
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        spec = skein.ApplicationSpec(services,
                                     queue=queue,
                                     acls=acls,
                                     file_systems=file_systems)

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, task_spec.instances)
                          for task_type, task_spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        cluster_spec = _setup_cluster_tasks(task_instances, app, standalone_client_mode)

        return SkeinCluster(skein_client, app, task_instances, cluster_spec,
                            event_listener, events)
def test_file_systems(client):
    """An app with an explicit ``file_systems`` entry can write to HDFS.

    The service touches a file in HDFS and the application is expected to
    finish as ``SUCCEEDED``.
    """
    toucher = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        script='hdfs dfs -touchz /user/testuser/test_file_systems',
    )
    app_spec = skein.ApplicationSpec(
        name="test_file_systems",
        queue="default",
        services={'service': toucher},
        file_systems=["hdfs://master.example.com:9000"],
    )

    with run_application(client, spec=app_spec) as app:
        final_state = wait_for_completion(client, app.id)
        assert final_state == 'SUCCEEDED'
def create_skein_app():
    """Build the application specification for the distributed example.

    Two instances of ``example.pex`` are started in ``--server`` mode, each
    with 2 GiB of memory and one vcore, in the ``dev`` queue.

    Returns:
        The ``skein.ApplicationSpec`` holding the single ``NODE_NAME``
        service.
    """
    import uuid

    service = skein.Service(
        commands=['./example.pex distributed.py --server'],
        resources=skein.Resources(2 * 1024, 1),
        # Needs the f-prefix: without it the placeholder text was used
        # verbatim instead of a unique directory.
        env={'PEX_ROOT': f'/tmp/{uuid.uuid4()}/'},
        files={
            'example.pex': 'example.pex',
            'distributed.py': __file__
        },
        instances=2)
    spec = skein.ApplicationSpec({NODE_NAME: service}, queue='dev')
    return spec
def test_add_container(client):
    """Add containers at runtime, with and without environment overrides.

    The service starts with zero instances. A container added without
    overrides must succeed; one added with ``MYENV=bar`` exits with status
    1, is restarted once (``max_restarts=1``) and then fails the
    application. The logs must show both variables for containers 0-2 and
    no ``test_3``.
    """
    script = ('echo "$SKEIN_CONTAINER_ID - MYENV=$MYENV"\n'
              'echo "$SKEIN_CONTAINER_ID - MYENV2=$MYENV2"\n'
              'if [[ "$MYENV" == "bar" ]]; then\n'
              ' exit 1\n'
              'else\n'
              ' exit 0\n'
              'fi')

    test_service = skein.Service(
        instances=0,
        resources=skein.Resources(memory=32, vcores=1),
        env={'MYENV': 'foo', 'MYENV2': 'baz'},
        max_restarts=1,
        script=script,
    )
    app_spec = skein.ApplicationSpec(
        name="test_add_container",
        master=skein.Master(script="sleep infinity"),
        services={'test': test_service},
    )

    with run_application(client, spec=app_spec) as app:
        # First container uses the service's default environment
        container = app.add_container('test')
        assert container.instance == 0

        wait_for_containers(app, 1, states=['RUNNING', 'SUCCEEDED'])

        # Unknown service names are rejected
        with pytest.raises(ValueError):
            app.add_container('foobar')

        # Second container overrides MYENV
        container = app.add_container('test', {'MYENV': 'bar'})
        assert container.instance == 1

        # The override makes the container fail; after the restart fails
        # as well the whole application fails
        assert wait_for_completion(client, app.id) == 'FAILED'

    logs = get_logs(app.id)
    expected_lines = (
        "test_0 - MYENV=foo",
        "test_0 - MYENV2=baz",
        "test_1 - MYENV=bar",
        "test_1 - MYENV2=baz",
        "test_2 - MYENV=bar",
        "test_2 - MYENV2=baz",
    )
    for line in expected_lines:
        assert line in logs
    assert "test_3" not in logs
def test_file_systems(client):
    """An app with an explicit ``file_systems`` entry can write to HDFS.

    The service touches a file in HDFS and the application must complete
    successfully.
    """
    toucher = skein.Service(
        resources=skein.Resources(memory=124, vcores=1),
        commands=['hdfs dfs -touchz /user/testuser/test_file_systems'],
    )
    app_spec = skein.ApplicationSpec(
        name="test_file_systems",
        queue="default",
        services={'service': toucher},
        file_systems=["hdfs://master.example.com:9000"],
    )

    with run_application(client, spec=app_spec) as app:
        wait_for_success(client, app.id)
def _build_specification(self, cluster, cert_path, key_path):
    """Assemble the ``skein.ApplicationSpec`` for a dask-gateway cluster.

    The scheduler runs as the application master and the workers as the
    ``dask.worker`` service, which starts with zero instances. Both receive
    the localized files from ``cluster.config`` plus the TLS certificate
    and key under ``dask.crt`` / ``dask.pem``.
    """
    config = cluster.config

    # Dict entries describe a skein.File; anything else is passed as is
    files = {}
    for name, value in config.localize_files.items():
        if isinstance(value, dict):
            files[name] = skein.File.from_dict(value)
        else:
            files[name] = value
    files["dask.crt"] = cert_path
    files["dask.pem"] = key_path

    scheduler_cmd = " ".join(self.get_scheduler_command(cluster))
    worker_args = self.get_worker_command(
        cluster,
        worker_name="$DASK_GATEWAY_WORKER_NAME",
        scheduler_address="$DASK_GATEWAY_SCHEDULER_ADDRESS",
    )
    worker_cmd = " ".join(worker_args)

    # Each script is the configured setup snippet followed by the command
    scheduler_script = f"{config.scheduler_setup}\n{scheduler_cmd}"
    worker_script = f"{config.worker_setup}\n{worker_cmd}"

    scheduler_resources = skein.Resources(
        memory="%d b" % config.scheduler_memory,
        vcores=config.scheduler_cores,
    )
    master = skein.Master(
        security=self._get_security(cluster),
        resources=scheduler_resources,
        files=files,
        env=self.get_scheduler_env(cluster),
        script=scheduler_script,
    )

    worker_resources = skein.Resources(
        memory="%d b" % config.worker_memory,
        vcores=config.worker_cores,
    )
    worker_service = skein.Service(
        resources=worker_resources,
        instances=0,
        max_restarts=0,
        allow_failures=True,
        files=files,
        env=self.get_worker_env(cluster),
        script=worker_script,
    )

    return skein.ApplicationSpec(
        name="dask-gateway",
        queue=config.queue,
        user=cluster.username,
        master=master,
        services={"dask.worker": worker_service},
    )
def test_set_log_level(client):
    """Setting the master log level to ``debug`` yields DEBUG log lines.

    The application runs a trivial service, must finish as ``SUCCEEDED``
    and its logs must contain the text ``DEBUG``.
    """
    service = skein.Service(resources=skein.Resources(memory=128, vcores=1),
                            script='ls')
    # The name used to be copy-pasted from test_custom_log4j_properties;
    # use this test's own name so the application is identifiable.
    spec = skein.ApplicationSpec(name="test_set_log_level",
                                 queue="default",
                                 master=skein.Master(log_level='debug'),
                                 services={'service': service})

    with run_application(client, spec=spec) as app:
        assert wait_for_completion(client, app.id) == 'SUCCEEDED'

    logs = get_logs(app.id)
    assert 'DEBUG' in logs
def test_memory_limit_exceeded(client):
    """A service allocating more than its memory limit fails the app.

    The application must end as ``FAILED`` and the logs must contain the
    text ``memory used``.
    """
    # Allocate noticeably more memory than the 128 MB limit
    hog = 'python -c "b = bytearray(int(256e6)); import time; time.sleep(10)"'

    app_spec = skein.ApplicationSpec(
        name="test_memory_limit_exceeded",
        queue="default",
        services={
            "service": skein.Service(
                resources=skein.Resources(memory=128, vcores=1),
                commands=[hog],
            ),
        },
    )

    with run_application(client, spec=app_spec) as app:
        final_state = wait_for_completion(client, app.id)
        assert final_state == "FAILED"

    assert "memory used" in get_logs(app.id)
def launch_skein():
    """Run ``hdfs dfs -cat`` on ``filepath_on_hdfs`` in a skein application.

    Submits a single-service application, waits for it to finish and prints
    the application logs returned by ``skein_launcher``.
    ``filepath_on_hdfs`` is read from the enclosing scope.
    """
    with skein.Client() as client:
        # One command per line: on a single line `set -x` would take the
        # hdfs command as its positional arguments instead of running it.
        service = skein.Service(
            resources=skein.model.Resources("1 GiB", 1),
            script=f'''
                set -x
                hdfs dfs -cat {filepath_on_hdfs}
            '''
        )
        spec = skein.ApplicationSpec(services={"service": service})
        app_id = client.submit(spec)
        skein_launcher.wait_for_finished(client, app_id)
        logs = skein_launcher.get_application_logs(client, app_id, 2)
        for key, value in logs.items():
            print(f"skein logs:{key} {value}")
def test_proxy_user_no_permissions(client):
    """Submitting as a user that may not be impersonated must fail.

    The submission is expected to raise ``skein.DriverError`` with a
    message that names both the real user (``testuser``) and the requested
    proxy user (``bob``).
    """
    # The user literal was masked ("******"); the assertions below require
    # the proxy user to be 'bob'.
    spec = skein.ApplicationSpec(
        name="test_proxy_user_no_permissions",
        user="bob",
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=128, vcores=1),
                commands=['env'],
            )
        },
    )

    # No permission to submit as user
    with pytest.raises(skein.DriverError) as exc:
        client.submit(spec)

    exc_msg = str(exc.value)
    assert 'testuser' in exc_msg
    assert 'bob' in exc_msg
def test_custom_log4j_properties(client, tmpdir):
    """A custom log4j configuration file is honoured by the master.

    ``custom_log4j_properties`` is written to a temporary file which is
    passed as the master's ``log_config``. The application must succeed and
    its logs must contain ``CUSTOM-LOG4J-SUCCEEDED``.
    """
    config_file = str(tmpdir.join("log4j.properties"))

    app_spec = skein.ApplicationSpec(
        name="test_custom_log4j_properties",
        queue="default",
        master=skein.Master(log_config=config_file),
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=128, vcores=1),
                script='ls',
            ),
        },
    )

    # The file only has to exist by the time the application is submitted
    with open(config_file, 'w') as handle:
        handle.write(custom_log4j_properties)

    with run_application(client, spec=app_spec) as app:
        final_state = wait_for_completion(client, app.id)
        assert final_state == 'SUCCEEDED'

    assert 'CUSTOM-LOG4J-SUCCEEDED' in get_logs(app.id)
def _build_specification(self, cluster_info, cert_path, key_path):
    """Assemble the ``skein.ApplicationSpec`` for a dask-gateway cluster.

    The scheduler runs as the application master and the workers as the
    ``dask.worker`` service, which starts with zero instances. Both share
    the same files and environment; the TLS certificate and key are added
    under ``dask.crt`` / ``dask.pem``.
    """
    # Dict entries describe a skein.File; anything else is passed as is
    files = {}
    for name, value in self.localize_files.items():
        files[name] = (
            skein.File.from_dict(value) if isinstance(value, dict) else value
        )
    files["dask.crt"] = cert_path
    files["dask.pem"] = key_path

    env = self.get_env(cluster_info)

    # Each script is the setup snippet followed by the command
    scheduler_script = "\n".join((self.scheduler_setup, self.scheduler_command))
    worker_script = "\n".join((self.worker_setup, self.worker_command))

    scheduler_resources = skein.Resources(
        memory="%d b" % self.scheduler_memory,
        vcores=self.scheduler_cores,
    )
    master = skein.Master(
        security=self._get_security(cluster_info),
        resources=scheduler_resources,
        files=files,
        env=env,
        script=scheduler_script,
    )

    worker_resources = skein.Resources(
        memory="%d b" % self.worker_memory,
        vcores=self.worker_cores,
    )
    worker_service = skein.Service(
        resources=worker_resources,
        instances=0,
        max_restarts=0,
        allow_failures=True,
        files=files,
        env=env,
        script=worker_script,
    )

    return skein.ApplicationSpec(
        name="dask-gateway",
        queue=self.queue,
        user=cluster_info.username,
        master=master,
        services={"dask.worker": worker_service},
    )
def test_proxy_user_no_permissions(client, hadoop3):
    """Submitting as a user that may not be impersonated must fail.

    The submission is expected to raise ``skein.DriverError`` with a
    message that names both the real user (``testuser``) and the requested
    proxy user (``bob``). Skipped on hadoop3.
    """
    if hadoop3:
        pytest.skip("Lack of proxyuser permissions causes "
                    "yarnclient to hang in hadoop3")

    # The user literal was masked ("******"); the assertions below require
    # the proxy user to be 'bob'.
    spec = skein.ApplicationSpec(
        name="test_proxy_user_no_permissions",
        user="bob",
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=32, vcores=1),
                script='env',
            )
        },
    )

    # No permission to submit as user
    with pytest.raises(skein.DriverError) as exc:
        client.submit(spec)

    exc_msg = str(exc.value)
    assert 'testuser' in exc_msg
    assert 'bob' in exc_msg
def test_container_permissions(client, has_kerberos_enabled):
    """Check which user a container runs as and that it can write to HDFS.

    ``$USER`` must always be ``testuser``. The ``whoami`` login id is
    expected to be ``testuser`` with kerberos and ``yarn`` without.
    """
    svc = skein.Service(
        resources=skein.Resources(memory=128, vcores=1),
        commands=[
            'echo "USER_ENV=[$USER]"',
            'echo "LOGIN_ID=[$(whoami)]"',
            'hdfs dfs -touchz /user/testuser/test_container_permissions',
        ],
    )
    app_spec = skein.ApplicationSpec(
        name="test_container_permissions",
        queue="default",
        services={'service': svc},
    )

    with run_application(client, spec=app_spec) as app:
        wait_for_success(app)

    output = get_logs(app.app_id)
    assert "USER_ENV=[testuser]" in output

    expected_login = "testuser" if has_kerberos_enabled else "yarn"
    assert "LOGIN_ID=[%s]" % expected_login in output
def test_container_environment(runon, client, has_kerberos_enabled):
    """Check the environment seen by a service or by the master script.

    ``runon`` selects where the script runs: ``'service'`` for a service
    container, anything else for the application master. The script dumps
    the environment, echoes the login id, touches a file in HDFS and lists
    the YARN applications; the logs are then searched for the expected
    variables.
    """
    script = ('set -e\n'
              'env\n'
              'echo "LOGIN_ID=[$(whoami)]"\n'
              'hdfs dfs -touchz /user/testuser/test_container_permissions\n'
              'yarn application -list')
    common = {
        'resources': skein.Resources(memory=512, vcores=1),
        'script': script,
    }

    on_service = runon == 'service'
    if on_service:
        services = {'service': skein.Service(**common)}
        master = None
    else:
        services = None
        master = skein.Master(**common)

    app_spec = skein.ApplicationSpec(
        name="test_container_permissions_%s" % runon,
        queue="default",
        services=services,
        master=master,
    )

    with run_application(client, spec=app_spec, connect=False) as app_id:
        assert wait_for_completion(client, app_id) == 'SUCCEEDED'

    output = get_logs(app_id)
    assert "USER=testuser" in output
    assert 'SKEIN_APPMASTER_ADDRESS=' in output
    assert 'SKEIN_APPLICATION_ID=%s' % app_id in output
    if on_service:
        # Only checked when the script runs in a service container
        assert 'SKEIN_CONTAINER_ID=service_0' in output
    assert 'SKEIN_RESOURCE_MEMORY=512' in output
    assert 'SKEIN_RESOURCE_VCORES=1' in output
    assert 'CLASSPATH' not in output

    if not has_kerberos_enabled:
        assert "LOGIN_ID=[yarn]" in output
        assert "HADOOP_USER_NAME" in output
    else:
        assert "LOGIN_ID=[testuser]" in output
        assert "HADOOP_USER_NAME" not in output
def _make_specification(**kwargs):
    """Create specification to run Dask Cluster

    Builds a ``skein.ApplicationSpec`` containing a ``dask.worker`` service
    and, when the deploy mode is ``remote``, a ``dask.scheduler`` service
    that the workers depend on. If no keyword overrides are given and
    ``yarn.specification`` is set in the dask configuration, that
    specification (a dict or a file path) is loaded and returned instead.

    See the docstring for ``YarnCluster`` for more details.

    Raises ``ValueError`` for an unknown ``deploy_mode`` or when no
    environment is configured.
    """
    no_overrides = all(value is None for value in kwargs.values())
    if no_overrides and dask.config.get("yarn.specification"):
        # No overrides and full specification in configuration
        configured = dask.config.get("yarn.specification")
        if isinstance(configured, dict):
            return skein.ApplicationSpec.from_dict(configured)
        return skein.ApplicationSpec.from_file(configured)

    deploy_mode = lookup(kwargs, "deploy_mode", "yarn.deploy-mode")
    if deploy_mode not in {"remote", "local"}:
        raise ValueError("`deploy_mode` must be one of {'remote', 'local'}, "
                         "got %r" % deploy_mode)

    name = lookup(kwargs, "name", "yarn.name")
    queue = lookup(kwargs, "queue", "yarn.queue")
    tags = lookup(kwargs, "tags", "yarn.tags")
    user = lookup(kwargs, "user", "yarn.user")

    environment = lookup(kwargs, "environment", "yarn.environment")
    if environment is None:
        raise ValueError(
            "You must provide a path to a Python environment for the workers.\n"
            "This may be one of the following:\n"
            "- A conda environment archived with conda-pack\n"
            "- A virtual environment archived with venv-pack\n"
            "- A path to a conda environment, specified as conda://...\n"
            "- A path to a virtual environment, specified as venv://...\n"
            "- A path to a python binary to use, specified as python://...\n"
            "\n"
            "See http://yarn.dask.org/environments.html for more information."
        )

    n_workers = lookup(kwargs, "n_workers", "yarn.worker.count")
    worker_restarts = lookup(kwargs, "worker_restarts", "yarn.worker.restarts")
    worker_env = lookup(kwargs, "worker_env", "yarn.worker.env")
    worker_vcores = lookup(kwargs, "worker_vcores", "yarn.worker.vcores")
    raw_worker_memory = lookup(kwargs, "worker_memory", "yarn.worker.memory")
    worker_memory = parse_memory(raw_worker_memory, "worker")

    services = {}
    files, build_script = _files_and_build_script(environment)

    if deploy_mode == "remote":
        # The scheduler runs in its own container; workers wait for it
        scheduler_vcores = lookup(kwargs, "scheduler_vcores",
                                  "yarn.scheduler.vcores")
        raw_scheduler_memory = lookup(kwargs, "scheduler_memory",
                                      "yarn.scheduler.memory")
        scheduler_memory = parse_memory(raw_scheduler_memory, "scheduler")

        services["dask.scheduler"] = skein.Service(
            instances=1,
            resources=skein.Resources(vcores=scheduler_vcores,
                                      memory=scheduler_memory),
            max_restarts=0,
            files=files,
            script=build_script("services scheduler"),
        )
        worker_depends = ["dask.scheduler"]
    else:
        worker_depends = None

    services["dask.worker"] = skein.Service(
        instances=n_workers,
        resources=skein.Resources(vcores=worker_vcores, memory=worker_memory),
        max_restarts=worker_restarts,
        depends=worker_depends,
        files=files,
        env=worker_env,
        script=build_script("services worker"),
    )

    return skein.ApplicationSpec(
        name=name,
        queue=queue,
        tags=tags,
        user=user,
        services=services,
    )