def test_client_login_from_keytab(security, not_logged_in):
    with skein.Client(principal='testuser', keytab=KEYTAB_PATH,
                      security=security) as client:
        # login worked
        client.get_applications()

    # Improper principal/keytab pair
    with pytest.raises(skein.DriverError):
        skein.Client(principal='not_the_right_user',
                     keytab=KEYTAB_PATH,
                     security=security)

    # Keytab file missing
    with pytest.raises(FileNotFoundError):
        skein.Client(principal='testuser',
                     keytab='/not/a/real/path',
                     security=security)

    # Must specify both principal and keytab
    with pytest.raises(ValueError):
        skein.Client(principal='testuser', security=security)

    with pytest.raises(ValueError):
        skein.Client(keytab=KEYTAB_PATH, security=security)

def test_client(security, kinit, tmpdir):
    logpath = str(tmpdir.join("log.txt"))
    with skein.Client(security=security, log=logpath) as client:
        # smoketests
        client.get_applications()
        repr(client)

        client2 = skein.Client(address=client.address, security=security)
        assert client2._proc is None

        # smoketests
        client2.get_applications()
        repr(client2)

    # Process was definitely closed
    assert not pid_exists(client._proc.pid)

    # no-op to call close again
    client.close()

    # Log was written
    assert os.path.exists(logpath)
    with open(logpath) as fil:
        assert len(fil.read()) > 0

    # Connection error on closed client
    with pytest.raises(skein.ConnectionError):
        client2.get_applications()

    # Connection error on connecting to missing daemon
    with pytest.raises(skein.ConnectionError):
        skein.Client(address=client.address, security=security)

async def skein_client(principal=None, keytab=None):
    """Return a shared skein client object.

    Calls with the same principal & keytab will return the same client object
    (if one exists).
    """
    key = (principal, keytab)
    client = _skein_client_cache.get(key)
    if client is None:
        kwargs = dict(
            principal=principal,
            keytab=keytab,
            security=skein.Security.new_credentials(),
        )
        fut = get_running_loop().run_in_executor(
            None, lambda: skein.Client(**kwargs))
        # Save the future first so any concurrent calls will wait on the same
        # future for generating the client
        _skein_client_cache[key] = fut
        client = await fut
        # Replace the future now that the operation is done
        _skein_client_cache[key] = client
    elif asyncio.isfuture(client):
        client = await client
    return client

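A minimal usage sketch (not from the source) of the cached helper above: two concurrent callers with the same principal/keytab key await the same stored future, so only one `skein.Client` driver is started and both get the same object. The `submit_two_specs` name and the surrounding event loop are assumptions for illustration only.

async def submit_two_specs(spec_a, spec_b):
    # Both calls hit the same cache key (None, None); the second caller awaits
    # the future stored by the first, so a single driver process is shared.
    client_a, client_b = await asyncio.gather(skein_client(), skein_client())
    assert client_a is client_b
    return client_a.submit(spec_a), client_b.submit(spec_b)
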
def launch_remote_check(file: str) -> Tuple[bool, str]:
    logging.info('Launching remote check')
    zip_hdfs, _ = cluster_pack.upload_env(packer=cluster_pack.PEX_PACKER)
    archive_name = os.path.basename(zip_hdfs)
    with skein.Client() as client:
        files = {
            archive_name: zip_hdfs,
            'check_hadoop_env.py': __file__,
        }
        editable_packages = cluster_pack.get_editable_requirements()
        if 'tf_yarn' in editable_packages:
            tf_yarn_zip = cluster_pack.zip_path(editable_packages['tf_yarn'], False)
            logger.info(f"zip path for editable tf_yarn is {tf_yarn_zip}")
            files.update({'tf_yarn': tf_yarn_zip})
        service = skein.Service(
            script=f'./{archive_name} check_hadoop_env.py --file {file}',
            resources=skein.Resources(2 * 1024, 1),
            env={
                # f-string so each run gets its own PEX root directory
                'PEX_ROOT': f'/tmp/{uuid.uuid4()}/',
                'PYTHONPATH': '.:',
            },
            files=files,
            instances=1)
        spec = skein.ApplicationSpec(
            {'HADOOP_ENV_CHECKER': service},
            acls=skein.model.ACLs(enable=True, view_users=['*']),
        )
        app = client.submit_and_connect(spec)

        logging.info('Remote check started')
        result = app.kv.wait('result').decode()
        app_id = app.id
        app.shutdown()
        return result == "True", app_id

def test_client_errors_nicely_if_not_logged_in(security, not_logged_in):
    appid = 'application_1526134340424_0012'
    spec = skein.ApplicationSpec(
        name="should_never_get_to_run",
        queue="default",
        services={
            'service': skein.Service(
                resources=skein.Resources(memory=32, vcores=1),
                script='env')
        })

    with skein.Client(security=security) as client:
        for func, args in [('get_applications', ()),
                           ('get_nodes', ()),
                           ('get_queue', ('default',)),
                           ('get_child_queues', ('default',)),
                           ('get_all_queues', ()),
                           ('application_report', (appid,)),
                           ('connect', (appid,)),
                           ('move_application', (appid, 'default')),
                           ('kill_application', (appid,)),
                           ('submit', (spec,))]:
            with pytest.raises(skein.DriverError) as exc:
                getattr(client, func)(*args)
            assert 'kinit' in str(exc.value)

def run(self):
    if self.run_on_yarn:
        # Dump job and base as local json files for yarn_launcher
        job_name = f"job-{self.config.name}.json"
        with open(job_name, "w") as file:
            json.dump(self.job, file, indent=4)

        # Launch job on yarn
        pex_path = self.config.upload_pex_cpu()
        with skein.Client() as skein_client:
            LOGGER.info(f"Submitting job {self.config.name}")
            app_id = submit(
                skein_client=skein_client,
                module_name="deepr.cli.main",
                additional_files=[job_name],
                archive_hdfs=pex_path,
                args=["from_config", job_name, "-", "run"],
                env_vars=self.config.get_env_vars(),
                hadoop_file_systems=self.config.hadoop_file_systems,
                memory=self.config.memory,
                name=self.config.name,
                num_cores=self.config.num_cores,
            )
            report = skein_client.application_report(app_id)
            LOGGER.info(f"TRACKING_URL: {report.tracking_url}")
            mlflow.clear_run()
    else:
        LOGGER.info("Not running on yarn.")
        job = from_config(self.job)
        job.run()

def _get_skein_client(skein_client=None, security=None):
    if skein_client is None:
        # Silence warning about credentials not being written yet
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return skein.Client(security=security)
    return skein_client

def _setup_skein_cluster(pyenvs: Dict[NodeLabel, PythonEnvDescription],
                         task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
                         *,
                         skein_client: skein.Client = None,
                         files: Dict[str, str] = None,
                         env: Dict[str, str] = {},
                         queue: str = "default",
                         acls: ACLs = None,
                         file_systems: List[str] = None,
                         log_conf_file: str = None,
                         standalone_client_mode: bool = False) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.termination_timeout_seconds >= 0:
                _add_to_env(service_env, "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                            str(task_spec.termination_timeout_seconds))

            services[task_type] = skein.Service(
                script=gen_task_cmd(pyenv, log_conf_file),
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        spec = skein.ApplicationSpec(services,
                                     queue=queue,
                                     acls=acls,
                                     file_systems=file_systems)

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances) for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        cluster_spec = _setup_cluster_tasks(task_instances, app, standalone_client_mode)
        return SkeinCluster(skein_client, app, task_instances, cluster_spec,
                            event_listener, events)

def main():
    tf.logging.set_verbosity(tf.logging.DEBUG)

    print(sys.argv)
    if '--server' in sys.argv:
        create_cluster()
    else:
        with skein.Client() as client:
            client_tf(client)

def _check_merged_logs(app_id, key_word, result_status):
    with skein.Client() as client:
        logs = skein_launcher.get_application_logs(client, app_id, 2)
        merged_logs = ""
        for key, value in logs.items():
            merged_logs += value
            _logger.info(f"logs:{key} {value}")
        assert result_status
        assert key_word in merged_logs

def test_client_closed_when_reference_dropped(security, kinit):
    client = skein.Client(security=security, log=False)
    ref = weakref.ref(client)
    pid = client._proc.pid

    del client

    assert ref() is None
    assert not pid_exists(pid)

def stop(self, status='SUCCEEDED'):
    import skein

    try:
        skein_client = skein.Client()
        app_client = skein_client.connect(self._application_id)
        app_client.shutdown(status=status)
        if self._is_client_managed:
            self._skein_client.close()
    except skein.ApplicationNotRunningError:
        pass

def test_client_set_log_level(security, kinit, tmpdir):
    logpath = str(tmpdir.join("log.txt"))
    with skein.Client(security=security, log=logpath, log_level='debug') as client:
        # do an operation to ensure everything is working
        client.get_applications()

    with open(logpath) as fil:
        data = fil.read()

    assert 'DEBUG' in data

async def _get_client(self):
    key = (self.principal, self.keytab)
    client = type(self).clients.get(key)
    if client is None:
        kwargs = dict(principal=self.principal,
                      keytab=self.keytab,
                      security=skein.Security.new_credentials())
        client = await gen.IOLoop.current().run_in_executor(
            None, lambda: skein.Client(**kwargs))
        type(self).clients[key] = client
    return client

def test_client_starts_without_java_home(monkeypatch, tmpdir, security, kinit):
    monkeypatch.delenv('JAVA_HOME', raising=False)
    logpath = str(tmpdir.join("log.txt"))
    with skein.Client(security=security, log=logpath) as client:
        # do an operation to ensure everything is working
        client.get_applications()

    with open(logpath) as fil:
        data = fil.read()

    assert 'WARN' not in data
    assert 'native-hadoop' not in data

def launch_skein():
    with skein.Client() as client:
        service = skein.Service(
            resources=skein.model.Resources("1 GiB", 1),
            script=f'''
                set -x
                hdfs dfs -cat {filepath_on_hdfs}
            '''
        )
        spec = skein.ApplicationSpec(services={"service": service})
        app_id = client.submit(spec)

        skein_launcher.wait_for_finished(client, app_id)
        logs = skein_launcher.get_application_logs(client, app_id, 2)
        for key, value in logs.items():
            print(f"skein logs:{key} {value}")

def test_client_forward_java_options(use_env, security, kinit, tmpdir, monkeypatch):
    logpath = str(tmpdir.join("log.txt"))
    if use_env:
        monkeypatch.setenv('SKEIN_DRIVER_JAVA_OPTIONS', '-Dskein.log.level=debug')
        kwargs = {}
    else:
        kwargs = {'java_options': ['-Dskein.log.level=debug']}

    with skein.Client(security=security, log=logpath, **kwargs) as client:
        # do an operation to ensure everything is working
        client.get_applications()

    with open(logpath) as fil:
        data = fil.read()

    assert 'DEBUG' in data

def _submit_and_await_app_master(func, assert_result_status=True, assert_log_content=None):
    with skein.Client() as client:
        log_output_path = f"hdfs:///tmp/{uuid.uuid4()}.log"
        app_id = skein_launcher.submit_func(
            client,
            func=func,
            args=[],
            memory="2 GiB",
            process_logs=functools.partial(skein_launcher.upload_logs_to_hdfs,
                                           log_output_path))
        result = skein_launcher.wait_for_finished(client, app_id)

        fs, _ = filesystem.resolve_filesystem_and_path(log_output_path)
        with fs.open(log_output_path, "rb") as f:
            logs = f.read().decode()

        assert result == assert_result_status
        _logger.info(f"appmaster logs:\n{logs}")
        assert assert_log_content in logs

def _start_cluster(self, spec, skein_client=None):
    """Start the cluster and initialize state"""
    if skein_client is None:
        skein_client = skein.Client()

    app = skein_client.submit_and_connect(spec)
    try:
        scheduler_address = app.kv.wait('dask.scheduler').decode()
    except BaseException:
        # Failed to connect, kill the application and reraise
        skein_client.kill_application(app.id)
        raise

    # Ensure application gets cleaned up
    self._finalizer = weakref.finalize(self, app.shutdown)

    self.app_id = app.id
    self.application_client = app
    self.scheduler_address = scheduler_address

def status(app_id):
    report = skein.Client().application_report(app_id)
    header = ['application_id', 'name', 'state', 'status', 'containers',
              'vcores', 'memory', 'runtime']
    data = [(report.id, report.name, report.state, report.final_status,
             report.usage.num_used_containers,
             report.usage.used_resources.vcores,
             report.usage.used_resources.memory,
             humanize_timedelta(report.runtime))]
    print(format_table(header, data))

def from_application_id(cls, app_id, skein_client=None):
    """Connect to an existing ``YarnCluster`` with a given application id.

    Parameters
    ----------
    app_id : str
        The existing cluster's application id.
    skein_client : skein.Client
        The ``skein.Client`` to use. If not provided, one will be started.

    Returns
    -------
    YarnCluster
    """
    self = super(YarnCluster, cls).__new__(cls)

    if skein_client is None:
        skein_client = skein.Client()

    app = skein_client.connect(app_id)
    self._connect_existing(app)
    return self

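A hedged usage sketch for the classmethod above, assuming it is exposed on ``dask_yarn.YarnCluster`` (the application id string is a placeholder reused from the test snippet earlier, and the attribute read relies on ``scheduler_address`` being set during connection, as in ``_start_cluster`` above):

from dask_yarn import YarnCluster

# Reconnect to an already-running dask-yarn application by its YARN id
cluster = YarnCluster.from_application_id('application_1526134340424_0012')
print(cluster.scheduler_address)
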
from cluster_pack.skein import skein_config_builder, skein_launcher


if __name__ == "__main__":
    logging.basicConfig(level="INFO")

    package_path, _ = cluster_pack.upload_env()

    with tempfile.TemporaryDirectory() as tmp_dir:
        skein_config = skein_config_builder.build(
            module_name="skein_project.worker",
            package_path=package_path,
            tmp_dir=tmp_dir
        )

        with skein.Client() as client:
            service = skein.Service(
                resources=skein.model.Resources("1 GiB", 1),
                files=skein_config.files,
                script=skein_config.script
            )
            spec = skein.ApplicationSpec(services={"service": service})
            app_id = client.submit(spec)

            skein_launcher.wait_for_finished(client, app_id)
            logs = skein_launcher.get_application_logs(client, app_id, 2)
            if logs:
                for key, value in logs.items():
                    print(f"skein logs:{key} {value}")

def new_cluster(environment=None, scheduler_num=1, scheduler_cpu=None,
                scheduler_mem=None, worker_num=1, worker_cpu=None,
                worker_mem=None, worker_spill_paths=None, worker_cache_mem=None,
                min_worker_num=None, web_num=1, web_cpu=None, web_mem=None,
                timeout=None, log_config=None, skein_client=None, app_name=None,
                **kwargs):
    import skein
    from .web import YarnWebApplication

    def _override_envs(src, updates):
        ret = src.copy()
        ret.update(updates)
        return ret

    app_name = app_name or f'mars-app-{uuid.uuid4()}'

    log_when_fail = kwargs.pop('log_when_fail', False)

    scheduler_extra_modules = kwargs.pop('scheduler_extra_modules', None)
    worker_extra_modules = kwargs.pop('worker_extra_modules', None)
    web_extra_modules = kwargs.pop('web_extra_modules', None)

    cmd_tmpl = kwargs.pop('cmd_tmpl', None)

    extra_envs = kwargs.pop('extra_env', dict())
    scheduler_extra_env = _override_envs(
        extra_envs, kwargs.pop('scheduler_extra_env', dict()))
    worker_extra_env = _override_envs(extra_envs, kwargs.pop('worker_extra_env', dict()))
    web_extra_env = _override_envs(extra_envs, kwargs.pop('web_extra_env', dict()))

    extra_args = kwargs.pop('extra_args', '')
    scheduler_extra_args = (extra_args + ' ' + kwargs.pop('scheduler_extra_args', '')).strip()
    worker_extra_args = (extra_args + ' ' + kwargs.pop('worker_extra_args', '')).strip()
    web_extra_args = (extra_args + ' ' + kwargs.pop('web_extra_args', '')).strip()

    scheduler_log_config = kwargs.pop('scheduler_log_config', log_config)
    worker_log_config = kwargs.pop('worker_log_config', log_config)
    web_log_config = kwargs.pop('web_log_config', log_config)

    scheduler_config = MarsSchedulerConfig(instances=scheduler_num, environment=environment,
                                           cpu=scheduler_cpu, memory=scheduler_mem,
                                           modules=scheduler_extra_modules,
                                           env=scheduler_extra_env,
                                           log_config=scheduler_log_config,
                                           extra_args=scheduler_extra_args,
                                           cmd_tmpl=cmd_tmpl)
    worker_config = MarsWorkerConfig(instances=worker_num, environment=environment,
                                     cpu=worker_cpu, memory=worker_mem,
                                     spill_dirs=worker_spill_paths,
                                     worker_cache_mem=worker_cache_mem,
                                     modules=worker_extra_modules,
                                     env=worker_extra_env,
                                     log_config=worker_log_config,
                                     extra_args=worker_extra_args,
                                     cmd_tmpl=cmd_tmpl)
    web_config = MarsWebConfig(instances=web_num, environment=environment,
                               cpu=web_cpu, memory=web_mem,
                               modules=web_extra_modules,
                               env=web_extra_env,
                               log_config=web_log_config,
                               extra_args=web_extra_args,
                               cmd_tmpl=cmd_tmpl)
    app_config = MarsApplicationConfig(app_name, scheduler_config=scheduler_config,
                                       worker_config=worker_config, web_config=web_config)

    skein_client = skein_client or skein.Client()
    app_id = None
    try:
        is_client_managed = skein_client is not None
        app_id = skein_client.submit(app_config.build())

        check_start_time = time.time()
        while True:
            try:
                app_client = skein_client.connect(app_id)
                break
            except skein.ApplicationNotRunningError:  # pragma: no cover
                time.sleep(0.5)
                if time.time() - check_start_time > timeout:
                    raise

        logger.debug('Application client for %s at %s retrieved',
                     app_id, app_client.address)

        # wait until schedulers and expected num of workers are ready
        min_worker_num = int(min_worker_num or worker_num)
        limits = [scheduler_num, min_worker_num, web_num]
        services = [MarsSchedulerConfig.service_name, MarsWorkerConfig.service_name,
                    MarsWebConfig.service_name]
        wait_services_ready(services, limits,
                            lambda svc: _get_ready_container_count(app_client, svc),
                            timeout=timeout - (time.time() - check_start_time))

        web_endpoint_kv = app_client.kv.get_prefix(YarnWebApplication.service_name)
        web_endpoint = random.choice(
            [to_str(v).split('@', 1)[0] for v in web_endpoint_kv.values()])
        return YarnClusterClient(skein_client, app_client.id, 'http://' + web_endpoint,
                                 is_client_managed=is_client_managed)
    except:  # noqa: E722
        skein_client = skein.Client()
        try:
            if log_when_fail:
                if app_id is not None:
                    try:
                        app_client = skein_client.connect(app_id)
                        app_client.shutdown(status='FAILED')
                    except skein.ApplicationNotRunningError:
                        pass

                    try:
                        logs = skein_client.application_logs(app_id)
                        logger.error('Error when creating cluster:\n%s', logs.dumps())
                    except ValueError:
                        logger.error('Error when creating cluster and failed to get logs')
                else:
                    logger.error('Error when creating cluster and no logs from cluster')
        finally:
            if app_id is not None:
                skein_client.kill_application(app_id)
        raise

def skein_client(security, kinit):
    with skein.Client(security=security) as skein_client:
        yield skein_client

def _setup_skein_cluster(
    pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription],
    task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
    *,
    custom_task_module: Optional[str] = None,
    skein_client: skein.Client = None,
    files: Dict[str, str] = None,
    env: Dict[str, str] = {},
    queue: str = "default",
    acls: ACLs = None,
    file_systems: List[str] = None,
    name: str = "RunOnYarn",
    n_try: int = 0,
    pre_script_hook: Optional[str] = None
) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    pre_script_hook = pre_script_hook if pre_script_hook else ""
    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env, n_try)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.tb_termination_timeout_seconds >= 0:
                service_env["TB_TERMINATION_TIMEOUT_SECONDS"] = \
                    str(task_spec.tb_termination_timeout_seconds)
            if task_spec.tb_model_dir:
                service_env["TB_MODEL_DIR"] = str(task_spec.tb_model_dir)
            if task_spec.tb_extra_args:
                service_env["TB_EXTRA_ARGS"] = str(task_spec.tb_extra_args)

            services[task_type] = skein.Service(
                script=f'''
                    set -x
                    {pre_script_hook}
                    {_env.gen_task_cmd(pyenv, task_type, custom_task_module)}
                ''',
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        # on the cluster we don't ask again for delegation tokens
        if "HADOOP_TOKEN_FILE_LOCATION" in os.environ:
            file_systems = None

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name
        )

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances) for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in _internal.iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        return SkeinCluster(skein_client, app, task_instances, event_listener, events)

def skein_client():
    with skein.Client() as client:
        yield client

    print('Sent: %r' % message)

    data = await reader.read(100)
    print('Received: %r' % data.decode())

    writer.close()


async def echo_all(app, message):
    """Send and receive a message from all running echo servers"""
    # Loop through all registered server addresses
    for address in app.kv.get_prefix('address.').values():
        # Parse the host and port from the stored address
        host, port = address.decode().split(':')
        port = int(port)

        # Send the message to the echo server
        await tcp_echo_client(message, loop, host, port)


# Get the application id from the command-line args
app_id = sys.argv[1]

# Connect to the application
app = skein.Client().connect(app_id)

# Send message to every running echo server
loop = asyncio.get_event_loop()
loop.run_until_complete(echo_all(app, 'Hello World!'))
loop.close()

def new_cluster(
    environment=None,
    supervisor_num=1,
    supervisor_cpu=None,
    supervisor_mem=None,
    worker_num=1,
    worker_cpu=None,
    worker_mem=None,
    worker_spill_paths=None,
    worker_cache_mem=None,
    min_worker_num=None,
    timeout=None,
    log_config=None,
    skein_client=None,
    app_name=None,
    app_queue=None,
    **kwargs,
):
    import skein

    from .supervisor import YarnSupervisorCommandRunner

    def _override_envs(src, updates):
        ret = src.copy()
        ret.update(updates)
        return ret

    if worker_cpu is None or worker_mem is None:  # pragma: no cover
        raise TypeError("`worker_cpu` and `worker_mem` must be specified")

    app_name = app_name or f"mars-app-{uuid.uuid4()}"
    supervisor_mem = calc_size_by_str(supervisor_mem, None)
    worker_mem = calc_size_by_str(worker_mem, None)

    log_when_fail = kwargs.pop("log_when_fail", False)

    supervisor_extra_modules = kwargs.pop("supervisor_extra_modules", None)
    worker_extra_modules = kwargs.pop("worker_extra_modules", None)

    cmd_tmpl = kwargs.pop("cmd_tmpl", None)

    extra_envs = kwargs.pop("extra_env", dict())
    supervisor_extra_env = _override_envs(
        extra_envs, kwargs.pop("supervisor_extra_env", dict()))
    worker_extra_env = _override_envs(extra_envs, kwargs.pop("worker_extra_env", dict()))

    extra_args = kwargs.pop("extra_args", "")
    supervisor_extra_args = (extra_args + " " + kwargs.pop("supervisor_extra_args", "")).strip()
    worker_extra_args = (extra_args + " " + kwargs.pop("worker_extra_args", "")).strip()

    supervisor_log_config = kwargs.pop("supervisor_log_config", log_config)
    worker_log_config = kwargs.pop("worker_log_config", log_config)

    supervisor_config = MarsSupervisorConfig(
        instances=supervisor_num,
        environment=environment,
        cpu=supervisor_cpu,
        memory=supervisor_mem,
        modules=supervisor_extra_modules,
        env=supervisor_extra_env,
        log_config=supervisor_log_config,
        extra_args=supervisor_extra_args,
        cmd_tmpl=cmd_tmpl,
    )
    worker_config = MarsWorkerConfig(
        instances=worker_num,
        environment=environment,
        cpu=worker_cpu,
        memory=worker_mem,
        spill_dirs=worker_spill_paths,
        worker_cache_mem=worker_cache_mem,
        modules=worker_extra_modules,
        env=worker_extra_env,
        log_config=worker_log_config,
        extra_args=worker_extra_args,
        cmd_tmpl=cmd_tmpl,
    )
    app_config = MarsApplicationConfig(
        app_name,
        app_queue,
        supervisor_config=supervisor_config,
        worker_config=worker_config,
    )

    skein_client = skein_client or skein.Client()
    app_id = None
    try:
        is_client_managed = skein_client is not None
        app_id = skein_client.submit(app_config.build())

        check_start_time = time.time()
        while True:
            try:
                app_client = skein_client.connect(app_id)
                break
            except skein.ApplicationNotRunningError:  # pragma: no cover
                time.sleep(0.5)
                if timeout and time.time() - check_start_time > timeout:
                    raise

        logger.debug("Application client for %s at %s retrieved",
                     app_id, app_client.address)

        # wait until supervisors and expected num of workers are ready
        min_worker_num = int(min_worker_num or worker_num)
        limits = [supervisor_num, min_worker_num]
        services = [MarsSupervisorConfig.service_name, MarsWorkerConfig.service_name]
        wait_services_ready(
            services,
            limits,
            lambda svc: _get_ready_container_count(app_client, svc),
            timeout=None if not timeout else timeout - (time.time() - check_start_time),
        )

        web_endpoint_kv = app_client.kv.get_prefix(
            YarnSupervisorCommandRunner.web_service_name)
        web_endpoint = random.choice(
            [to_str(v).split("@", 1)[0] for v in web_endpoint_kv.values()])
        return YarnClusterClient(
            skein_client,
            app_client.id,
            web_endpoint,
            is_client_managed=is_client_managed,
        )
    except:  # noqa: E722
        skein_client = skein.Client()
        try:
            if log_when_fail:
                if app_id is not None:
                    try:
                        app_client = skein_client.connect(app_id)
                        app_client.shutdown(status="FAILED")
                    except skein.ApplicationNotRunningError:
                        pass

                    try:
                        logs = skein_client.application_logs(app_id)
                        logger.error("Error when creating cluster:\n%s", logs.dumps())
                    except ValueError:
                        logger.error("Error when creating cluster and failed to get logs")
                else:
                    logger.error("Error when creating cluster and no logs from cluster")
        finally:
            if app_id is not None:
                skein_client.kill_application(app_id)
        raise

def setup_skein_cluster(self,
                        task_specs: Dict[str, TaskSpec] = TASK_SPEC_NONE,
                        *,
                        files: Dict[str, str] = None,
                        env: Dict[str, str] = {},
                        log_conf_file: str = None) -> SkeinCluster:
    """Request a cluster on YARN with Skein.

    The implementation allocates a service with the requested number of
    instances for each distributed TensorFlow task type. Each instance expects
    a serialized run_config to set up the tensorflow servers and an experiment
    function to execute.

    Parameters
    ----------
    task_specs
        Resources to allocate for each task type. The keys must be a subset of
        ``"chief"``, ``"worker"``, ``"ps"``, and ``"evaluator"``. The minimal
        spec must contain at least ``"chief"``.

    files
        Local files or directories to upload to the container. The keys are
        the target locations of the resources relative to the container root,
        while the values are their corresponding local sources. Note that the
        container root is appended to ``PYTHONPATH``; therefore, any listed
        Python module or package is automatically importable.

    env
        Environment variables to forward to the containers.

    log_conf_file
        Optional file with log config. Logging is set up by default with INFO
        verbosity; if you specify a file here, don't forget to also ship it to
        the containers via the ``files`` arg.
    """
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = self.pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.termination_timeout_seconds >= 0:
                _add_to_env(service_env, "SERVICE_TERMINATION_TIMEOUT_SECONDS",
                            str(task_spec.termination_timeout_seconds))

            services[task_type] = skein.Service(
                commands=[gen_task_cmd(pyenv, log_conf_file)],
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        spec = skein.ApplicationSpec(services,
                                     queue=self.queue,
                                     file_systems=self.file_systems)

        try:
            client = skein.Client.from_global_daemon()
        except skein.exceptions.DaemonNotRunningError:
            client = skein.Client()

        task_instances = [(task_type, spec.instances) for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in iter_tasks(task_instances)}
        app = client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        cluster_spec = _setup_cluster_tasks(task_instances, app)
        return SkeinCluster(client, app, task_instances, cluster_spec,
                            event_listener, events)

def client(security, kinit):
    with skein.Client(security=security) as client:
        yield client

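A hedged example (not from the source) of a test consuming the ``client`` fixture above; pytest injects the fixture by parameter name, and the assertion only checks that the skein driver responds. The test name is illustrative.

def test_can_list_applications(client):
    # `get_applications` round-trips through the skein driver; an empty list
    # is fine, we only care that the call succeeds and returns a list.
    apps = client.get_applications()
    assert isinstance(apps, list)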