def _execute_and_await_termination(
    skein_cluster: SkeinCluster,
    serialized_fn: bytes,
    eval_monitor_log_thresholds: Optional[Dict[str, Tuple[float, float]]] = None,
    n_try: int = 0,
    poll_every_secs: int = 10
) -> Optional[metrics.Metrics]:
    """Publish the experiment function to the cluster and await termination.

    Stores ``serialized_fn`` in the skein key-value store so tasks can fetch
    and run it, then polls the YARN application report every
    ``poll_every_secs`` seconds. While the application runs, evaluator
    metrics and the TensorBoard URL are logged; once a final status is
    reported, task events and container logs are collected, pushed to
    MLflow, and the run metrics are returned.

    Raises:
        RunFailed: if the application's final status is ``"failed"``.
    """
    skein_cluster.app.kv[constants.KV_EXPERIMENT_FN] = serialized_fn
    # Periodically logs metrics for every evaluator task of the run.
    eval_metrics_logger = evaluator_metrics.EvaluatorMetricsLogger(
        [task for task in _internal.iter_tasks(skein_cluster.tasks)
         if task.startswith('evaluator')],
        skein_cluster.app,
        eval_monitor_log_thresholds)

    tensorboard_url_event_name = tensorboard.url_event_name(
        _internal.iter_tasks(skein_cluster.tasks))
    # Logs the TensorBoard URL exactly once, as soon as it is published;
    # no-op when the run has no tensorboard task.
    tensorboard_url_logger = metrics.OneShotMetricsLogger(
        skein_cluster.app,
        [(tensorboard_url_event_name, tensorboard.URL_EVENT_LABEL)]
        if tensorboard_url_event_name else [],
        n_try)
    state = None
    while True:
        report = skein_cluster.client.application_report(skein_cluster.app.id)
        logger.info(
            f"Application report for {skein_cluster.app.id} (state: {report.state})"
        )
        # Only dump the full report when the YARN state transitions,
        # to keep the poll loop's log output readable.
        if state != report.state:
            logger.info(_format_app_report(report))

        if report.final_status != "undefined":
            # Application reached a terminal state: drain events and logs.
            skein_cluster.event_listener.join()
            log_events, result_metrics, container_status = _handle_events(
                skein_cluster.events, n_try)
            logger.info(log_events)

            containers = container_status.by_container_id()
            # add one for AM container
            wait_for_nb_logs = sum(
                [instances for task, instances in skein_cluster.tasks]) + 1
            logs = _get_app_logs(
                skein_cluster.client, skein_cluster.app, wait_for_nb_logs)
            _save_logs_to_mlflow(logs, containers, n_try)

            if report.final_status == "failed":
                raise RunFailed
            else:
                break
        else:
            # Still running: emit periodic metrics, then sleep until next poll.
            eval_metrics_logger.log()
            tensorboard_url_logger.log()
            time.sleep(poll_every_secs)
        state = report.state

    result_metrics.log_mlflow(n_try)
    return result_metrics
def _setup_cluster_spec(
    task_instances: List[Tuple[str, int]],
    app: skein.ApplicationClient
) -> tf.train.ClusterSpec:
    """Publish the cluster membership to the kv store and build a ClusterSpec.

    Evaluator and tensorboard tasks are excluded: they are not part of the
    distributed TensorFlow cluster.
    """
    excluded = ('evaluator', 'tensorboard')
    cluster_instances = []
    for instance in task_instances:
        if instance[0] not in excluded:
            cluster_instances.append(instance)
    # Expose the filtered membership so containers can discover their peers.
    app.kv[constants.KV_CLUSTER_INSTANCES] = json.dumps(cluster_instances).encode()
    task_names = list(_internal.iter_tasks(cluster_instances))
    return tf.train.ClusterSpec(cluster.aggregate_spec(app, task_names))
def _prepare_container(
) -> Tuple[skein.ApplicationClient, Dict[str, List[str]], List[str]]:
    """Bootstrap the current task container.

    Logs interpreter/library versions, connects back to the skein
    application master, sets up container log forwarding, waits for the
    cluster membership to be published, and starts the cluster.

    Returns:
        The skein client, the cluster spec dict, and the list of cluster tasks.
    """
    tf.logging.info("Python " + sys.version)
    tf.logging.info("Skein " + skein.__version__)
    tf.logging.info(f"TensorFlow {tf.GIT_VERSION} {tf.VERSION}")

    client = skein.ApplicationClient.from_current()
    _setup_container_logs(client)
    # Blocks until the driver publishes the cluster membership in the kv store.
    raw_instances = client.kv.wait(KV_CLUSTER_INSTANCES).decode()
    cluster_tasks = list(iter_tasks(json.loads(raw_instances)))
    cluster_spec = cluster.start_cluster(client, cluster_tasks)
    return client, cluster_spec, cluster_tasks
def _setup_cluster_spec(
    task_instances: List[Tuple[str, int]],
    app: skein.ApplicationClient,
    standalone_client_mode: bool
) -> tf.train.ClusterSpec:
    """Publish the cluster membership to the kv store and build a ClusterSpec.

    Evaluator and tensorboard tasks never join the distributed TensorFlow
    cluster; in standalone client mode the chief stays out as well.
    """
    excluded = ['evaluator', 'tensorboard']
    # In standalone client mode the chief is also not part of the cluster
    if standalone_client_mode:
        excluded.append('chief')

    cluster_instances = []
    for instance in task_instances:
        if instance[0] not in excluded:
            cluster_instances.append(instance)
    # Expose the filtered membership so containers can discover their peers.
    app.kv[constants.KV_CLUSTER_INSTANCES] = json.dumps(cluster_instances).encode()
    task_names = list(_internal.iter_tasks(cluster_instances))
    return tf.train.ClusterSpec(cluster.aggregate_spec(app, task_names))
def test__prepare_container():
    """_prepare_container wires the skein client, log setup and cluster start."""
    with contextlib.ExitStack() as stack:
        # mock modules
        patched_from_current = stack.enter_context(
            patch(f"{MODULE_TO_TEST}.skein.ApplicationClient.from_current"))
        patched_setup_logs = stack.enter_context(
            patch(f'{MODULE_TO_TEST}._setup_container_logs'))
        patched_start_cluster = stack.enter_context(
            patch(f'{MODULE_TO_TEST}.cluster.start_cluster'))

        # fill client mock
        fake_client = mock.MagicMock(spec=skein.ApplicationClient)
        host_port = ('localhost', 1234)
        instances = [('worker', 10), ('chief', 1)]
        fake_client.kv.wait.return_value = json.dumps(instances).encode()
        patched_from_current.return_value = fake_client

        client, cluster_spec, cluster_tasks = _prepare_container(host_port)

        # checks
        patched_setup_logs.assert_called_once()
        patched_start_cluster.assert_called_once_with(
            host_port, fake_client, cluster_tasks)
        assert client == fake_client
        assert cluster_tasks == list(iter_tasks(instances))
def _setup_skein_cluster(
    pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription],
    task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
    *,
    custom_task_module: Optional[str] = None,
    skein_client: Optional[skein.Client] = None,
    files: Optional[Dict[str, str]] = None,
    env: Optional[Dict[str, str]] = None,
    queue: str = "default",
    acls: Optional[ACLs] = None,
    file_systems: Optional[List[str]] = None,
    name: str = "RunOnYarn",
    n_try: int = 0,
    pre_script_hook: Optional[str] = None
) -> SkeinCluster:
    """Build one skein Service per task spec, submit the application and
    start the kv-store event listener.

    Args:
        pyenvs: Python environment description per node label.
        task_specs: task name -> spec (instances, resources, node label,
            TensorBoard settings).
        custom_task_module: optional module used to launch the task command.
        skein_client: reuse an existing client; a new one is created if None.
        files: extra files to ship to every container.
        env: extra environment variables for every container.
        queue: YARN queue to submit to.
        acls: application ACLs.
        file_systems: extra file systems needing delegation tokens.
        name: YARN application name.
        n_try: retry index of the current run.
        pre_script_hook: shell snippet run before the task command.

    Returns:
        A SkeinCluster bundling the client, app, task instances, the event
        listener thread and the shared events dict.
    """
    # BUGFIX: `env` previously defaulted to a shared mutable dict ({});
    # using None keeps calls from leaking state into each other.
    env = {} if env is None else env

    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    pre_script_hook = pre_script_hook if pre_script_hook else ""
    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env, n_try)
        services = {}
        for task_type, task_spec in task_specs.items():
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            # A negative timeout means "no TensorBoard auto-termination".
            if task_spec.tb_termination_timeout_seconds >= 0:
                service_env["TB_TERMINATION_TIMEOUT_SECONDS"] = \
                    str(task_spec.tb_termination_timeout_seconds)
            if task_spec.tb_model_dir:
                service_env["TB_MODEL_DIR"] = str(task_spec.tb_model_dir)
            if task_spec.tb_extra_args:
                service_env["TB_EXTRA_ARGS"] = str(task_spec.tb_extra_args)

            services[task_type] = skein.Service(
                script=f'''
                            set -x
                            {pre_script_hook}
                            {_env.gen_task_cmd(
                                pyenv,
                                task_type,
                                custom_task_module)}
                        ''',
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        # on the cluster we don't ask again for delegation tokens
        if "HADOOP_TOKEN_FILE_LOCATION" in os.environ:
            file_systems = None

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name
        )
        if skein_client is None:
            skein_client = skein.Client()

        # NOTE: renamed loop variables so the ApplicationSpec local `spec`
        # is no longer shadowed inside the comprehension.
        task_instances = [(ttype, tspec.instances)
                          for ttype, tspec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in _internal.iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        return SkeinCluster(skein_client, app, task_instances, event_listener, events)
def _get_cluster_tasks(client: skein.ApplicationClient) -> List[str]:
    """Return the task list published under KV_CLUSTER_INSTANCES.

    Blocks until the driver has written the cluster membership to the
    skein key-value store.
    """
    raw = client.kv.wait(constants.KV_CLUSTER_INSTANCES)
    instances = json.loads(raw.decode())
    return list(iter_tasks(instances))