def main() -> None:
    """Serve TensorBoard from this container until the cluster tasks stop.

    The model directory is taken from the ``TB_MODEL_DIR`` environment
    variable; when that is unset or empty it is read from the experiment's
    estimator config instead. ``tensorboard.start_tf_board`` runs in a
    daemon ``MonitoredThread`` while this function waits for the
    ``<task>/stop`` event of every cluster task, then joins the thread with
    the timeout given by ``tensorboard.get_termination_timeout()``.
    """
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    client = skein.ApplicationClient.from_current()
    _task_commons._setup_container_logs(client)
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    # An explicit TB_MODEL_DIR wins; otherwise fall back to the estimator
    # configuration attached to the experiment.
    model_dir = os.getenv('TB_MODEL_DIR', "")
    if not model_dir:
        _logger.info("Read model_dir from estimator config")
        model_dir = _task_commons._get_experiment(
            client
        ).estimator.config.model_dir
    _logger.info(f"Starting tensorboard on {model_dir}")

    board_thread = _internal.MonitoredThread(
        name=f"{task_type}:{task_id}",
        target=tensorboard.start_tf_board,
        args=(client, model_dir),
        daemon=True,
    )
    board_thread.start()

    # Block until each task in the cluster has published its stop event.
    for other_task in cluster_tasks:
        event.wait(client, f"{other_task}/stop")

    board_thread.join(tensorboard.get_termination_timeout())

    # Forward whatever exception the monitored thread recorded (if any)
    # with this task's own stop event.
    event.stop_event(client, task, board_thread.exception)
    event.broadcast_container_stop_time(client, task)
def _prepare_container(
    host_port: Tuple[str, int]
) -> Tuple[skein.ApplicationClient, Dict[str, List[str]], List[str]]:
    """Set up the current container and start the cluster from it.

    Connects to the current skein application, configures container logs,
    fetches the list of cluster tasks and calls ``cluster.start_cluster``.

    NOTE(review): the original docstring says the socket should be kept
    open while this runs -- presumably the one bound to ``host_port``;
    confirm against the caller.

    Args:
        host_port: ``(host, port)`` pair passed on to
            ``cluster.start_cluster``.

    Returns:
        A ``(client, cluster_spec, cluster_tasks)`` tuple.
    """
    app_client = skein.ApplicationClient.from_current()
    _setup_container_logs(app_client)
    tasks = _get_cluster_tasks(app_client)
    spec = cluster.start_cluster(host_port, app_client, tasks)
    return app_client, spec, tasks
def main():
    """Run the evaluator when this container holds the evaluator task.

    Sends the task's init event, configures container logs, and then either
    calls ``evaluator_fn`` (task type ``"evaluator"``) or logs that there is
    nothing to do. The stop event is sent with ``None`` in both cases.
    """
    client = skein.ApplicationClient.from_current()
    task = cluster.get_task()
    task_type, task_id = cluster.get_task_description()
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)

    if task_type != "evaluator":
        # Any other task type is a no-op for this entry point.
        logger.info(f"{task_type}:{task_id}: nothing to do")
    else:
        evaluator_fn(client)

    event.stop_event(client, task, None)
def main():
    """Dispatch this container's task to the matching worker or evaluator.

    Sends the task's init event, configures container logs, then:

    * ``chief`` / ``worker`` -> ``_worker_fn(task_type, task_id, client)``
    * ``evaluator``          -> ``_evaluator_fn(client)``
    * anything else          -> an error is logged and nothing is run

    The stop event is sent with ``None`` afterwards in every case,
    including the unknown-task-type one.
    """
    client = skein.ApplicationClient.from_current()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    # Plain literal: the address contains nothing to interpolate, so the
    # former f-prefix was dropped.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)

    if task_type in ('chief', 'worker'):
        _worker_fn(task_type, task_id, client)
    elif task_type == 'evaluator':
        _evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')

    event.stop_event(client, task, None)
def main():
    """Dispatch this container's task to the driver, worker or evaluator.

    Sends the task's init event, configures container logs and looks up the
    network interface via ``get_net_if()``, then:

    * ``chief``     -> ``_driver_fn`` followed by ``_worker_fn``
    * ``worker``    -> ``_worker_fn``
    * ``evaluator`` -> ``evaluator_fn``
    * anything else -> an error is logged and nothing is run

    The stop event is sent with ``None`` afterwards in every case.
    """
    client = skein.ApplicationClient.from_current()
    # Only the task type is needed here; the task id is discarded.
    task_type, _ = get_task_description()
    task = get_task()
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)
    net_if = get_net_if()

    is_chief = task_type == 'chief'
    if is_chief:
        # The chief runs the driver first and then also acts as a worker.
        _driver_fn(client, net_if)

    if is_chief or task_type == 'worker':
        _worker_fn(client, task, net_if)
    elif task_type == 'evaluator':
        evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')

    event.stop_event(client, task, None)