def _evaluator_fn(client):
    """Run continuous side-car evaluation until training finishes.

    Spawns a daemon thread that repeatedly evaluates the experiment's latest
    checkpoint, then blocks on the chief's "stop" event; once the chief stops,
    the closure flag flips and the evaluation loop exits on its next idle check.

    Args:
        client: skein.ApplicationClient used to fetch the experiment and to
            wait for the chief's stop event.
    """
    def _evaluate(stop):
        experiment = _task_commons._get_experiment(client)
        # Honor the EvalSpec's initial delay before the first evaluation.
        time.sleep(experiment.eval_spec.start_delay_secs)
        evaluated_checkpoints = set()
        while True:
            latest_checkpoint = experiment.estimator.latest_checkpoint()
            latest_eval_result = None
            if latest_checkpoint and latest_checkpoint not in evaluated_checkpoints:
                latest_eval_result = experiment.estimator.evaluate(
                    experiment.eval_spec.input_fn,
                    steps=experiment.eval_spec.steps,
                    hooks=experiment.eval_spec.hooks,
                    name=experiment.eval_spec.name
                )
                # BUG FIX: record the checkpoint as evaluated; previously the
                # set was never populated, so the same checkpoint was
                # re-evaluated on every throttle cycle.
                evaluated_checkpoints.add(latest_checkpoint)
            if experiment.train_spec.max_steps:
                # Bounded training: stop once the evaluated global step
                # reaches max_steps.
                if latest_eval_result and latest_eval_result.status == _EvalStatus.EVALUATED:
                    global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
                    if global_step and global_step >= experiment.train_spec.max_steps:
                        break
            else:
                # Unbounded training: rely on the external stop flag.
                if stop():
                    break
            time.sleep(experiment.eval_spec.throttle_secs)

    stop_evaluation = False
    # The lambda reads the enclosing local each call, so flipping
    # stop_evaluation below is observed by the running thread.
    thread = Thread(target=_evaluate, args=(lambda: stop_evaluation,), daemon=True)
    thread.start()
    event.wait(client, "chief:0/stop")
    stop_evaluation = True
def wait_for_connected_tasks(client, all_tasks, device_filters, message='stop'):
    """Block until every task matching the device filters has posted *message*.

    Args:
        client: skein.ApplicationClient used to wait on events.
        all_tasks: iterable of task names ("type:index").
        device_filters: filters passed to ``matches_device_filters``.
        message: event suffix to wait for (defaults to "stop").
    """
    relevant = (t for t in all_tasks if matches_device_filters(t, device_filters))
    for t in relevant:
        event.wait(client, f"{t}/{message}")
def create_cluster():
    """Start a two-task TF cluster from inside the skein application and
    block until a global "stop" event is broadcast."""
    app_client = skein.ApplicationClient.from_current()
    task_names = [f'{NODE_NAME}:0', f'{NODE_NAME}:1']
    spec = cluster.start_cluster(app_client, task_names)
    cluster.setup_tf_config(spec)
    cluster.start_tf_server(spec)
    event.wait(app_client, "stop")
def main() -> None:
    """Tensorboard task entry point.

    Serves tensorboard on the experiment's model directory until every other
    cluster task has stopped, then reports this task's own stop status.
    """
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    client = skein.ApplicationClient.from_current()
    _task_commons._setup_container_logs(client)
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    # Model dir can be forced via the environment; otherwise read it from
    # the experiment's estimator config.
    model_dir = os.getenv('TB_MODEL_DIR', "")
    if not model_dir:
        _logger.info("Read model_dir from estimator config")
        experiment = _task_commons._get_experiment(client)
        model_dir = experiment.estimator.config.model_dir
    _logger.info(f"Starting tensorboard on {model_dir}")

    tb_thread = _internal.MonitoredThread(
        name=f"{task_type}:{task_id}",
        target=tensorboard.start_tf_board,
        args=(client, model_dir),
        daemon=True)
    tb_thread.start()

    # Keep serving until every task in the cluster has announced its stop.
    for other_task in cluster_tasks:
        event.wait(client, f"{other_task}/stop")

    tb_thread.join(tensorboard.get_termination_timeout())
    event.stop_event(client, task, tb_thread.exception)
    event.broadcast_container_stop_time(client, task)
def _worker_fn(client, task, net_if):
    """Run one Horovod (gloo) training worker.

    Advertises this worker's address, waits for the chief's rendezvous info,
    exports the HOROVOD_* environment variables, initializes Horovod, then
    trains either an estimator-based ``Experiment`` or a ``KerasExperiment``.

    Args:
        client: skein.ApplicationClient used for event exchange.
        task: this container's task name, e.g. "worker:0".
        net_if: pair where net_if[0] is the network interface name and
            net_if[1] its address — TODO confirm against the caller.
    """
    # Advertise our address so the chief can build the host allocation plan.
    event.broadcast(client, f"{task}/addr", net_if[1])
    # Chief replies with "rank,size,local_rank,local_size,cross_rank,cross_size"
    # (see the matching host_info broadcast in the driver).
    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')
    # Point Horovod's gloo controller at the chief's rendezvous server.
    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]
    hvd.init()
    experiment = _task_commons._get_experiment(client)
    if isinstance(experiment, Experiment):
        if not is_chief(get_task_type(task)):
            # Overwrite config to do nothing but training to improve training speed
            experiment.estimator._model_dir = "."
            new_config = experiment.estimator.config.replace(
                save_summary_steps=None,
                save_checkpoints_steps=None,
                save_checkpoints_secs=None,
                log_step_count_steps=None)
            experiment.estimator._config = new_config
        logger.info("start training..")
        experiment.estimator.train(
            experiment.train_spec.input_fn,
            hooks=experiment.train_spec.hooks,
            max_steps=experiment.train_spec.max_steps)
    elif isinstance(experiment, KerasExperiment):
        if not is_chief(get_task_type(task)):
            # Drop checkpoint callbacks on non-chief workers so that only the
            # chief writes checkpoints.
            if experiment.train_params['callbacks'] is not None:
                callbacks_to_keep = []
                for callback in experiment.train_params['callbacks']:
                    if not isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                        callbacks_to_keep.append(callback)
                experiment.train_params['callbacks'] = callbacks_to_keep
        # Materialize input/target data lazily, just before fit().
        if experiment.input_data_fn is not None:
            experiment.train_params['x'] = experiment.input_data_fn()
        if experiment.target_data_fn is not None:
            experiment.train_params['y'] = experiment.target_data_fn()
        logger.info("start training..")
        experiment.model.fit(**experiment.train_params)
    else:
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")
def main() -> None:
    """Container entry point for TF standalone-client mode.

    Reserves a socket address for this container, registers it with the
    cluster, then runs a standard TensorFlow server until a global "stop"
    event is broadcast.
    """
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    # NOTE(review): reserve_sock_addr holds the port only within this block;
    # the TF server is started after exit — presumably so it can rebind the
    # reserved port. Confirm against _internal.reserve_sock_addr.
    with _internal.reserve_sock_addr() as host_port:
        client, cluster_spec, cluster_tasks = _task_commons._prepare_container(host_port)
        cluster.setup_tf_config(cluster_spec)
        # Session config is published by the client via the skein KV store.
        tf_session_config = cloudpickle.loads(client.kv.wait(constants.KV_TF_SESSION_CONFIG))
        _logger.info(f"tf_server_conf {tf_session_config}")
    tf.contrib.distribute.run_standard_tensorflow_server()
    event.wait(client, "stop")
def _setup_master(client: skein.ApplicationClient, rank: int) -> None:
    """Agree on the rendezvous master address/port across ranks.

    Rank 0 reserves a socket address and broadcasts it; every other rank
    waits for that broadcast. All ranks export the result through the
    MASTER_ADDR / MASTER_PORT environment variables.
    """
    if rank != 0:
        # Non-master ranks block until rank 0 has published the endpoint.
        os.environ[MASTER_ADDR] = event.wait(client, MASTER_ADDR)
        os.environ[MASTER_PORT] = event.wait(client, MASTER_PORT)
        return
    with _internal.reserve_sock_addr() as host_port:
        addr = host_port[0]
        port = str(host_port[1])
        event.broadcast(client, MASTER_ADDR, addr)
        event.broadcast(client, MASTER_PORT, port)
        os.environ[MASTER_ADDR] = addr
        os.environ[MASTER_PORT] = port
def _driver_fn(client, net_if): cluster_tasks = _task_commons._get_cluster_tasks(client) # Worker discovery worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"] n_workers = 1 for cluster_task in cluster_tasks: if 'worker' in cluster_task: worker_addr = event.wait(client, f"{cluster_task}/addr") logger.info(f"{cluster_task}: {worker_addr}") worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}") n_workers += 1 # Worker task allocation to workers hosts = gloo_run.parse_hosts(','.join(worker_list)) host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers) for host in host_alloc_plan: host_info = f"""\ {host.rank},{host.size},{host.local_rank},\ {host.local_size},{host.cross_rank},{host.cross_size}\ """ event.broadcast(client, f"{get_task()}/{host.hostname}", host_info) global_rendezv = RendezvousServer(verbose=1) global_rendezv_port = global_rendezv.start_server() global_rendezv.httpd.init(host_alloc_plan) event.broadcast(client, f"{get_task()}/sock_addr", f"{net_if[1]}:{global_rendezv_port}") return global_rendezv.listen_thread
def aggregate_spec(
    client: skein.ApplicationClient,
    all_tasks: typing.List[str]
) -> typing.Dict[str, typing.List[str]]:
    """Collect each task's advertised socket address, grouped by task type.

    Waits on every task's "<task>/init" event in ascending task-index order,
    so addresses appear in each type's list sorted by index.
    """
    def _task_index(name: str) -> int:
        return int(name.split(':', 1)[1])

    grouped: typing.Dict[str, typing.List[str]] = {}
    for name in sorted(all_tasks, key=_task_index):
        addr = event.wait(client, f"{name}/init")
        kind = name.split(":", 1)[0]
        grouped.setdefault(kind, [])
        grouped[kind].append(addr)
    return grouped
def _worker_fn(client, task, net_if):
    """Run one Horovod (gloo) training worker (estimator-only variant).

    Advertises this worker's address, waits for the chief's rendezvous info,
    exports the HOROVOD_* environment variables, initializes Horovod, then
    trains the experiment's estimator.

    Args:
        client: skein.ApplicationClient used for event exchange.
        task: this container's task name, e.g. "worker:0".
        net_if: pair where net_if[0] is the network interface name and
            net_if[1] its address — TODO confirm against the caller.
    """
    # Advertise our address so the chief can build the host allocation plan.
    event.broadcast(client, f"{task}/addr", net_if[1])
    # Chief replies with "rank,size,local_rank,local_size,cross_rank,cross_size".
    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')
    # Point Horovod's gloo controller at the chief's rendezvous server.
    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]
    hvd.init()
    experiment = _task_commons._get_experiment(client)
    # NOTE(review): compares the raw task string, unlike the sibling
    # _worker_fn which uses is_chief(get_task_type(task)) — confirm both
    # conventions are intended.
    if task != 'chief:0':
        # Overwrite config to do nothing but training to improve training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None)
        experiment.estimator._config = new_config
    logger.info("start training..")
    experiment.estimator.train(
        experiment.train_spec.input_fn,
        hooks=experiment.train_spec.hooks,
        max_steps=experiment.train_spec.max_steps)
def client_tf(client):
    """Submit a skein app and run a distributed mean computation on it.

    Builds a TF1 graph that splits a 100-element placeholder across two
    remote tasks, averages the two half-means, runs the graph against the
    first task's server, then broadcasts "stop" to tear the app down.

    Args:
        client: skein client capable of ``submit_and_connect``.
    """
    spec = create_skein_app()
    app = client.submit_and_connect(spec)
    x = tf.placeholder(tf.float32, 100)
    # Pin the first half of the computation on task 1.
    with tf.device(f"/job:{NODE_NAME}/task:1"):
        first_batch = tf.slice(x, [0], [50])
        mean1 = tf.reduce_mean(first_batch)
    # Pin the second half (and the final average) on task 0.
    with tf.device(f"/job:{NODE_NAME}/task:0"):
        second_batch = tf.slice(x, [50], [-1])
        mean2 = tf.reduce_mean(second_batch)
        mean = (mean1 + mean2) / 2
    # Task 0 publishes its grpc endpoint under "<task>/init".
    first_task = event.wait(app, f"{NODE_NAME}:0/init")
    with tf.Session(f"grpc://{first_task}") as sess:
        result = sess.run(mean, feed_dict={x: np.random.random(100)})
        print(f"mean = {result}")
    event.broadcast(app, "stop", "1")
def _setup_tracker(client):
    """Wait for the chief's tracker endpoint and export it as DMLC env vars."""
    tracker_endpoint = event.wait(client, "chief:0/tracker")
    tf.logging.info(f"Got tracker url {tracker_endpoint}")
    parts = tracker_endpoint.split(":")
    uri, port = parts
    os.environ['DMLC_TRACKER_URI'] = uri
    os.environ['DMLC_TRACKER_PORT'] = port