def main():
    """Container entry point for an evaluator-only job.

    Registers the container with the skein application master, wires up
    container log forwarding, and runs ``evaluator_fn`` when this task is
    the evaluator; any other task type has nothing to do and simply logs.
    """
    client = skein.ApplicationClient.from_current()
    task = cluster.get_task()
    task_type, task_id = cluster.get_task_description()
    # No real server is bound in this container; advertise a placeholder.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)
    if task_type != "evaluator":
        logger.info(f"{task_type}:{task_id}: nothing to do")
    else:
        evaluator_fn(client)
    # Signal normal completion (no error message) to the application master.
    event.stop_event(client, task, None)
def main():
    """Container entry point dispatching on the YARN task type.

    Registers with the skein application master, sets up container log
    forwarding, then runs the worker loop for ``chief``/``worker`` tasks or
    the evaluator loop for ``evaluator`` tasks; unknown types are logged as
    errors. Always emits a stop event at the end.
    """
    client = skein.ApplicationClient.from_current()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    # Fix: was f"127.0.0.1:0" — an f-string with no placeholders (ruff F541).
    # No server socket is bound yet, so a placeholder address is advertised.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)
    if task_type in ['chief', 'worker']:
        _worker_fn(task_type, task_id, client)
    elif task_type == 'evaluator':
        _evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')
    # Signal normal completion (no error message) to the application master.
    event.stop_event(client, task, None)
def start_cluster(
        host_port: typing.Tuple[str, int],
        client: skein.ApplicationClient,
        all_tasks: typing.List[str]) -> typing.Dict[str, typing.List[str]]:
    """Announce this task's address and gather the full cluster spec.

    Publishes ``host:port`` (with ``host`` resolved to an IP) via an init
    event, then blocks on ``aggregate_spec`` until every task in
    ``all_tasks`` has announced itself.

    Returns:
        Mapping from task type to the list of ``host:port`` addresses.
    """
    # There is a race condition between acquiring a TCP port for
    # ``tf.train.Server``, and calling ``train_and_evaluate``.
    # There is no TensorFlow API to get rid of the race condition
    # completely, but the window of opportunity can be reduced by
    # preempting the server.
    # See https://github.com/tensorflow/tensorflow/issues/21492
    host, port = host_port
    event.init_event(client, get_task(),
                     f"{socket.gethostbyname(host)}:{port}")
    # Fix: dropped the dead `cluster_spec: typing.Dict = dict()` initializer
    # that was immediately overwritten by the aggregate_spec result.
    return aggregate_spec(client, all_tasks)
def main():
    """Container entry point for chief/worker/evaluator tasks.

    Registers with the skein application master and forwards container
    logs, then dispatches on task type. Note the deliberate non-``elif``
    second branch: a ``chief`` task runs ``_driver_fn`` *and then* also
    participates as a worker via ``_worker_fn``.
    """
    client = skein.ApplicationClient.from_current()
    task_type, task_id = get_task_description()
    task = get_task()
    # No server socket is bound yet; advertise a placeholder address.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)
    interface = get_net_if()
    if task_type == 'chief':
        _driver_fn(client, interface)
    if task_type in ('worker', 'chief'):
        _worker_fn(client, task, interface)
    elif task_type == 'evaluator':
        evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')
    # Signal normal completion (no error message) to the application master.
    event.stop_event(client, task, None)