def _get_experiment(client: skein.ApplicationClient) -> Experiment:
    """Load and build the experiment published in the skein key-value store.

    The value stored under ``KV_EXPERIMENT_FN`` is deserialized with ``dill``
    and the resulting callable is invoked with no arguments; its return value
    is the experiment.

    Args:
        client: skein application client whose ``kv`` store holds the
            serialized experiment factory.

    Returns:
        Whatever the deserialized factory returns.

    Raises:
        Exception: any error raised while fetching, deserializing or calling
            the factory is re-raised unchanged, after a start event and a
            stop event (carrying the error) were emitted for the current
            task.
    """
    # NOTE(review): dill.loads executes arbitrary code from the payload;
    # presumably only the trusted driver writes this key -- confirm.
    try:
        serialized_fn = client.kv.wait(KV_EXPERIMENT_FN)
        experiment_fn = dill.loads(serialized_fn)
        return experiment_fn()
    except Exception as error:
        # Emit a start/stop pair for this task so the failure is reported
        # through the event channel before propagating.
        failed_task = cluster.get_task()
        event.start_event(client, failed_task)
        event.stop_event(client, failed_task, error)
        raise
def _get_experiment(client: skein.ApplicationClient) -> NamedTuple:
    """Load and build the experiment published in the skein key-value store.

    The value stored under ``constants.KV_EXPERIMENT_FN`` is deserialized
    with ``cloudpickle`` and the resulting callable is invoked with no
    arguments; its return value is the experiment.

    Args:
        client: skein application client whose ``kv`` store holds the
            serialized experiment factory.

    Returns:
        Whatever the deserialized factory returns.

    Raises:
        Exception: any error raised while fetching, deserializing or calling
            the factory is re-raised unchanged, after a start event and a
            stop event (carrying the error) were emitted for the current
            task.
    """
    # NOTE(review): cloudpickle.loads executes arbitrary code from the
    # payload; presumably only the trusted driver writes this key -- confirm.
    try:
        serialized_fn = client.kv.wait(constants.KV_EXPERIMENT_FN)
        experiment_fn = cloudpickle.loads(serialized_fn)
        return experiment_fn()
    except Exception as error:
        # Emit a start/stop pair for this task so the failure is reported
        # through the event channel before propagating.
        failed_task = get_task()
        event.start_event(client, failed_task)
        event.stop_event(client, failed_task, error)
        raise
def _execute_dispatched_function(client: skein.ApplicationClient,
                                 experiment: Experiment) -> MonitoredThread:
    """Run the experiment in a daemon ``MonitoredThread`` and return it.

    The thread target is produced by
    ``_gen_monitored_train_and_evaluate(client)`` and receives the fields of
    ``experiment`` as positional arguments. Once the thread has been started,
    a start event is emitted for the current task.

    Args:
        client: skein application client, forwarded to the target factory
            and to the event helper.
        experiment: iterable experiment description, unpacked into the
            thread's positional arguments.

    Returns:
        The already-started thread, named ``"<task_type>:<task_id>"``.
    """
    task_type, task_id = cluster.get_task_description()
    _logger.info(f"Starting execution {task_type}:{task_id}")

    thread_options = dict(
        name=f"{task_type}:{task_id}",
        target=_gen_monitored_train_and_evaluate(client),
        args=tuple(experiment),
        daemon=True,
    )
    worker = MonitoredThread(**thread_options)
    worker.start()

    # The start event is emitted only after the thread is running.
    current_task = cluster.get_task()
    event.start_event(client, current_task)
    return worker
def _execute_dispatched_function(
        client: skein.ApplicationClient,
        experiment: Union[Experiment, KerasExperiment]) -> MonitoredThread:
    """Run the experiment in a daemon ``MonitoredThread`` and return it.

    Only ``Experiment`` instances are executable here. The thread target is
    produced by ``_gen_monitored_train_and_evaluate(client)`` and receives
    the fields of ``experiment`` as positional arguments. Once the thread has
    been started, a start event is emitted for the current task.

    Args:
        client: skein application client, forwarded to the target factory
            and to the event helper.
        experiment: experiment description to execute.

    Returns:
        The already-started thread, named ``"<task_type>:<task_id>"``.

    Raises:
        ValueError: if ``experiment`` is a ``KerasExperiment`` (unsupported
            here) or is neither an ``Experiment`` nor a ``KerasExperiment``.
    """
    task_type, task_id = get_task_description()
    _logger.info(f"Starting execution {task_type}:{task_id}")

    # Reject everything that is not a plain Experiment up front.
    if not isinstance(experiment, Experiment):
        if isinstance(experiment, KerasExperiment):
            raise ValueError(
                "KerasExperiment using parameter strategy is unsupported")
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")

    worker = MonitoredThread(
        name=f"{task_type}:{task_id}",
        target=_gen_monitored_train_and_evaluate(client),
        args=tuple(experiment),
        daemon=True)
    worker.start()

    # The start event is emitted only after the thread is running.
    current_task = get_task()
    event.start_event(client, current_task)
    return worker
def start_tf_board(client: skein.ApplicationClient, tf_board_model_dir: str):
    """Launch an in-process TensorBoard and publish its URL as a task event.

    Sets a few environment variables, configures TensorBoard on a reserved
    host/port with ``tf_board_model_dir`` as log directory, launches it and
    then emits a start event followed by a URL event for the current task.

    Extra TensorBoard command-line arguments may be supplied through the
    ``TB_EXTRA_ARGS`` environment variable as a whitespace-separated list.

    Args:
        client: skein application client passed to the event helpers.
        tf_board_model_dir: directory handed to TensorBoard as ``--logdir``.

    Returns:
        None. Failures are not propagated: any exception is logged with its
        traceback and reported through ``event.stop_event`` instead.
    """
    task = cluster.get_task()
    os.environ['GCS_READ_CACHE_DISABLED'] = '1'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'cpp'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION'] = '2'
    try:
        program.setup_environment()
        tensorboard = program.TensorBoard()
        # NOTE(review): the address reservation is assumed to end before
        # launch() so TensorBoard can bind the port itself -- confirm.
        with _internal.reserve_sock_addr() as (h, p):
            tensorboard_url = f"http://{h}:{p}"
            argv = ['tensorboard', f"--logdir={tf_board_model_dir}",
                    f"--port={p}"]
            tb_extra_args = os.getenv('TB_EXTRA_ARGS', "")
            if tb_extra_args:
                # split() without a separator collapses runs of whitespace,
                # so repeated/leading/trailing spaces do not inject empty
                # arguments into argv.
                argv += tb_extra_args.split()
            tensorboard.configure(argv)
        tensorboard.launch()
        event.start_event(client, task)
        event.url_event(client, task, tensorboard_url)
    except Exception as e:
        # The previous call passed `e` as a %-format argument to a message
        # without any placeholder, which makes the logging module fail to
        # format the record; exception() logs the message plus traceback.
        _logger.exception("Cannot start tensorboard")
        event.stop_event(client, task, e)