def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

    tune_executor = self

    class RayTuneReportCallback(Callback):
        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if trainer.is_coordinator():
                with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                    checkpoint_model = os.path.join(checkpoint_dir, "model")
                    shutil.copytree(save_path, checkpoint_model)

                train_stats, eval_stats = progress_tracker.train_metrics, progress_tracker.vali_metrics
                stats = eval_stats or train_stats
                metric_score = tune_executor.get_metric_score_from_eval_stats(stats)[-1]
                tune.report(
                    parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                    eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
                )

    train_stats, eval_stats = run_experiment(
        **hyperopt_dict,
        model_resume_path=checkpoint_dir,
        callbacks=[RayTuneReportCallback()],
    )

    metric_score = self.get_metric_score(train_stats, eval_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
    )
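# --- Illustrative sketch, not part of the executor above ---
# The RayTuneReportCallback relies on Ray Tune's legacy function-trainable API
# (Ray 1.x): tune.checkpoint_dir() yields a per-step directory to persist
# artifacts into, and tune.report() streams metrics back to the Tune driver.
# A minimal, self-contained example of that pattern; the trainable, config,
# and metric names here are hypothetical and chosen only for illustration.
import os

from ray import tune


def trainable(config):
    for epoch in range(10):
        score = config["lr"] * epoch  # stand-in for a real evaluation metric
        with tune.checkpoint_dir(step=epoch) as ckpt_dir:
            # Persist whatever is needed to resume this trial later
            with open(os.path.join(ckpt_dir, "state.txt"), "w") as f:
                f.write(str(epoch))
        tune.report(metric_score=score)


analysis = tune.run(trainable, config={"lr": 0.01}, metric="metric_score", mode="max")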
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    trial_dir = Path(tune.get_trial_dir())
    trial_location = ray.util.get_node_ip_address()

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    def checkpoint(progress_tracker, save_path):
        with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
            checkpoint_model = os.path.join(checkpoint_dir, "model")
            # Note: A previous implementation used shutil.copytree(save_path, checkpoint_model)
            # directly; however, that copying method is non-atomic.
            if not os.path.isdir(checkpoint_model):
                copy_id = uuid.uuid4()
                tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                assert os.path.exists(save_path)
                shutil.copytree(save_path, tmp_dst)
                try:
                    os.rename(tmp_dst, checkpoint_model)
                except Exception:
                    shutil.rmtree(tmp_dst)

    def report(progress_tracker):
        train_stats = {
            TRAINING: progress_tracker.train_metrics,
            VALIDATION: progress_tracker.vali_metrics,
            TEST: progress_tracker.test_metrics,
        }
        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
            # The sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                save_path = Path(save_path)

                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)

                sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                if sync_info is not None:
                    sync_client, remote_checkpoint_dir = sync_info
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()

        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if is_using_ray_backend:
                save_path = Path(save_path)
                if trial_location != ray.util.get_node_ip_address():
                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                        sync_client.wait()
                ray_queue.put((progress_tracker, str(save_path)))
                return

            checkpoint(progress_tracker, save_path)
            report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

        hvd_kwargs = {
            "num_workers": int(current_resources),
            "use_gpu": use_gpu,
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
    if is_using_ray_backend and sync_info is not None:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        sync_client, remote_checkpoint_dir = sync_info

        def check_queue():
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        check_queue()
    else:
        # avoid threading overhead when it is not needed
        _run()

    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
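# --- Illustrative sketch, not part of the executor above ---
# With the Ray backend, training runs on worker actors while the Tune session
# lives only on the trial actor, so per-epoch results are shipped over a
# distributed queue (RayQueue above, which appears to alias ray.util.queue.Queue)
# and drained on the trial actor by check_queue(). A minimal, self-contained
# example of that producer/consumer pattern; the worker task and its payload
# are hypothetical.
import ray
from ray.util.queue import Queue

ray.init(ignore_reinit_error=True)

results_queue = Queue(actor_options={"num_cpus": 0})


@ray.remote
def worker(queue, num_epochs):
    # Stand-in for a training worker pushing per-epoch progress to the driver
    for epoch in range(num_epochs):
        queue.put({"epoch": epoch, "loss": 1.0 / (epoch + 1)})


ray.get(worker.remote(results_queue, num_epochs=3))

# Drain whatever has accumulated without blocking, as check_queue() does above
qsize = results_queue.qsize()
if qsize:
    for result in results_queue.get_nowait_batch(qsize):
        print(result)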
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

    tune_executor = self

    class RayTuneReportCallback(Callback):
        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if trainer.is_coordinator():
                with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                    checkpoint_model = os.path.join(checkpoint_dir, "model")
                    # Note: A previous implementation used shutil.copytree(save_path, checkpoint_model)
                    # directly; however, that copying method is non-atomic.
                    if not os.path.isdir(checkpoint_model):
                        copy_id = uuid.uuid4()
                        tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                        shutil.copytree(save_path, tmp_dst)
                        try:
                            os.rename(tmp_dst, checkpoint_model)
                        except Exception:
                            shutil.rmtree(tmp_dst)

                train_stats = {
                    TRAINING: progress_tracker.train_metrics,
                    VALIDATION: progress_tracker.vali_metrics,
                    TEST: progress_tracker.test_metrics,
                }
                metric_score = tune_executor.get_metric_score(train_stats, eval_stats=None)
                tune.report(
                    parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats[TRAINING], cls=NumpyEncoder),
                    eval_stats=json.dumps(train_stats[VALIDATION], cls=NumpyEncoder),
                )

    train_stats, eval_stats = run_experiment(
        **hyperopt_dict,
        model_resume_path=checkpoint_dir,
        callbacks=[RayTuneReportCallback()],
    )

    metric_score = self.get_metric_score(train_stats, eval_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
    )
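# --- Illustrative sketch, not part of the executor above ---
# The copy-to-temp-then-rename dance above exists because shutil.copytree()
# alone is not atomic: a concurrent reader (e.g. Tune restoring the trial)
# could observe a half-written "model" directory. Copying into a uniquely
# named temporary directory and then renaming it publishes the checkpoint in
# a single step on the same filesystem. A standalone version of the pattern:
import os
import shutil
import uuid


def atomic_copytree(src: str, dst: str) -> None:
    """Copy `src` to `dst` such that `dst` never exists in a partial state."""
    if os.path.isdir(dst):
        return  # already published
    tmp_dst = f"{dst}.{uuid.uuid4()}.tmp"
    shutil.copytree(src, tmp_dst)
    try:
        os.rename(tmp_dst, dst)  # atomic when tmp_dst and dst share a filesystem
    except OSError:
        # Another writer won the race (or the rename failed); discard our copy
        shutil.rmtree(tmp_dst, ignore_errors=True)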
def _run_experiment(
    self,
    config,
    checkpoint_dir,
    hyperopt_dict,
    decode_ctx,
    features_eligible_for_shared_params,
    is_using_ray_backend=False,
):
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = self.decode_values(config, decode_ctx)

    # Remove mlflow injected config parameters: https://github.com/ludwig-ai/ludwig/issues/2288
    if "mlflow" in config:
        del config["mlflow"]

    trial_id = tune.get_trial_id()
    trial_dir = Path(tune.get_trial_dir())
    driver_trial_location = ray.util.get_node_ip_address()

    modified_config = substitute_parameters(
        copy.deepcopy(hyperopt_dict["config"]), config, features_eligible_for_shared_params
    )

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    def report(progress_tracker):
        # The progress tracker's metrics are nested dictionaries of TrainerMetrics: feature_name -> metric_name ->
        # List[TrainerMetric], with one entry per training checkpoint, according to steps_per_checkpoint.
        # We reduce the dictionary of TrainerMetrics to a simple list of floats for interfacing with Ray Tune.
        train_stats = {
            TRAINING: metric_utils.reduce_trainer_metrics_dict(progress_tracker.train_metrics),
            VALIDATION: metric_utils.reduce_trainer_metrics_dict(progress_tracker.validation_metrics),
            TEST: metric_utils.reduce_trainer_metrics_dict(progress_tracker.test_metrics),
        }
        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        def __init__(self):
            super().__init__()
            self.last_steps = 0

        def _get_remote_checkpoint_dir(self) -> Optional[Union[str, Tuple[str, str]]]:
            # The sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_remote_checkpoint_dir(trial_dir)

        def _checkpoint_progress(self, trainer, progress_tracker, save_path) -> None:
            """Checkpoints the progress tracker."""
            if is_using_ray_backend:
                save_path = Path(save_path)
                remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                if remote_checkpoint_dir is not None:
                    sync_client = tune_executor.sync_client
                    sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                    sync_client.wait_or_retry()
                ray_queue.put((progress_tracker, str(save_path)))
                return
            checkpoint(progress_tracker, save_path)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if is_using_ray_backend and checkpoint_dir and driver_trial_location != ray.util.get_node_ip_address():
                save_path = Path(save_path)

                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)

                remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                if remote_checkpoint_dir is not None:
                    sync_client = tune_executor.sync_client
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait_or_retry()

        def on_eval_end(self, trainer, progress_tracker, save_path):
            progress_tracker.tune_checkpoint_num += 1
            self.last_steps = progress_tracker.steps
            self._checkpoint_progress(trainer, progress_tracker, save_path)
            if not is_using_ray_backend:
                report(progress_tracker)

        def on_trainer_train_teardown(self, trainer, progress_tracker, save_path, is_coordinator):
            if is_coordinator and progress_tracker.steps > self.last_steps:
                # Note: Calling tune.report in both on_eval_end() and here can cause multiprocessing issues
                # for some ray samplers if no steps have happened since the last eval.
                self._checkpoint_progress(trainer, progress_tracker, save_path)
                if not is_using_ray_backend:
                    report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        num_gpus = resources.required_resources.get("GPU", 0)
        num_cpus = resources.required_resources.get("CPU", 1) if num_gpus == 0 else 0

        hvd_kwargs = {
            "num_workers": int(num_gpus) if use_gpu else 1,
            "use_gpu": use_gpu,
            "resources_per_worker": {
                "CPU": num_cpus,
                "GPU": 1 if use_gpu else 0,
            },
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    if is_using_ray_backend:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        if self.sync_config is not None:
            remote_checkpoint_dir = self._get_remote_checkpoint_dir(trial_dir)

        def check_queue():
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                if self.sync_client is not None:
                    self.sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    self.sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        check_queue()
    else:
        # avoid threading overhead when it is not needed
        _run()

    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
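# --- Illustrative sketch, not part of the executor above ---
# With the Ray backend, run_experiment() executes in a daemon thread while the
# trial actor's main thread polls the queue and calls tune.report(), because
# the Tune session runs only on the trial actor. A stripped-down version of
# that polling loop using only the standard library; the worker function and
# its payload are hypothetical.
import queue
import threading
import time

results_queue: "queue.Queue[int]" = queue.Queue()


def _run():
    # Stand-in for the long-running run_experiment() call
    for step in range(3):
        time.sleep(0.2)
        results_queue.put(step)


def check_queue():
    while not results_queue.empty():
        step = results_queue.get_nowait()
        print(f"report step {step}")  # stand-in for checkpoint() + tune.report()


thread = threading.Thread(target=_run, daemon=True)
thread.start()
while thread.is_alive():
    thread.join(timeout=0)
    check_queue()
    time.sleep(0.1)
thread.join()
check_queue()  # drain anything that arrived after the thread finished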