def train_breast_cancer(config: dict, checkpoint_dir=None):
    """Train an XGBoost model on the breast cancer dataset as a Tune trainable.

    The dataset is loaded from scikit-learn, split with ``test_size=0.25`` and
    wrapped in ``xgb.DMatrix`` objects. Training reports and checkpoints
    through ``TuneReportCheckpointCallback`` on every iteration
    (``frequency=1``).

    Args:
        config: XGBoost parameter dict handed to ``xgb.train``. It is mutated
            in place: ``config["nthread"]`` is overwritten with the
            ``head_cpus`` value of the current trial resources.
        checkpoint_dir: When truthy, a booster is loaded from
            ``CHECKPOINT_FILENAME`` inside this directory and training
            continues from it.
    """
    # Load the data and hold out a quarter of it for evaluation.
    features, targets = sklearn.datasets.load_breast_cancer(return_X_y=True)
    x_train, x_eval, y_train, y_eval = train_test_split(
        features, targets, test_size=0.25
    )

    # XGBoost consumes its own matrix type.
    dtrain = xgb.DMatrix(x_train, label=y_train)
    deval = xgb.DMatrix(x_eval, label=y_eval)

    # Checkpointing needs to be set up in order for dynamic resource
    # allocation to work as intended: resume from the stored booster if any.
    if checkpoint_dir:
        xgb_model = xgb.Booster()
        xgb_model.load_model(os.path.join(checkpoint_dir, CHECKPOINT_FILENAME))
    else:
        xgb_model = None

    # The resources currently assigned to this trial are available through
    # tune.get_trial_resources().
    trial_resources = tune.get_trial_resources()
    config["nthread"] = int(trial_resources.head_cpus)
    print(f"nthreads: {config['nthread']} xgb_model: {xgb_model}")

    # Checkpointing should happen every iteration with dynamic resource
    # allocation, hence frequency=1.
    checkpoint_callback = TuneReportCheckpointCallback(
        filename=CHECKPOINT_FILENAME,
        frequency=1,
    )

    # Train the classifier, using the Tune callback.
    xgb.train(
        config,
        dtrain,
        evals=[(deval, "eval")],
        verbose_eval=False,
        xgb_model=xgb_model,
        callbacks=[checkpoint_callback],
    )
def train(config, checkpoint_dir=None):
    """Report a constant metric together with the trial's current resources.

    Neither ``config`` nor ``checkpoint_dir`` is read by the body; a single
    ``tune.report`` call is made with ``metric=1`` and the value returned by
    ``tune.get_trial_resources()``.
    """
    trial_resources = tune.get_trial_resources()
    tune.report(metric=1, resources=trial_resources)
def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
    """Run one hyperopt trial inside a Ray Tune session and report its results.

    The trial parameters are decoded and substituted into the base config,
    ``run_experiment`` is executed with a callback that checkpoints and
    reports progress to Tune, and a final ``tune.report`` is issued with the
    train and eval statistics returned by ``run_experiment``.

    Args:
        config: Parameter values for this trial. They are decoded with
            ``RayTuneSampler.decode_values`` and substituted into
            ``hyperopt_dict["config"]``.
        checkpoint_dir: Forwarded to ``run_experiment`` as
            ``model_resume_path``. When truthy (and running on the Ray
            backend on a different node than this method) other
            ``checkpoint*`` directories of the trial are removed before
            syncing down.
        hyperopt_dict: Keyword arguments for ``run_experiment``. Mutated in
            place: ``config``, ``experiment_name``, ``output_directory`` and
            ``callbacks`` are overwritten, and ``set_distributed_kwargs`` is
            called on ``hyperopt_dict["backend"]`` for the Ray backend.
        decode_ctx: Passed through to ``RayTuneSampler.decode_values``.
        is_using_ray_backend: When True, training callbacks push progress
            into a ``RayQueue`` instead of reporting directly.

    Raises:
        RuntimeError: If ``run_experiment`` did not return any statistics.
    """
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = RayTuneSampler.decode_values(config, decode_ctx)

    trial_id = tune.get_trial_id()
    modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

    trial_dir = Path(tune.get_trial_dir())
    # IP of the node executing this method; callbacks compare against it to
    # detect that they are running on a different node.
    trial_location = ray.util.get_node_ip_address()

    hyperopt_dict["config"] = modified_config
    # Fix: the key used to be "experiment_name " (trailing space), which left
    # the real "experiment_name" entry untouched and forwarded a stray keyword
    # to run_experiment instead of the trial-specific name.
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    # Alias so that the nested callback class (which has its own ``self``)
    # can still reach this executor.
    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    def checkpoint(progress_tracker, save_path):
        """Copy the model directory at ``save_path`` into a Tune checkpoint directory."""
        # The context variable is deliberately not called ``checkpoint_dir`` so
        # it cannot be confused with the outer method argument.
        with tune.checkpoint_dir(step=progress_tracker.epoch) as tune_checkpoint_dir:
            checkpoint_model = os.path.join(tune_checkpoint_dir, "model")
            # Note: A previous implementation copied straight into place with
            # shutil.copytree(save_path, checkpoint_model); however, that
            # copying method is non atomic. Copy to a uniquely named temporary
            # directory first and rename it into place.
            if not os.path.isdir(checkpoint_model):
                copy_id = uuid.uuid4()
                tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                assert os.path.exists(save_path)
                shutil.copytree(save_path, tmp_dst)
                try:
                    os.rename(tmp_dst, checkpoint_model)
                except Exception:
                    # Best effort: if the rename fails, discard the temporary copy.
                    shutil.rmtree(tmp_dst)

    def report(progress_tracker):
        """Send the tracker's current metrics to Tune as an intermediate result."""
        train_stats = {
            TRAINING: progress_tracker.train_metrics,
            VALIDATION: progress_tracker.vali_metrics,
            TEST: progress_tracker.test_metrics,
        }

        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            # Evaluation statistics are only reported in the final report.
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        """Training callback that checkpoints and reports progress for this trial."""

        def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
            # sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                save_path = Path(save_path)

                # Remove checkpoint directories other than the ones in use.
                # NOTE(review): ``path`` is a Path; if ``checkpoint_dir`` is
                # passed as a str this membership test can never match it --
                # confirm the type callers provide.
                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)

                sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                if sync_info is not None:
                    sync_client, remote_checkpoint_dir = sync_info
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()

        def on_epoch_end(self, trainer, progress_tracker, save_path):
            if is_using_ray_backend:
                save_path = Path(save_path)
                if trial_location != ray.util.get_node_ip_address():
                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                        sync_client.wait()
                # Checkpointing and reporting are done by the queue consumer
                # in the enclosing method, not here.
                ray_queue.put((progress_tracker, str(save_path)))
                return

            checkpoint(progress_tracker, save_path)
            report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

        hvd_kwargs = {
            "num_workers": int(current_resources),
            "use_gpu": use_gpu,
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    # Filled by _run(); a list is used so the result can be handed back from
    # the worker thread.
    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
    if is_using_ray_backend and sync_info is not None:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        sync_client, remote_checkpoint_dir = sync_info

        def check_queue():
            """Drain queued progress, sync it down, then checkpoint and report each item."""
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        # Pick up anything queued between the last poll and thread exit.
        check_queue()
    else:
        # remove threading overhead
        # NOTE(review): with the Ray backend but no sync info, on_epoch_end
        # still puts items on ray_queue and nothing in this branch drains it.
        _run()

    # Empty means _run() never appended, i.e. run_experiment did not return.
    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )
def _run_experiment(
    self,
    config,
    checkpoint_dir,
    hyperopt_dict,
    decode_ctx,
    features_eligible_for_shared_params,
    is_using_ray_backend=False,
):
    """Run one hyperopt trial inside a Ray Tune session and report its results.

    The trial parameters are decoded and substituted into the base config,
    ``run_experiment`` is executed with a callback that checkpoints and
    reports progress to Tune, and a final ``tune.report`` is issued with the
    train and eval statistics returned by ``run_experiment``.

    Args:
        config: Parameter values for this trial. Decoded with
            ``self.decode_values``; an ``"mlflow"`` entry, if present, is
            removed before substitution.
        checkpoint_dir: Forwarded to ``run_experiment`` as
            ``model_resume_path``. When truthy (and running on the Ray
            backend on a different node than this method) other
            ``checkpoint*`` directories of the trial are removed before
            syncing down.
        hyperopt_dict: Keyword arguments for ``run_experiment``. Mutated in
            place: ``config``, ``experiment_name``, ``output_directory`` and
            ``callbacks`` are overwritten, and ``set_distributed_kwargs`` is
            called on ``hyperopt_dict["backend"]`` for the Ray backend.
        decode_ctx: Passed through to ``self.decode_values``.
        features_eligible_for_shared_params: Passed through to
            ``substitute_parameters``.
        is_using_ray_backend: When True, training runs in a background thread
            and callbacks push progress into a ``RayQueue`` that this method
            drains.

    Raises:
        RuntimeError: If ``run_experiment`` did not return any statistics.
    """
    for gpu_id in ray.get_gpu_ids():
        # Previous trial may not have freed its memory yet, so wait to avoid OOM
        wait_for_gpu(gpu_id)

    # Some config values may be JSON encoded as strings, so decode them here
    config = self.decode_values(config, decode_ctx)

    # Remove mlflow injected config parameters: https://github.com/ludwig-ai/ludwig/issues/2288
    if "mlflow" in config:
        del config["mlflow"]

    trial_id = tune.get_trial_id()
    trial_dir = Path(tune.get_trial_dir())
    # IP of the node executing this method; callbacks compare against it to
    # detect that they are running on a different node.
    driver_trial_location = ray.util.get_node_ip_address()

    modified_config = substitute_parameters(
        copy.deepcopy(hyperopt_dict["config"]), config, features_eligible_for_shared_params
    )

    hyperopt_dict["config"] = modified_config
    # Fix: the key used to be "experiment_name " (trailing space), which left
    # the real "experiment_name" entry untouched and forwarded a stray keyword
    # to run_experiment instead of the trial-specific name.
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["output_directory"] = str(trial_dir)

    # Alias so that the nested callback class (which has its own ``self``)
    # can still reach this executor.
    tune_executor = self
    if is_using_ray_backend:
        ray_queue = RayQueue(actor_options={"num_cpus": 0})
    else:
        ray_queue = None

    # NOTE(review): ``checkpoint(progress_tracker, save_path)`` is called below
    # but is not defined inside this function as shown; it presumably comes
    # from an enclosing scope -- confirm.

    def report(progress_tracker):
        """Send the tracker's current metrics to Tune as an intermediate result."""
        # The progress tracker's metrics are nested dictionaries of TrainerMetrics: feature_name -> metric_name ->
        # List[TrainerMetric], with one entry per training checkpoint, according to steps_per_checkpoint.
        # We reduce the dictionary of TrainerMetrics to a simple list of floats for interfacing with Ray Tune.
        train_stats = {
            TRAINING: metric_utils.reduce_trainer_metrics_dict(progress_tracker.train_metrics),
            VALIDATION: metric_utils.reduce_trainer_metrics_dict(progress_tracker.validation_metrics),
            TEST: metric_utils.reduce_trainer_metrics_dict(progress_tracker.test_metrics),
        }

        metric_score = tune_executor.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            # Evaluation statistics are only reported in the final report.
            eval_stats="{}",
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )

    class RayTuneReportCallback(Callback):
        """Training callback that checkpoints and reports progress for this trial."""

        def __init__(self):
            super().__init__()
            # Step count at the most recent on_eval_end; used at teardown to
            # decide whether a final checkpoint is still needed.
            self.last_steps = 0

        def _get_remote_checkpoint_dir(self) -> Optional[Union[str, Tuple[str, str]]]:
            # sync client has to be recreated to avoid issues with serialization
            return tune_executor._get_remote_checkpoint_dir(trial_dir)

        def _checkpoint_progress(self, trainer, progress_tracker, save_path) -> None:
            """Checkpoints the progress tracker."""
            if is_using_ray_backend:
                save_path = Path(save_path)
                remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                if remote_checkpoint_dir is not None:
                    sync_client = tune_executor.sync_client
                    sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                    sync_client.wait_or_retry()
                # Checkpointing and reporting are done by the queue consumer
                # in the enclosing method, not here.
                ray_queue.put((progress_tracker, str(save_path)))
                return
            checkpoint(progress_tracker, save_path)

        def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
            if (
                is_using_ray_backend
                and checkpoint_dir
                and driver_trial_location != ray.util.get_node_ip_address()
            ):
                save_path = Path(save_path)

                # Remove checkpoint directories other than the ones in use.
                # NOTE(review): ``path`` is a Path; if ``checkpoint_dir`` is
                # passed as a str this membership test can never match it --
                # confirm the type callers provide.
                for path in trial_dir.glob("checkpoint*"):
                    if path not in (save_path.parent, checkpoint_dir):
                        shutil.rmtree(path, ignore_errors=True)

                remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                if remote_checkpoint_dir is not None:
                    sync_client = tune_executor.sync_client
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait_or_retry()

        def on_eval_end(self, trainer, progress_tracker, save_path):
            progress_tracker.tune_checkpoint_num += 1
            self.last_steps = progress_tracker.steps
            self._checkpoint_progress(trainer, progress_tracker, save_path)
            if not is_using_ray_backend:
                report(progress_tracker)

        def on_trainer_train_teardown(self, trainer, progress_tracker, save_path, is_coordinator):
            if is_coordinator and progress_tracker.steps > self.last_steps:
                # Note: Calling tune.report in both on_eval_end() and here can cause multiprocessing issues
                # for some ray samplers if no steps have happened since the last eval.
                self._checkpoint_progress(trainer, progress_tracker, save_path)
                if not is_using_ray_backend:
                    report(progress_tracker)

    callbacks = hyperopt_dict.get("callbacks") or []
    hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

    # set tune resources
    if is_using_ray_backend:
        resources = tune.get_trial_resources()
        # check if we are using at least 1 gpu per trial
        use_gpu = bool(self._gpu_resources_per_trial_non_none)
        # get the resources assigned to the current trial
        num_gpus = resources.required_resources.get("GPU", 0)
        num_cpus = resources.required_resources.get("CPU", 1) if num_gpus == 0 else 0

        hvd_kwargs = {
            "num_workers": int(num_gpus) if use_gpu else 1,
            "use_gpu": use_gpu,
            "resources_per_worker": {
                "CPU": num_cpus,
                "GPU": 1 if use_gpu else 0,
            },
        }
        hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

        logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

    # Filled by _run(); a list is used so the result can be handed back from
    # the worker thread.
    stats = []

    def _run():
        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            parameters=config,
        )
        stats.append((train_stats, eval_stats))

    if is_using_ray_backend:
        # We have to pull the results to the trial actor
        # from worker actors, as the Tune session is running
        # only on the trial actor
        thread = threading.Thread(target=_run)
        thread.daemon = True
        thread.start()

        # Fix: always bind the name. It used to be assigned only when
        # sync_config was set, while check_queue() reads it whenever
        # sync_client is set, which could raise NameError.
        remote_checkpoint_dir = None
        if self.sync_config is not None:
            remote_checkpoint_dir = self._get_remote_checkpoint_dir(trial_dir)

        def check_queue():
            """Drain queued progress, sync it down, then checkpoint and report each item."""
            qsize = ray_queue.qsize()
            if qsize:
                results = ray_queue.get_nowait_batch(qsize)
                # Same None guard on the remote dir as the callback methods use.
                if self.sync_client is not None and remote_checkpoint_dir is not None:
                    self.sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    self.sync_client.wait()
                for progress_tracker, save_path in results:
                    checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                    report(progress_tracker)

        while thread.is_alive():
            thread.join(timeout=0)
            check_queue()
            time.sleep(0.1)
        thread.join()
        # Pick up anything queued between the last poll and thread exit.
        check_queue()
    else:
        # remove threading overhead
        _run()

    # Empty means _run() never appended, i.e. run_experiment did not return.
    if not stats:
        raise RuntimeError("Experiment did not complete.")
    train_stats, eval_stats = stats.pop()

    metric_score = self.get_metric_score(train_stats)
    tune.report(
        parameters=json.dumps(config, cls=NumpyEncoder),
        metric_score=metric_score,
        training_stats=json.dumps(train_stats, cls=NumpyEncoder),
        eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
        trial_id=tune.get_trial_id(),
        trial_dir=tune.get_trial_dir(),
    )