def write_checkpoint(self, checkpoint: Dict):
    """Materialize a node-local checkpoint inside a Tune checkpoint directory.

    ``checkpoint`` is a dict that holds the IP of the node where the
    checkpoint files live (``NODE_IP_KEY``) and their path on that node
    (``CHECKPOINT_PATH_ON_NODE_KEY``). The files are moved (same node) or
    synced (different node) into the directory yielded by
    ``tune.checkpoint_dir``, the source is removed, and the preprocessor
    and the Tune checkpoint id file are written next to them.
    """
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(
            step=self._latest_checkpoint_id) as checkpoint_dir:
        origin_ip = checkpoint[NODE_IP_KEY]
        origin_dir = checkpoint[CHECKPOINT_PATH_ON_NODE_KEY]
        local_ip = get_node_ip_address()

        if origin_ip != local_ip:
            # The files live on another node: sync them over, then clean up
            # the remote copy.
            sync_dir_between_nodes(
                source_ip=origin_ip,
                source_path=origin_dir,
                target_ip=local_ip,
                target_path=checkpoint_dir,
                return_futures=False,
                max_size_bytes=None,
            )
            delete_on_node(node_ip=origin_ip, path=origin_dir)
        else:
            # Move contents of the source directory, but not the directory
            # itself. shutil.move is already recursive.
            for entry in Path(origin_dir).iterdir():
                shutil.move(str(entry.absolute()), checkpoint_dir)
            shutil.rmtree(origin_dir, ignore_errors=True)

        checkpoint_dir = Path(checkpoint_dir)
        save_preprocessor_to_dir(self.preprocessor, checkpoint_dir)

        # add tune checkpoint id
        id_file = checkpoint_dir.joinpath(TUNE_CHECKPOINT_ID)
        with open(id_file, "w") as f:
            f.write(str(self._latest_checkpoint_id))
def commit(self, path: Optional[Path] = None) -> None:
    """Move or sync the checkpoint files described by ``self.dir_or_data`` to ``path``.

    Nothing happens when the storage mode is ``CheckpointStorage.MEMORY``,
    when ``path`` is falsy, or when ``self.dir_or_data`` is not a dict.
    Otherwise the dict is read for the source node IP and source path, the
    files are transferred into ``path``, the source is deleted, and the
    preprocessor (popped from the dict) and the Tune checkpoint id file
    are written into ``path``.
    """
    # Guard clauses, evaluated in the same order as a single `or` chain.
    if self.storage_mode == CheckpointStorage.MEMORY:
        return
    if not path:
        return
    if not isinstance(self.dir_or_data, dict):
        return

    origin_ip = self.dir_or_data[NODE_IP_KEY]
    origin_dir = self.dir_or_data[CHECKPOINT_PATH_ON_NODE_KEY]
    local_ip = get_node_ip_address()

    if origin_ip != local_ip:
        # Checkpoint lives on a different node: sync, then remove remotely.
        sync_dir_between_nodes(
            source_ip=origin_ip,
            source_path=origin_dir,
            target_ip=local_ip,
            target_path=str(path),
            return_futures=False,
            max_size_bytes=None,
        )
        delete_on_node(node_ip=origin_ip, path=origin_dir)
    else:
        # Move contents of the source directory, but not the directory
        # itself. shutil.move is already recursive.
        for entry in Path(origin_dir).iterdir():
            shutil.move(str(entry.absolute()), str(path))
        shutil.rmtree(origin_dir, ignore_errors=True)

    preprocessor = self.dir_or_data.pop(PREPROCESSOR_KEY, None)
    save_preprocessor_to_dir(preprocessor, path)

    # add tune checkpoint id
    id_file = path.joinpath(TUNE_CHECKPOINT_ID)
    with open(id_file, "w") as f:
        f.write(str(self.id))
def _convert_directory_checkpoint_to_sync_if_needed(
        self, checkpoint: Checkpoint) -> Checkpoint:
    """Turn a directory checkpoint into a node-ip-and-path dict checkpoint.

    The resulting dict checkpoint is what gets used to sync the directory.
    Passing the directory checkpoint around directly would deepcopy and
    serialize it unnecessarily.

    If the Tune checkpoint id file is missing from the directory, the
    checkpoint is assumed to already be a sync checkpoint. It is then
    returned unchanged after verifying that it carries both the node IP
    and the path keys.

    Raises:
        ValueError: If the id file is missing and the checkpoint dict lacks
            the node IP key or the path key.
    """
    with checkpoint.as_directory() as checkpoint_path:
        # Load checkpoint from path.
        resolved_dir = Path(checkpoint_path).expanduser().absolute()

        if resolved_dir.joinpath(TUNE_CHECKPOINT_ID).exists():
            with open(resolved_dir.joinpath(TUNE_CHECKPOINT_ID), "r") as f:
                tune_checkpoint_id = int(f.read())

            sync_payload = {
                NODE_IP_KEY: get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY: str(resolved_dir),
                TUNE_CHECKPOINT_ID: tune_checkpoint_id,
            }
            return Checkpoint.from_dict(sync_payload)

        # If the ID file is missing, we assume that this is already
        # a sync checkpoint
        as_dict = checkpoint.to_dict()
        has_sync_keys = (NODE_IP_KEY in as_dict
                         and CHECKPOINT_PATH_ON_NODE_KEY in as_dict)
        if not has_sync_keys:
            raise ValueError(
                "Wrong checkpoint format. Ensure the checkpoint is a "
                "result of `HuggingFaceTrainer`.")
        return checkpoint
def deploy_ray_func(func, *args):  # pragma: no cover
    """
    Run a function on an axis partition inside a worker process.

    Parameters
    ----------
    func : callable
        Function to be executed on an axis partition.
    *args : iterable
        Additional arguments that need to be passed in ``func``.

    Returns
    -------
    tuple or list
        For a single ``pandas.DataFrame`` result: the tuple
        ``(result, number of rows, number of columns, node IP)``.
        For an iterable result: a flat list holding those four items for
        every element; the row and column counts are ``None`` unless all
        elements are ``pandas.DataFrame`` objects.

    Notes
    -----
    Ray functions are not detected by codecov (thus pragma: no cover).
    """
    output = func(*args)
    node_ip = get_node_ip_address()

    if isinstance(output, pandas.DataFrame):
        return output, len(output), len(output.columns), node_ip

    flattened = []
    if all(isinstance(part, pandas.DataFrame) for part in output):
        for part in output:
            flattened.extend((part, len(part), len(part.columns), node_ip))
    else:
        # Not every element is a DataFrame: no shape metadata is reported.
        for part in output:
            flattened.extend((part, None, None, node_ip))
    return flattened
def _process_events(self, timeout: Optional[float] = None):
    """Process one trial event: a failed trial if there is one, otherwise the next available trial.

    A failed trial reported by the executor is logged and routed to the
    failure handler. Otherwise the next available trial is fetched, which
    blocks for up to ``timeout``. It is then dispatched by state
    (restoring, saving, or a regular result), and any queued final
    decision for it is executed afterwards.

    Args:
        timeout: Forwarded to ``get_next_available_trial``.
    """
    with warn_if_slow("get_next_failed_trial"):
        stale_trial = self.trial_executor.get_next_failed_trial()

    if stale_trial:
        error_msg = (
            "{} (IP: {}) detected as stale. This is likely because the "
            "node was lost").format(stale_trial, stale_trial.node_ip)
        logger.info(error_msg)
        with warn_if_slow("process_failed_trial"):
            self._process_trial_failure(stale_trial, error_msg=error_msg)
        return

    # TODO(ujvl): Consider combining get_next_available_trial and
    #  fetch_result functionality so that we don't timeout on fetch.
    trial = self.trial_executor.get_next_available_trial(
        timeout=timeout)  # blocking
    if not trial:
        return

    if trial.is_restoring:
        with warn_if_slow("process_trial_restore"):
            self._process_trial_restore(trial)
        with warn_if_slow("callbacks.on_trial_restore"):
            self._callbacks.on_trial_restore(
                iteration=self._iteration,
                trials=self._trials,
                trial=trial)
    elif trial.is_saving:
        with warn_if_slow("process_trial_save") as save_profile:
            self._process_trial_save(trial)
        with warn_if_slow("callbacks.on_trial_save"):
            self._callbacks.on_trial_save(
                iteration=self._iteration,
                trials=self._trials,
                trial=trial)
        if save_profile.too_slow and trial.sync_on_checkpoint:
            # TODO(ujvl): Suggest using DurableTrainable once
            #  API has converged.
            msg = (
                "Consider turning off forced head-worker trial "
                "checkpoint syncs by setting sync_on_checkpoint=False"
                ". Note that this may result in faulty trial "
                "restoration if a failure occurs while the checkpoint "
                "is being synced from the worker to the head node.")

            # Warn (once) only when the trial's location differs from this
            # node's address.
            if (trial.location.hostname
                    and trial.location.hostname != get_node_ip_address()
                    and log_once("tune_head_worker_checkpoint")):
                logger.warning(msg)
    else:
        with warn_if_slow("process_trial"):
            self._process_trial(trial)

    # `self._queued_trial_decisions` now contains a final decision
    # based on all results
    if trial not in self._cached_trial_decisions:
        final_decision = self._queued_trial_decisions.pop(
            trial.trial_id, None)
        if final_decision:
            self._execute_action(trial, final_decision)
def apply_list_of_funcs(funcs, partition):  # pragma: no cover
    """
    Run every operation of a call queue on the partition in a worker process.

    Parameters
    ----------
    funcs : list
        A call queue of ``(func, args, kwargs)`` entries that needs to be
        executed on the partition.
    partition : pandas.DataFrame
        A pandas DataFrame the call queue needs to be executed on.

    Returns
    -------
    pandas.DataFrame
        The resulting pandas DataFrame.
    int
        The number of rows of the resulting pandas DataFrame.
    int
        The number of columns of the resulting pandas DataFrame.
    str
        The node IP address of the worker process.
    """

    def materialize(item):
        # Resolve object references, either bare or held directly inside a
        # tuple/list/dict, through `ray.get`; pass anything else through.
        if isinstance(item, ObjectIDType):
            return ray.get(item)
        if isinstance(item, (tuple, list)):
            if any(isinstance(elem, ObjectIDType) for elem in item):
                return ray.get(list(item))
            return item
        if isinstance(item, dict):
            if any(isinstance(val, ObjectIDType) for val in item.values()):
                resolved_values = ray.get(list(item.values()))
                return dict(zip(item.keys(), resolved_values))
        return item

    for operation, op_args, op_kwargs in funcs:
        operation = materialize(operation)
        op_args = materialize(op_args)
        op_kwargs = materialize(op_kwargs)
        try:
            partition = operation(partition, *op_args, **op_kwargs)
        # Sometimes Arrow forces us to make a copy of an object before we operate on it. We
        # don't want the error to propagate to the user, and we want to avoid copying unless
        # we absolutely have to.
        except ValueError:
            partition = operation(partition.copy(), *op_args, **op_kwargs)

    row_count = len(partition) if hasattr(partition, "__len__") else 0
    col_count = len(partition.columns) if hasattr(partition, "columns") else 0
    return partition, row_count, col_count, get_node_ip_address()
def on_save(self, args, state, control, **kwargs):
    """Report the most recent HuggingFace checkpoint directory to Ray Train.

    Looks up the last checkpoint under ``args.output_dir`` and, if one
    exists, calls ``train.save_checkpoint`` with this node's IP address and
    the absolute checkpoint path. The checkpoint files themselves are not
    passed along.

    Args:
        args: Training arguments; only ``args.output_dir`` is read.
        state: Unused.
        control: Unused.
        **kwargs: Unused.
    """
    # Save is called after evaluation.
    last_checkpoint = transformers.trainer.get_last_checkpoint(
        args.output_dir)

    # `get_last_checkpoint` may return None when no checkpoint directory
    # exists yet. This must be checked *before* wrapping in `Path`:
    # `Path(None)` raises TypeError, and a `Path` instance is always truthy,
    # so a check performed afterwards can never skip anything.
    if not last_checkpoint:
        return

    checkpoint_path = Path(last_checkpoint).absolute()
    train.save_checkpoint(
        **{
            NODE_IP_KEY: get_node_ip_address(),
            CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
        })
def __init__(self,
             local_dir: str,
             remote_dir: str,
             sync_client: Optional[SyncClient] = None):
    """Set up the syncer, defaulting to a ``DockerSyncClient``.

    Args:
        local_dir: Forwarded to the parent initializer.
        remote_dir: Forwarded to the parent initializer.
        sync_client: Client used for syncing. When falsy, a new
            ``DockerSyncClient`` is created. The client is configured with
            ``self._cluster_config_file`` in either case.
    """
    syncer_verbosity = env_integer("TUNE_SYNCER_VERBOSITY", 0)
    configure_logging(log_style="record", verbosity=syncer_verbosity)

    self.local_ip, self.worker_ip = get_node_ip_address(), None

    if not sync_client:
        sync_client = DockerSyncClient()
    sync_client.configure(self._cluster_config_file)

    # Note: the MRO lookup deliberately starts *after* `NodeSyncer`.
    super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
def __init__(self,
             local_dir: str,
             remote_dir: str,
             sync_client: Optional[SyncClient] = None):
    """Set up the syncer, defaulting to a ``KubernetesSyncClient``.

    Args:
        local_dir: Forwarded to the parent initializer.
        remote_dir: Forwarded to the parent initializer.
        sync_client: Client used for syncing. When falsy, a
            ``KubernetesSyncClient`` bound to the class-level ``_namespace``
            is created.

    Raises:
        ImportError: If the ``kubernetes`` module is not available.
    """
    if not kubernetes:
        raise ImportError(
            "kubernetes is not installed on this machine/container. "
            "Try: pip install kubernetes")

    self.local_ip = get_node_ip_address()
    self.local_node = self._get_kubernetes_node_by_ip(self.local_ip)
    self.worker_ip, self.worker_node = None, None

    if not sync_client:
        sync_client = KubernetesSyncClient(
            namespace=self.__class__._namespace)

    # Note: the MRO lookup deliberately starts *after* `NodeSyncer`.
    super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
def after_iteration(self, model, epoch, evals_log):
    """Kill the current process at a configured iteration for configured actor ranks.

    The process is killed (signal 9) only when all of these hold:
    ``self._allow_ips`` is empty or contains this node's IP, ``epoch``
    equals ``self._iteration``, the actor rank is in ``self._ranks``, and
    the shared state has not recorded a failure for ``self._id`` and
    accepts this one.

    Returns ``None`` in every non-killing path.
    """
    if self._allow_ips and get_node_ip_address() not in self._allow_ips:
        return

    if epoch != self._iteration:
        return

    rank = get_actor_rank()
    if rank not in self._ranks:
        return

    already_failed = ray.get(self._state.has_failed.remote(self._id))
    if already_failed:
        return

    claimed = ray.get(self._state.set_failed.remote(self._id))
    if not claimed:
        # Another rank is already about to fail
        return

    pid = os.getpid()
    print(f"Killing process: {pid} for actor rank {rank}")
    time.sleep(1)
    os.kill(pid, 9)
def apply_func(partition, func, *args, **kwargs):  # pragma: no cover
    """
    Run a function on the partition in a worker process.

    Parameters
    ----------
    partition : pandas.DataFrame
        A pandas DataFrame the function needs to be executed on.
    func : callable
        Function that needs to be executed on the partition.
    *args : iterable
        Additional positional arguments to be passed in `func`.
    **kwargs : dict
        Additional keyword arguments to be passed in `func`.

    Returns
    -------
    pandas.DataFrame
        The resulting pandas DataFrame.
    int
        The number of rows of the resulting pandas DataFrame
        (0 if the result has no ``__len__``).
    int
        The number of columns of the resulting pandas DataFrame
        (0 if the result has no ``columns`` attribute).
    str
        The node IP address of the worker process.
    """
    try:
        outcome = func(partition, *args, **kwargs)
    # Sometimes Arrow forces us to make a copy of an object before we operate on it. We
    # don't want the error to propagate to the user, and we want to avoid copying unless
    # we absolutely have to.
    except ValueError:
        outcome = func(partition.copy(), *args, **kwargs)

    row_count = 0
    if hasattr(outcome, "__len__"):
        row_count = len(outcome)

    col_count = 0
    if hasattr(outcome, "columns"):
        col_count = len(outcome.columns)

    return outcome, row_count, col_count, get_node_ip_address()
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers.

    Builds the trainer through the user-supplied ``_trainer_init_per_worker``
    callable (popped from ``config``), validates its arguments, swaps the
    HuggingFace reporting callbacks for ``TrainReportCallback``, and runs
    training, resuming from the session checkpoint if there is one.

    A checkpoint located on another node is synced into a temporary
    directory first. That directory is always removed afterwards, including
    when the sync or the training itself raises.

    Args:
        config: Training loop config. Must contain
            ``"_trainer_init_per_worker"``; the remaining entries are passed
            to that callable as keyword arguments.

    Raises:
        ValueError: If any of ``evaluation_strategy``, ``save_strategy`` or
            ``logging_strategy`` is ``"steps"``.
    """
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to setup DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local enviroment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # ensure no HF logging callbacks are added
    # aside from doubling functionality with our callbacks,
    # the Wandb callbacks causes training to freeze
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    try:
        if checkpoint:
            assert isinstance(checkpoint, Checkpoint)
            checkpoint_dict = checkpoint.to_dict()
            source_ip = checkpoint_dict[NODE_IP_KEY]
            source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
            target_ip = get_node_ip_address()
            if source_ip == target_ip:
                # Checkpoint already lives on this node: use it in place.
                checkpoint_path = source_path
            else:
                checkpoint_path = tempfile.mkdtemp(
                    suffix=Path(trainer.args.output_dir).name)
                # Flag the directory for cleanup right after creating it, so
                # a failing sync below does not leave it behind.
                remove_checkpoint_path = True
                sync_dir_between_nodes(
                    source_ip=source_ip,
                    source_path=source_path,
                    target_ip=target_ip,
                    target_path=checkpoint_path,
                    return_futures=False,
                    max_size_bytes=None,
                )
        trainer.train(resume_from_checkpoint=checkpoint_path)
    finally:
        # Clean up the temporary copy whether or not training succeeded.
        if remove_checkpoint_path:
            shutil.rmtree(checkpoint_path, ignore_errors=True)
def before_training(self, model):
    """Publish this node's IP address via ``put_queue`` and return ``model`` unchanged."""
    put_queue(get_node_ip_address())
    return model
def __init__(self, local_dir, remote_dir, sync_client):
    """Record this node's IP address and initialize the parent syncer.

    Args:
        local_dir: Forwarded to the parent initializer.
        remote_dir: Forwarded to the parent initializer.
        sync_client: Forwarded to the parent initializer.
    """
    # The worker IP starts out unset.
    self.local_ip, self.worker_ip = get_node_ip_address(), None
    super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
def _train(
    dtrain,
    params: Dict,
    *args,
    num_actors=None,
    evals=(),
    **kwargs,
):
    """
    Run distributed training of XGBoost model on Ray engine.

    During work it evenly distributes `dtrain` between workers according
    to IP addresses partitions (in case of not even distribution of `dtrain`
    by nodes, part of partitions will be re-distributed between nodes),
    runs xgb.train on each worker for subset of `dtrain` and reduces training results
    of each worker using Rabit Context.

    Parameters
    ----------
    dtrain : modin.experimental.DMatrix
        Data to be trained against.
    params : dict
        Booster params.
    *args : iterable
        Other parameters for `xgboost.train`.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
        Any sequence of pairs is accepted (including a tuple); the passed
        object is never modified.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    dict
        A dictionary with trained booster and dict of
        evaluation results
        as {"booster": xgboost.Booster, "history": dict}.
    """
    s = time.time()

    X_row_parts, y_row_parts = dtrain
    dmatrix_kwargs = dtrain.get_dmatrix_params()

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(num_actors)

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    if evals:
        # Cap the number of actors by the smallest evaluation set so that
        # every actor can receive at least one partition of each set.
        min_num_parts = num_actors
        for (eval_X, _), eval_method in evals:
            if len(eval_X) < min_num_parts:
                min_num_parts = len(eval_X)
                method_name = eval_method

        if num_actors != min_num_parts:
            num_actors = min_num_parts
            warnings.warn(
                f"`num_actors` is set to {num_actors}, because `evals` data with name `{method_name}` has only {num_actors} partition(s)."
            )

    actors = create_actors(num_actors)

    # Separate the training set from the genuine evaluation sets. The result
    # goes into a new list rather than editing `evals` in place, because the
    # default is a tuple (no `.remove`) and a caller's list must not be
    # mutated.
    add_as_eval_method = None
    separate_evals = []
    for eval_data, method in evals:
        if eval_data is dtrain:
            add_as_eval_method = method
        else:
            separate_evals.append((eval_data, method))

    for (eval_X, eval_y), eval_method in separate_evals:
        # Split data across workers
        _split_data_across_actors(
            actors,
            lambda actor, *X_y: actor.add_eval_data.remote(
                *X_y, eval_method=eval_method, **dmatrix_kwargs),
            eval_X,
            eval_y,
        )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method, **dmatrix_kwargs),
        X_row_parts,
        y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = ray.get(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result