class PyTorchRayEstimator(OrcaRayEstimator):
    def __init__(self,
                 *,
                 model_creator,
                 optimizer_creator,
                 loss_creator=None,
                 metrics=None,
                 scheduler_creator=None,
                 training_operator_cls=TrainingOperator,
                 initialization_hook=None,
                 config=None,
                 scheduler_step_freq="batch",
                 use_tqdm=False,
                 backend="torch_distributed",
                 workers_per_node=1):
        if config is not None and "batch_size" in config:
            raise Exception(
                "Please do not specify batch_size in config. Input batch_size in the"
                " fit/evaluate/predict function of the estimator instead.")
        from zoo.orca.learn.pytorch.pytorch_ray_estimator import PyTorchRayEstimator
        self.estimator = PyTorchRayEstimator(
            model_creator=model_creator,
            optimizer_creator=optimizer_creator,
            loss_creator=loss_creator,
            metrics=metrics,
            scheduler_creator=scheduler_creator,
            training_operator_cls=training_operator_cls,
            initialization_hook=initialization_hook,
            config=config,
            scheduler_step_freq=scheduler_step_freq,
            use_tqdm=use_tqdm,
            backend=backend,
            workers_per_node=workers_per_node)

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            profile=False,
            reduce_results=True,
            info=None,
            feature_cols=None,
            label_cols=None):
        """
        Trains a PyTorch model given training data for several epochs.
        Calls `TrainingOperator.train_epoch()` on N parallel workers simultaneously
        underneath the hood.

        :param data: An instance of SparkXShards, a Spark DataFrame or a function that
               takes config and batch_size as arguments and returns a PyTorch DataLoader
               for training.
        :param epochs: The number of epochs to train the model. Default is 1.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
               The total batch size would be workers_per_node*num_nodes.
               If your training data is a function, you can set batch_size to be the input
               batch_size of the function for the PyTorch DataLoader.
        :param profile: Boolean. Whether to return time stats for the training procedure.
               Default is False.
        :param reduce_results: Boolean. Whether to average all metrics across all workers
               into one dict. If a metric is a non-numerical value (or nested dictionaries),
               one value will be randomly selected among the workers. If False, returns a
               list of dicts for all workers. Default is True.
        :param info: An optional dictionary that can be passed to the TrainingOperator
               for train_epoch and train_batch.
        :param feature_cols: feature column names if data is a Spark DataFrame.
        :param label_cols: label column names if data is a Spark DataFrame.

        :return: A list of dictionaries of metrics for every training epoch. If
                reduce_results is False, this will return a nested list of metric
                dictionaries whose length will be equal to the total number of workers.
                You can also provide custom metrics by passing in a custom
                training_operator_cls when creating the Estimator.
        """
        return self.estimator.train(data=data, epochs=epochs, batch_size=batch_size,
                                    profile=profile, reduce_results=reduce_results,
                                    info=info, feature_cols=feature_cols,
                                    label_cols=label_cols)

    def predict(self,
                data,
                batch_size=32,
                feature_cols=None,
                profile=False):
        """
        Uses this PyTorch model to make predictions on the data.

        :param data: An instance of SparkXShards or a Spark DataFrame.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
        :param profile: Boolean. Whether to return time stats for the prediction procedure.
               Default is False.
        :param feature_cols: feature column names if data is a Spark DataFrame.
        :return: A SparkXShards that contains the predictions with key "prediction"
                in each shard.
        """
        return self.estimator.predict(data, batch_size=batch_size,
                                      feature_cols=feature_cols, profile=profile)

    def evaluate(self,
                 data,
                 batch_size=32,
                 num_steps=None,
                 profile=False,
                 info=None,
                 feature_cols=None,
                 label_cols=None):
        """
        Evaluates a PyTorch model given validation data.
        Note that only accuracy for classification with zero-based label is supported
        by default. You can override validate_batch in TrainingOperator for other metrics.
        Calls `TrainingOperator.validate()` on N parallel workers simultaneously
        underneath the hood.

        :param data: An instance of SparkXShards, a Spark DataFrame or a function that
               takes config and batch_size as arguments and returns a PyTorch DataLoader
               for validation.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
               The total batch size would be workers_per_node*num_nodes.
               If your validation data is a function, you can set batch_size to be the
               input batch_size of the function for the PyTorch DataLoader.
        :param num_steps: The number of batches to compute the validation results on.
               This corresponds to the number of times `TrainingOperator.validate_batch`
               is called.
        :param profile: Boolean. Whether to return time stats for the evaluation procedure.
               Default is False.
        :param info: An optional dictionary that can be passed to the TrainingOperator
               for validate.
        :param feature_cols: feature column names if data is a Spark DataFrame.
        :param label_cols: label column names if data is a Spark DataFrame.

        :return: A dictionary of metrics for the given data, including validation accuracy
                and loss. You can also provide custom metrics by passing in a custom
                training_operator_cls when creating the Estimator.
        """
        return self.estimator.validate(data=data, batch_size=batch_size,
                                       num_steps=num_steps, profile=profile, info=info,
                                       feature_cols=feature_cols, label_cols=label_cols)

    def get_model(self):
        """
        Returns the learned PyTorch model.

        :return: The learned PyTorch model.
        """
        return self.estimator.get_model()

    def save(self, model_path):
        """
        Saves the Estimator state (including model and optimizer) to the provided
        model_path.

        :param model_path: (str) Path to save the model.
        """
        return self.estimator.save(model_path)

    def load(self, model_path):
        """
        Loads the Estimator state (including model and optimizer) from the provided
        model_path.

        :param model_path: (str) Path to the existing model.
        """
        return self.estimator.load(model_path)

    def shutdown(self, force=False):
        """
        Shuts down workers and releases resources.
        """
        return self.estimator.shutdown(force=force)
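# A minimal usage sketch for PyTorchRayEstimator, kept as a comment so it does not
# run on import. The model, optimizer, and random data below are hypothetical
# illustrations, not part of this module; note that the data-creator function
# receives both config and batch_size, matching the fit() docstring above.
#
#     import torch
#     import torch.nn as nn
#     from torch.utils.data import DataLoader, TensorDataset
#
#     def model_creator(config):
#         return nn.Sequential(nn.Linear(1, 8), nn.ReLU(), nn.Linear(8, 1))
#
#     def optimizer_creator(model, config):
#         return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))
#
#     def train_loader_creator(config, batch_size):
#         x = torch.randn(256, 1)
#         y = 2 * x + 1
#         return DataLoader(TensorDataset(x, y), batch_size=batch_size, shuffle=True)
#
#     est = PyTorchRayEstimator(model_creator=model_creator,
#                               optimizer_creator=optimizer_creator,
#                               loss_creator=nn.MSELoss,
#                               config={"lr": 1e-2},
#                               workers_per_node=2)
#     stats = est.fit(train_loader_creator, epochs=2, batch_size=32)
#     est.shutdown()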
def from_torch(*,
               model,
               optimizer,
               loss=None,
               metrics=None,
               scheduler_creator=None,
               training_operator_cls=TrainingOperator,
               initialization_hook=None,
               config=None,
               scheduler_step_freq="batch",
               use_tqdm=False,
               workers_per_node=1,
               model_dir=None,
               backend="bigdl"):
    """
    Create an Estimator for PyTorch.

    :param model: PyTorch model or model creator function if backend="bigdl", PyTorch
           model creator function if backend="horovod" or "torch_distributed".
    :param optimizer: Orca/PyTorch optimizer or optimizer creator function if
           backend="bigdl", PyTorch optimizer creator function if backend="horovod"
           or "torch_distributed".
    :param loss: PyTorch loss or loss creator function if backend="bigdl", PyTorch loss
           creator function if backend="horovod" or "torch_distributed".
    :param metrics: Orca validation methods for evaluate.
    :param scheduler_creator: parameter for `horovod` and `torch_distributed` backends.
           A learning rate scheduler wrapping the optimizer. You will need to set
           ``scheduler_step_freq="epoch"`` for the scheduler to be incremented correctly.
    :param config: parameter config dict to create model, optimizer, loss and data.
    :param scheduler_step_freq: parameter for `horovod` and `torch_distributed` backends.
           "batch", "epoch" or None. This will determine when ``scheduler.step`` is
           called. If "batch", ``step`` will be called after every optimizer step.
           If "epoch", ``step`` will be called after one pass of the DataLoader.
           If a scheduler is passed in, this value is expected to not be None.
    :param use_tqdm: parameter for `horovod` and `torch_distributed` backends.
           You can monitor training progress if use_tqdm=True.
    :param workers_per_node: parameter for `horovod` and `torch_distributed` backends.
           worker number on each node. default: 1.
    :param model_dir: parameter for `bigdl` backend. The path to save the model. During
           training, if checkpoint_trigger is defined and triggered, the model will be
           saved to model_dir.
    :param backend: You can choose "horovod", "torch_distributed" or "bigdl" as backend.
           Default: `bigdl`.

    :return: an Estimator object.
    """
    if backend in {"horovod", "torch_distributed"}:
        return PyTorchRayEstimator(model_creator=model,
                                   optimizer_creator=optimizer,
                                   loss_creator=loss,
                                   metrics=metrics,
                                   scheduler_creator=scheduler_creator,
                                   training_operator_cls=training_operator_cls,
                                   initialization_hook=initialization_hook,
                                   config=config,
                                   scheduler_step_freq=scheduler_step_freq,
                                   use_tqdm=use_tqdm,
                                   workers_per_node=workers_per_node,
                                   backend=backend)
    elif backend == "bigdl":
        return PyTorchSparkEstimator(model=model,
                                     loss=loss,
                                     optimizer=optimizer,
                                     config=config,
                                     metrics=metrics,
                                     model_dir=model_dir,
                                     bigdl_type="float")
    else:
        raise ValueError(
            "Only horovod, torch_distributed and bigdl backends are supported"
            f" for now, got backend: {backend}")
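# A hedged sketch of the two ways to call from_torch, kept as a comment so it does
# not run on import. With backend="bigdl", concrete model/optimizer/loss instances
# may be passed; with the Ray-based backends, creator functions are expected, as
# documented above. The model and learning rate here are hypothetical.
#
#     import torch
#     import torch.nn as nn
#
#     # bigdl backend: pass instances directly.
#     model = nn.Linear(1, 1)
#     est = from_torch(model=model,
#                      optimizer=torch.optim.SGD(model.parameters(), lr=1e-2),
#                      loss=nn.MSELoss(),
#                      backend="bigdl")
#
#     # torch_distributed backend: pass creator functions instead.
#     est = from_torch(model=lambda config: nn.Linear(1, 1),
#                      optimizer=lambda model, config: torch.optim.SGD(
#                          model.parameters(), lr=config.get("lr", 1e-2)),
#                      loss=nn.MSELoss,
#                      config={"lr": 1e-2},
#                      backend="torch_distributed")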
class PyTorchRayEstimatorWrapper(Estimator):
    def __init__(self,
                 *,
                 model_creator,
                 optimizer_creator,
                 loss_creator=None,
                 scheduler_creator=None,
                 training_operator_cls=TrainingOperator,
                 initialization_hook=None,
                 config=None,
                 scheduler_step_freq="batch",
                 use_tqdm=False,
                 backend="torch_distributed",
                 workers_per_node=1):
        from zoo.orca.learn.pytorch.pytorch_ray_estimator import PyTorchRayEstimator
        self.estimator = PyTorchRayEstimator(model_creator=model_creator,
                                             optimizer_creator=optimizer_creator,
                                             loss_creator=loss_creator,
                                             scheduler_creator=scheduler_creator,
                                             training_operator_cls=training_operator_cls,
                                             initialization_hook=initialization_hook,
                                             config=config,
                                             scheduler_step_freq=scheduler_step_freq,
                                             use_tqdm=use_tqdm,
                                             backend=backend,
                                             workers_per_node=workers_per_node)

    def fit(self, data, epochs=1, batch_size=32, profile=False,
            reduce_results=True, info=None):
        """
        Trains a PyTorch model given training data for several epochs.
        Calls `TrainingOperator.train_epoch()` on N parallel workers simultaneously
        underneath the hood.

        :param data: An instance of SparkXShards or a function that takes config as
               argument and returns a PyTorch DataLoader for training.
        :param epochs: The number of epochs to train the model. Default is 1.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
               The total batch size would be workers_per_node*num_nodes.
               If your training data is a function, you can set batch_size to be
               config["batch_size"] for the PyTorch DataLoader.
        :param profile: Boolean. Whether to return time stats for the training procedure.
               Default is False.
        :param reduce_results: Boolean. Whether to average all metrics across all workers
               into one dict. If a metric is a non-numerical value (or nested dictionaries),
               one value will be randomly selected among the workers. If False, returns a
               list of dicts for all workers. Default is True.
        :param info: An optional dictionary that can be passed to the TrainingOperator
               for train_epoch and train_batch.

        :return: A list of dictionaries of metrics for every training epoch. If
                reduce_results is False, this will return a nested list of metric
                dictionaries whose length will be equal to the total number of workers.
                You can also provide custom metrics by passing in a custom
                training_operator_cls when creating the Estimator.
        """
        return self.estimator.train(data=data, epochs=epochs, batch_size=batch_size,
                                    profile=profile, reduce_results=reduce_results,
                                    info=info)

    def predict(self, data, **kwargs):
        pass

    def evaluate(self, data, batch_size=32, num_steps=None, profile=False, info=None):
        """
        Evaluates a PyTorch model given validation data.
        Note that only accuracy for classification with zero-based label is supported
        by default. You can override validate_batch in TrainingOperator for other metrics.
        Calls `TrainingOperator.validate()` on N parallel workers simultaneously
        underneath the hood.

        :param data: An instance of SparkXShards or a function that takes config as
               argument and returns a PyTorch DataLoader for validation.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
               The total batch size would be workers_per_node*num_nodes.
               If your validation data is a function, you can set batch_size to be
               config["batch_size"] for the PyTorch DataLoader.
        :param num_steps: The number of batches to compute the validation results on.
               This corresponds to the number of times `TrainingOperator.validate_batch`
               is called.
        :param profile: Boolean. Whether to return time stats for the evaluation procedure.
               Default is False.
        :param info: An optional dictionary that can be passed to the TrainingOperator
               for validate.

        :return: A dictionary of metrics for the given data, including validation accuracy
                and loss. You can also provide custom metrics by passing in a custom
                training_operator_cls when creating the Estimator.
        """
        return self.estimator.validate(data=data, batch_size=batch_size,
                                       num_steps=num_steps, profile=profile, info=info)

    def get_model(self):
        """Returns the learned model(s)."""
        return self.estimator.get_model()

    def save(self, checkpoint):
        """
        Saves the Estimator state to the provided checkpoint path.

        :param checkpoint: (str) Path to target checkpoint file.
        """
        return self.estimator.save(checkpoint=checkpoint)

    def load(self, checkpoint):
        """
        Loads the Estimator and all workers from the provided checkpoint.

        :param checkpoint: (str) Path to target checkpoint file.
        """
        return self.estimator.load(checkpoint=checkpoint)

    def shutdown(self, force=False):
        """Shuts down workers and releases resources."""
        return self.estimator.shutdown(force=force)
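# A minimal sketch of this wrapper's data-creator contract, kept as a comment so it
# does not run on import (hypothetical helpers). Unlike PyTorchRayEstimator above,
# whose creators receive batch_size as a separate argument, the wrapper's creator
# takes config only and is documented to read config["batch_size"], which fit() is
# assumed to inject from its batch_size argument.
#
#     import torch
#     import torch.nn as nn
#     from torch.utils.data import DataLoader, TensorDataset
#
#     def train_loader_creator(config):
#         x = torch.randn(256, 1)
#         y = 2 * x + 1
#         return DataLoader(TensorDataset(x, y),
#                           batch_size=config["batch_size"], shuffle=True)
#
#     est = PyTorchRayEstimatorWrapper(
#         model_creator=lambda config: nn.Linear(1, 1),
#         optimizer_creator=lambda model, config: torch.optim.SGD(
#             model.parameters(), lr=1e-2),
#         loss_creator=nn.MSELoss)
#     stats = est.fit(train_loader_creator, epochs=1, batch_size=32)
#     est.shutdown()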