Code Example #1
class PyTorchRayEstimator(OrcaRayEstimator):
    def __init__(self,
                 *,
                 model_creator,
                 optimizer_creator,
                 loss_creator=None,
                 metrics=None,
                 scheduler_creator=None,
                 training_operator_cls=TrainingOperator,
                 initialization_hook=None,
                 config=None,
                 scheduler_step_freq="batch",
                 use_tqdm=False,
                 backend="torch_distributed",
                 workers_per_node=1):
        if config is not None and "batch_size" in config:
            raise Exception(
                "Please do not specify batch_size in config. Input batch_size in the"
                " fit/evaluate/predict function of the estimator instead.")

        from zoo.orca.learn.pytorch.pytorch_ray_estimator import PyTorchRayEstimator
        self.estimator = PyTorchRayEstimator(
            model_creator=model_creator,
            optimizer_creator=optimizer_creator,
            loss_creator=loss_creator,
            metrics=metrics,
            scheduler_creator=scheduler_creator,
            training_operator_cls=training_operator_cls,
            initialization_hook=initialization_hook,
            config=config,
            scheduler_step_freq=scheduler_step_freq,
            use_tqdm=use_tqdm,
            backend=backend,
            workers_per_node=workers_per_node)

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            profile=False,
            reduce_results=True,
            info=None,
            feature_cols=None,
            label_cols=None):
        """
        Trains a PyTorch model given training data for several epochs.

        Calls `TrainingOperator.train_epoch()` on N parallel workers simultaneously
        under the hood.

        :param data: An instance of SparkXShards, a Spark DataFrame or a function that
               takes config and batch_size as argument and returns a PyTorch DataLoader for
               training.
        :param epochs: The number of epochs to train the model. Default is 1.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
               The total batch size would be batch_size * workers_per_node * num_nodes.
               If your training data is a creator function, this value is passed as the
               batch_size argument of that function when building the PyTorch DataLoader.
        :param profile: Boolean. Whether to return time stats for the training procedure.
               Default is False.
        :param reduce_results: Boolean. Whether to average all metrics across all workers into
               one dict. If a metric is a non-numerical value (or nested dictionaries), one value
               will be randomly selected among the workers. If False, returns a list of dicts for
               all workers.
               Default is True.
        :param info: An optional dictionary that can be passed to the TrainingOperator for
               train_epoch and train_batch.
        :param feature_cols: Feature column names if data is a Spark DataFrame.
        :param label_cols: Label column names if data is a Spark DataFrame.

        :return: A list of dictionaries of metrics, one for each training epoch. If reduce_results
                is False, this will return a nested list of metric dictionaries whose length will
                be equal to the total number of workers.
                You can also provide custom metrics by passing in a custom training_operator_cls
                when creating the Estimator.
        """
        return self.estimator.train(data=data,
                                    epochs=epochs,
                                    batch_size=batch_size,
                                    profile=profile,
                                    reduce_results=reduce_results,
                                    info=info,
                                    feature_cols=feature_cols,
                                    label_cols=label_cols)

    def predict(self, data, batch_size=32, feature_cols=None, profile=False):
        """
        Uses this PyTorch model to make predictions on the data.

        :param data: An instance of SparkXShards or a Spark DataFrame
        :param batch_size: The number of samples per batch for each worker. Default is 32.
        :param profile: Boolean. Whether to return time stats for the prediction procedure.
               Default is False.
        :param feature_cols: feature column names if data is a Spark DataFrame.
        :return: A SparkXShards that contains the predictions with key "prediction" in each shard.
        """
        return self.estimator.predict(data,
                                      batch_size=batch_size,
                                      feature_cols=feature_cols,
                                      profile=profile)

    def evaluate(self,
                 data,
                 batch_size=32,
                 num_steps=None,
                 profile=False,
                 info=None,
                 feature_cols=None,
                 label_cols=None):
        """
        Evaluates a PyTorch model given validation data.
        Note that only accuracy for classification with zero-based label is supported by
        default. You can override validate_batch in TrainingOperator for other metrics.

        Calls `TrainingOperator.validate()` on N parallel workers simultaneously
        under the hood.

        :param data: An instance of SparkXShards, a Spark DataFrame or a function that
               takes config and batch_size as argument and returns a PyTorch DataLoader for
               validation.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
               The total batch size would be batch_size * workers_per_node * num_nodes.
               If your validation data is a creator function, this value is passed as the
               batch_size argument of that function when building the PyTorch DataLoader.
        :param num_steps: The number of batches to compute the validation results on. This
               corresponds to the number of times `TrainingOperator.validate_batch` is called.
        :param profile: Boolean. Whether to return time stats for the evaluation procedure.
               Default is False.
        :param info: An optional dictionary that can be passed to the TrainingOperator
               for validate.
        :param feature_cols: Feature column names if data is a Spark DataFrame.
        :param label_cols: Label column names if data is a Spark DataFrame.

        :return: A dictionary of metrics for the given data, including validation accuracy and loss.
                You can also provide custom metrics by passing in a custom training_operator_cls
                when creating the Estimator.
        """
        return self.estimator.validate(data=data,
                                       batch_size=batch_size,
                                       num_steps=num_steps,
                                       profile=profile,
                                       info=info,
                                       feature_cols=feature_cols,
                                       label_cols=label_cols)

    def get_model(self):
        """
        Returns the learned PyTorch model.

        :return: The learned PyTorch model.
        """
        return self.estimator.get_model()

    def save(self, model_path):
        """
        Saves the Estimator state (including model and optimizer) to the provided model_path.

        :param model_path: (str) Path to save the model.
        :return:
        """
        return self.estimator.save(model_path)

    def load(self, model_path):
        """
        Loads the Estimator state (including model and optimizer) from the provided model_path.

        :param model_path: (str) Path to the existing model.
        """
        return self.estimator.load(model_path)

    def shutdown(self, force=False):
        """
        Shuts down workers and releases resources.

        :return:
        """
        return self.estimator.shutdown(force=force)
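
A minimal usage sketch of the wrapper above. The model, dataset, and hyperparameters are illustrative assumptions rather than part of the source; the creator signatures (model_creator(config), optimizer_creator(model, config), and a data function receiving config and batch_size) follow the docstrings in the snippet, and passing a torch loss class as loss_creator is also an assumption.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def model_creator(config):
    # Illustrative two-layer regression model (not from the source).
    return nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 1))

def optimizer_creator(model, config):
    # Assumed signature: receives the built model and the config dict.
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))

def train_loader_creator(config, batch_size):
    # Data creator per the fit() docstring: takes config and batch_size and
    # returns a PyTorch DataLoader. Random data for illustration only.
    x = torch.randn(1024, 10)
    y = torch.randn(1024, 1)
    return DataLoader(TensorDataset(x, y), batch_size=batch_size, shuffle=True)

est = PyTorchRayEstimator(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss,
    config={"lr": 1e-2},
    workers_per_node=2)

train_stats = est.fit(data=train_loader_creator, epochs=2, batch_size=64)
trained_model = est.get_model()
est.shutdown()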
Code Example #2
    def from_torch(*,
                   model,
                   optimizer,
                   loss=None,
                   metrics=None,
                   scheduler_creator=None,
                   training_operator_cls=TrainingOperator,
                   initialization_hook=None,
                   config=None,
                   scheduler_step_freq="batch",
                   use_tqdm=False,
                   workers_per_node=1,
                   model_dir=None,
                   backend="bigdl"):
        """
        Creates an Estimator for PyTorch.

        :param model: PyTorch model or model creator function if backend="bigdl"; PyTorch
               model creator function if backend="horovod" or "torch_distributed".
        :param optimizer: Orca/PyTorch optimizer or optimizer creator function if
               backend="bigdl"; PyTorch optimizer creator function if backend="horovod"
               or "torch_distributed".
        :param loss: PyTorch loss or loss creator function if backend="bigdl"; PyTorch loss
               creator function if backend="horovod" or "torch_distributed".
        :param metrics: Orca validation methods for evaluate.
        :param scheduler_creator: Parameter for `horovod` and `torch_distributed` backends. A
               learning rate scheduler wrapping the optimizer. You will need to set
               ``scheduler_step_freq="epoch"`` for the scheduler to be incremented correctly.
        :param config: Parameter config dict used to create the model, optimizer, loss and data.
        :param scheduler_step_freq: Parameter for `horovod` and `torch_distributed` backends.
               "batch", "epoch" or None. This will determine when ``scheduler.step`` is called. If
               "batch", ``step`` will be called after every optimizer step. If "epoch", ``step``
               will be called after one pass of the DataLoader. If a scheduler is passed in, this
               value is expected to not be None.
        :param use_tqdm: Parameter for `horovod` and `torch_distributed` backends. You can monitor
               training progress if use_tqdm=True.
        :param workers_per_node: Parameter for `horovod` and `torch_distributed` backends. The
               number of workers on each node. Default: 1.
        :param model_dir: Parameter for the `bigdl` backend. The path to save the model. During
               training, if checkpoint_trigger is defined and triggered, the model will be saved to
               model_dir.
        :param backend: You can choose "horovod", "torch_distributed" or "bigdl" as the backend.
               Default: `bigdl`.
        :return: an Estimator object.
        """
        if backend in {"horovod", "torch_distributed"}:
            return PyTorchRayEstimator(
                model_creator=model,
                optimizer_creator=optimizer,
                loss_creator=loss,
                metrics=metrics,
                scheduler_creator=scheduler_creator,
                training_operator_cls=training_operator_cls,
                initialization_hook=initialization_hook,
                config=config,
                scheduler_step_freq=scheduler_step_freq,
                use_tqdm=use_tqdm,
                workers_per_node=workers_per_node,
                backend=backend)
        elif backend == "bigdl":
            return PyTorchSparkEstimator(model=model,
                                         loss=loss,
                                         optimizer=optimizer,
                                         config=config,
                                         metrics=metrics,
                                         model_dir=model_dir,
                                         bigdl_type="float")
        else:
            raise ValueError(
                "Only horovod, torch_distributed and bigdl backends are supported"
                f" for now, got backend: {backend}")
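
A short sketch of how this factory might be called for the two families of backends. The creator functions are the illustrative ones sketched after Code Example #1, the plain model/optimizer objects below are likewise assumptions, and from_torch is assumed to be exposed as a static factory on the Orca Estimator class.

import torch
import torch.nn as nn

# Ray-based backends ("horovod" / "torch_distributed") expect creator functions.
ray_est = Estimator.from_torch(
    model=model_creator,            # illustrative creator functions, as sketched earlier
    optimizer=optimizer_creator,
    loss=nn.MSELoss,
    backend="torch_distributed",
    workers_per_node=2)

# The default "bigdl" backend, per the docstring above, can also take plain objects.
net = nn.Linear(10, 1)
bigdl_est = Estimator.from_torch(
    model=net,
    optimizer=torch.optim.SGD(net.parameters(), lr=0.01),
    loss=nn.MSELoss(),
    backend="bigdl")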
Code Example #3
File: estimator.py  Project: DingHe/analytics-zoo
class PyTorchRayEstimatorWrapper(Estimator):
    def __init__(self,
                 *,
                 model_creator,
                 optimizer_creator,
                 loss_creator=None,
                 scheduler_creator=None,
                 training_operator_cls=TrainingOperator,
                 initialization_hook=None,
                 config=None,
                 scheduler_step_freq="batch",
                 use_tqdm=False,
                 backend="torch_distributed",
                 workers_per_node=1):
        from zoo.orca.learn.pytorch.pytorch_ray_estimator import PyTorchRayEstimator
        self.estimator = PyTorchRayEstimator(model_creator=model_creator,
                                             optimizer_creator=optimizer_creator,
                                             loss_creator=loss_creator,
                                             scheduler_creator=scheduler_creator,
                                             training_operator_cls=training_operator_cls,
                                             initialization_hook=initialization_hook,
                                             config=config,
                                             scheduler_step_freq=scheduler_step_freq,
                                             use_tqdm=use_tqdm,
                                             backend=backend,
                                             workers_per_node=workers_per_node)

    def fit(self, data, epochs=1, batch_size=32, profile=False, reduce_results=True, info=None):
        """
        Trains a PyTorch model given training data for several epochs.

        Calls `TrainingOperator.train_epoch()` on N parallel workers simultaneously
        under the hood.

        :param data: An instance of SparkXShards or a function that takes config as
        argument and returns a PyTorch DataLoader for training.
        :param epochs: The number of epochs to train the model. Default is 1.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
        The total batch size would be batch_size * workers_per_node * num_nodes.
        If your training data is a function, you can set batch_size to be config["batch_size"]
        for the PyTorch DataLoader.
        :param profile: Boolean. Whether to return time stats for the training procedure.
        Default is False.
        :param reduce_results: Boolean. Whether to average all metrics across all workers into
        one dict. If a metric is a non-numerical value (or nested dictionaries), one value will
        be randomly selected among the workers. If False, returns a list of dicts for all workers.
        Default is True.
        :param info: An optional dictionary that can be passed to the TrainingOperator for
        train_epoch and train_batch.

        :return: A list of dictionaries of metrics, one for each training epoch. If reduce_results
        is False, this will return a nested list of metric dictionaries whose length will be equal
        to the total number of workers.
        You can also provide custom metrics by passing in a custom training_operator_cls when
        creating the Estimator.
        """
        return self.estimator.train(data=data, epochs=epochs, batch_size=batch_size,
                                    profile=profile, reduce_results=reduce_results, info=info)

    def predict(self, data, **kwargs):
        pass

    def evaluate(self, data, batch_size=32, num_steps=None, profile=False, info=None):
        """
        Evaluates a PyTorch model given validation data.
        Note that only accuracy for classification with zero-based label is supported by
        default. You can override validate_batch in TrainingOperator for other metrics.

        Calls `TrainingOperator.validate()` on N parallel workers simultaneously
        under the hood.

        :param data: An instance of SparkXShards or a function that takes config as
        argument and returns a PyTorch DataLoader for validation.
        :param batch_size: The number of samples per batch for each worker. Default is 32.
        The total batch size would be batch_size * workers_per_node * num_nodes.
        If your validation data is a function, you can set batch_size to be config["batch_size"]
        for the PyTorch DataLoader.
        :param num_steps: The number of batches to compute the validation results on. This
        corresponds to the number of times `TrainingOperator.validate_batch` is called.
        :param profile: Boolean. Whether to return time stats for the evaluation procedure.
        Default is False.
        :param info: An optional dictionary that can be passed to the TrainingOperator
        for validate.

        :return: A dictionary of metrics for the given data, including validation accuracy and loss.
        You can also provide custom metrics by passing in a custom training_operator_cls when
        creating the Estimator.
        """
        return self.estimator.validate(data=data, batch_size=batch_size, num_steps=num_steps,
                                       profile=profile, info=info)

    def get_model(self):
        """Returns the learned model(s)."""
        return self.estimator.get_model()

    def save(self, checkpoint):
        """Saves the Estimator state to the provided checkpoint path.

        :param checkpoint: (str) Path to target checkpoint file.
        """
        return self.estimator.save(checkpoint=checkpoint)

    def load(self, checkpoint):
        """Loads the Estimator and all workers from the provided checkpoint.

        :param checkpoint: (str) Path to target checkpoint file.
        """
        return self.estimator.load(checkpoint=checkpoint)

    def shutdown(self, force=False):
        """Shuts down workers and releases resources."""
        return self.estimator.shutdown(force=force)
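
A minimal usage sketch for this older wrapper. Per its fit() docstring, the DataLoader creator here takes only the config dict and reads the batch size from config["batch_size"]; the model and optimizer creators are the illustrative ones sketched after Code Example #1, and the dataset is random data for illustration only.

import torch
from torch.utils.data import DataLoader, TensorDataset

def train_loader_creator(config):
    # Reads the batch size from config, as described in the fit() docstring above.
    x = torch.randn(1024, 10)
    y = torch.randn(1024, 1)
    return DataLoader(TensorDataset(x, y),
                      batch_size=config.get("batch_size", 32),
                      shuffle=True)

wrapper = PyTorchRayEstimatorWrapper(
    model_creator=model_creator,          # illustrative creators from the first sketch
    optimizer_creator=optimizer_creator,
    loss_creator=torch.nn.MSELoss)

train_stats = wrapper.fit(data=train_loader_creator, epochs=2, batch_size=32)
val_stats = wrapper.evaluate(data=train_loader_creator, batch_size=32)
wrapper.shutdown()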