Example #1
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 config=None,
                 batch_size=16):
        """Initializes the runner.

        Args:
            model_creator (dict -> torch.nn.Module): see pytorch_trainer.py.
            data_creator (dict -> Dataset, Dataset): see pytorch_trainer.py.
            optimizer_creator (torch.nn.Module, dict -> loss, optimizer):
                see pytorch_trainer.py.
            config (dict): see pytorch_trainer.py.
            batch_size (int): see pytorch_trainer.py.
        """

        self.model_creator = model_creator
        self.data_creator = data_creator
        self.optimizer_creator = optimizer_creator
        self.config = {} if config is None else config
        self.batch_size = batch_size
        self.verbose = True

        self.epoch = 0
        self._timers = {
            k: utils.TimerStat(window_size=1)
            for k in [
                "setup_proc", "setup_model", "get_state", "set_state",
                "validation", "training"
            ]
        }
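For reference, a minimal sketch of creator functions that satisfy the signatures documented above (the toy model, dataset, and hyperparameter names are illustrative assumptions, not part of the original code):

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset


def model_creator(config):
    # dict -> torch.nn.Module
    return nn.Linear(config.get("in_features", 1), 1)


def data_creator(config):
    # dict -> (Dataset, Dataset): training and validation sets
    x = torch.randn(config.get("data_size", 256), 1)
    y = 2 * x + 0.1 * torch.randn_like(x)
    dataset = TensorDataset(x, y)
    return dataset, dataset


def optimizer_creator(model, config):
    # (torch.nn.Module, dict) -> (loss, optimizer)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))
    return criterion, optimizer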
Example #2
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 train_function=None,
                 validation_function=None,
                 config=None,
                 dataloader_config=None,
                 batch_size=16):
        """Initializes the runner.

        Args:
            model_creator (dict -> torch.nn.Module): see pytorch_trainer.py.
            data_creator (int, dict -> Dataset, Dataset): see
                pytorch_trainer.py.
            optimizer_creator (torch.nn.Module, dict -> loss, optimizer):
                see pytorch_trainer.py.
            loss_creator (dict -> loss | Loss class): see pytorch_trainer.py.
            train_function: see pytorch_trainer.py.
            validation_function: see pytorch_trainer.py.
            config (dict): see pytorch_trainer.py.
            dataloader_config (dict): see pytorch_trainer.py.
            batch_size (int): see pytorch_trainer.py.
        """
        self.model_creator = model_creator
        self.data_creator = data_creator
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.config = {} if config is None else config
        self.dataloader_config = {
            "num_workers": 2,
            "pin_memory": True
        } if dataloader_config is None else dataloader_config
        self.train_function = train_function or pytorch_utils.train
        self.validation_function = (validation_function
                                    or pytorch_utils.validate)
        self.batch_size = batch_size
        self.verbose = True

        self.epoch = 0
        self._timers = {
            k: utils.TimerStat(window_size=1)
            for k in [
                "setup_proc", "setup_model", "get_state", "set_state",
                "validation", "training"
            ]
        }

        self.models = None
        self.optimizers = None
        self.criterion = None
        self.train_loader = None
        self.validation_loader = None
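The default dataloader_config above sets num_workers and pin_memory. Assuming these keys are forwarded as keyword arguments to torch.utils.data.DataLoader (an assumption based on the key names), a custom override might look like this sketch:

# Hypothetical override; keys are assumed to be passed through to
# torch.utils.data.DataLoader(..., **dataloader_config).
dataloader_config = {
    "num_workers": 4,     # parallel data-loading worker processes
    "pin_memory": True,   # page-locked host memory for faster GPU copies
    "drop_last": True,    # drop the final, smaller batch
}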
Example #3
def train(train_iterator, model, criterion, optimizer):
    """Runs 1 training epoch"""
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()

    timers = {k: utils.TimerStat() for k in ["d2h", "fwd", "grad", "apply"]}

    # switch to train mode
    model.train()

    end = time.time()

    for i, (features, target) in enumerate(train_iterator):
        # measure data loading time
        data_time.update(time.time() - end)

        # Move data to the GPU with non-blocking host-to-device copies
        with timers["d2h"]:
            if torch.cuda.is_available():
                features = features.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)

        # compute output
        with timers["fwd"]:
            output = model(features)
            loss = criterion(output, target)

            # record the loss
            losses.update(loss.item(), features.size(0))

        with timers["grad"]:
            # compute gradients in a backward pass
            optimizer.zero_grad()
            loss.backward()

        with timers["apply"]:
            # Call step of optimizer to update model params
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    stats = {
        "batch_time": batch_time.avg,
        "batch_processed": losses.count,
        "train_loss": losses.avg,
        "data_time": data_time.avg,
    }
    stats.update({k: t.mean for k, t in timers.items()})
    return stats
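A minimal sketch of calling this train() function directly, assuming it is in scope or importable from the module shown above; the toy model, data, and optimizer are illustrative only:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Inputs matching train(train_iterator, model, criterion, optimizer).
model = nn.Linear(1, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

x = torch.randn(64, 1)
y = 3 * x + 0.1 * torch.randn_like(x)
train_loader = DataLoader(TensorDataset(x, y), batch_size=16, shuffle=True)

# `train` refers to the function defined above. It returns a dict of stats
# such as "train_loss" and "batch_time".
stats = train(train_loader, model, criterion, optimizer)
print(stats["train_loss"], stats["batch_time"])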
Example #4
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 scheduler_creator=None,
                 train_function=None,
                 validation_function=None,
                 config=None,
                 dataloader_config=None,
                 batch_size=16,
                 use_fp16=False,
                 apex_args=None,
                 scheduler_step_freq="batch"):
        self.model_creator = model_creator
        self.data_creator = data_creator
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.scheduler_creator = scheduler_creator
        self.config = {} if config is None else config
        self.dataloader_config = {
            "num_workers": 2
        } if dataloader_config is None else dataloader_config
        self.train_function = train_function or pytorch_utils.train
        self.validation_function = (validation_function
                                    or pytorch_utils.validate)
        self.batch_size = batch_size
        self.verbose = True

        self.epoch = 0
        self._timers = {
            k: utils.TimerStat(window_size=1)
            for k in [
                "setup_proc", "setup_model", "get_state", "set_state",
                "validation", "training"
            ]
        }
        self.models = None
        self.optimizers = None
        self.criterion = None
        self.schedulers = None
        self.train_loader = None
        self.validation_loader = None
        self.use_fp16 = use_fp16
        self.apex_args = apex_args or {}
        if use_fp16 and not amp:
            raise ImportError(
                "Please install apex from "
                "https://www.github.com/nvidia/apex to use fp16 training.")
        self.scheduler_step_freq = scheduler_step_freq
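A sketch of plausible apex_args, assuming (as the ImportError message suggests) that they are forwarded to apex.amp.initialize(); the keys shown are standard amp options, not values taken from this code:

# Hypothetical mixed-precision settings; assumed to be passed as
# apex.amp.initialize(model, optimizer, **apex_args).
apex_args = {
    "opt_level": "O1",        # mixed-precision optimization level
    "loss_scale": "dynamic",  # dynamic loss scaling
}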
Example #5
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 train_function=None,
                 validation_function=None,
                 initialization_hook=None,
                 config=None,
                 dataloader_config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto"):
        # TODO: add support for mixed precision
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.data_creator = data_creator
        self.train_function = train_function
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.validation_function = validation_function
        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        self.dataloader_config = dataloader_config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))
        self.backend = backend
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.max_replicas = num_replicas
        self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
        self._num_failures = 0
        self._last_resize = float("-inf")
        self._start_workers(self.max_replicas)
Example #6
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 scheduler_creator=None,
                 train_function=None,
                 validation_function=None,
                 initialization_hook=None,
                 config=None,
                 dataloader_config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto",
                 use_fp16=False,
                 apex_args=None,
                 scheduler_step_freq="batch"):
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.data_creator = data_creator
        self.train_function = train_function
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.scheduler_creator = scheduler_creator
        self.validation_function = validation_function
        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        self.dataloader_config = dataloader_config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))
        self.backend = backend
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.max_replicas = num_replicas

        self.use_fp16 = use_fp16

        if apex_args and not isinstance(apex_args, dict):
            raise ValueError("apex_args needs to be a dict object.")

        self.apex_args = apex_args
        self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
        self._num_failures = 0
        self._last_resize = float("-inf")

        if scheduler_step_freq and (
                scheduler_step_freq not in pytorch_utils.VALID_SCHEDULER_STEP):
            raise ValueError(
                "Scheduler step freq must be in {}. Got {}".format(
                    pytorch_utils.VALID_SCHEDULER_STEP, scheduler_step_freq))

        self.scheduler_step_freq = scheduler_step_freq

        self._start_workers(self.max_replicas)
Example #7
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 train_function=None,
                 validation_function=None,
                 initialization_hook=None,
                 config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto"):
        """Sets up the PyTorch trainer.

        Args:
            model_creator (dict -> torch.nn.Module): creates the model
                using the config.
            data_creator (int, dict -> DataLoader, DataLoader): Function that
                takes in (batch_size, config) and returns two Torch DataLoader
                objects.
            optimizer_creator (torch.nn.Module, dict -> optimizer):
                creates the optimizer using the model and the config.
            loss_creator (dict -> loss): Creates the loss function/criterion
                using the config.
            train_function: Trains a model for an epoch. This takes in (
                model, train_dataloader, criterion, optimizer, config), and
                returns a dict of training stats.
            validation_function: Runs validation. This takes in (
                model, val_dataloader, criterion, config) and returns a dict of
                validation stats.
            config (dict): configuration passed to "model_creator",
                "data_creator", "optimizer_creator", and "loss_creator".
            num_replicas (int): the number of workers used in distributed
                training.
            use_gpu (bool): Sets resource allocation for workers to 1 GPU
                if true.
            batch_size (int): batch size for an update.
            backend (string): backend used by distributed PyTorch.
        """
        # TODO: add support for mixed precision
        # TODO: add support for callbacks
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.train_function = train_function
        self.validation_function = validation_function
        self.config = {} if config is None else config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))

        if num_replicas == 1:
            # Generate actor class
            Runner = ray.remote(
                num_cpus=1, num_gpus=int(use_gpu))(PyTorchRunner)
            # Start workers
            self.workers = [
                Runner.remote(
                    model_creator,
                    data_creator,
                    optimizer_creator,
                    loss_creator,
                    train_function=train_function,
                    validation_function=validation_function,
                    config=self.config,
                    batch_size=batch_size)
            ]
            if initialization_hook:
                self.apply_all_workers(initialization_hook)
            # Get setup tasks in order to throw errors on failure
            ray.get(self.workers[0].setup.remote())
        else:
            # Generate actor class
            Runner = ray.remote(
                num_cpus=1, num_gpus=int(use_gpu))(DistributedPyTorchRunner)
            # Compute batch size per replica
            batch_size_per_replica = batch_size // num_replicas
            if batch_size % num_replicas > 0:
                new_batch_size = batch_size_per_replica * num_replicas
                logger.warning(
                    ("Changing batch size from {old_batch_size} to "
                     "{new_batch_size} to evenly distribute batches across "
                     "{num_replicas} replicas.").format(
                         old_batch_size=batch_size,
                         new_batch_size=new_batch_size,
                         num_replicas=num_replicas))
            # Start workers
            self.workers = [
                Runner.remote(
                    model_creator,
                    data_creator,
                    optimizer_creator,
                    loss_creator,
                    backend=backend,
                    train_function=train_function,
                    validation_function=validation_function,
                    config=self.config,
                    batch_size=batch_size_per_replica)
                for i in range(num_replicas)
            ]
            if initialization_hook:
                self.apply_all_workers(initialization_hook)

            # Compute URL for initializing distributed PyTorch
            ip = ray.get(self.workers[0].get_node_ip.remote())
            port = ray.get(self.workers[0].find_free_port.remote())
            address = "tcp://{ip}:{port}".format(ip=ip, port=port)
            # Get setup tasks in order to throw errors on failure
            ray.get([
                worker.setup.remote(address, i, len(self.workers))
                for i, worker in enumerate(self.workers)
            ])
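To make the batch-size adjustment above concrete, here is the arithmetic for a batch size that does not divide evenly across replicas:

# Worked example of the per-replica batch size computation above.
batch_size = 16
num_replicas = 3

batch_size_per_replica = batch_size // num_replicas     # 16 // 3 == 5
new_batch_size = batch_size_per_replica * num_replicas  # 5 * 3 == 15
# The constructor logs a warning that the effective batch size changes
# from 16 to 15 so that batches distribute evenly across the 3 replicas.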
Example #8
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator=pytorch_utils.sgd_mse_optimizer,
                 config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto"):
        """Sets up the PyTorch trainer.

        Args:
            model_creator (dict -> torch.nn.Module): creates the model
                using the config.
            data_creator (dict -> Dataset, Dataset): creates the training
                and validation data sets using the config.
            optimizer_creator (torch.nn.Module, dict -> loss, optimizer):
                creates the loss and optimizer using the model and the config.
            config (dict): configuration passed to 'model_creator',
                'data_creator', and 'optimizer_creator'.
            num_replicas (int): the number of workers used in distributed
                training.
            use_gpu (bool): Sets resource allocation for workers to 1 GPU
                if true.
            batch_size (int): batch size for an update.
            backend (string): backend used by distributed PyTorch.
        """
        # TODO: add support for mixed precision
        # TODO: add support for callbacks
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.config = {} if config is None else config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))

        if num_replicas == 1:
            # Generate actor class
            Runner = ray.remote(num_cpus=1,
                                num_gpus=int(use_gpu))(PyTorchRunner)
            # Start workers
            self.workers = [
                Runner.remote(model_creator, data_creator, optimizer_creator,
                              self.config, batch_size)
            ]
            # Get setup tasks in order to throw errors on failure
            ray.get(self.workers[0].setup.remote())
        else:
            # Generate actor class
            Runner = ray.remote(
                num_cpus=1, num_gpus=int(use_gpu))(DistributedPyTorchRunner)
            # Compute batch size per replica
            batch_size_per_replica = batch_size // num_replicas
            if batch_size % num_replicas > 0:
                new_batch_size = batch_size_per_replica * num_replicas
                logger.warning(
                    ("Changing batch size from {old_batch_size} to "
                     "{new_batch_size} to evenly distribute batches across "
                     "{num_replicas} replicas.").format(
                         old_batch_size=batch_size,
                         new_batch_size=new_batch_size,
                         num_replicas=num_replicas))
            # Start workers
            self.workers = [
                Runner.remote(model_creator, data_creator, optimizer_creator,
                              self.config, batch_size_per_replica, backend)
                for i in range(num_replicas)
            ]
            # Compute URL for initializing distributed PyTorch
            ip = ray.get(self.workers[0].get_node_ip.remote())
            port = ray.get(self.workers[0].find_free_port.remote())
            address = "tcp://{ip}:{port}".format(ip=ip, port=port)
            # Get setup tasks in order to throw errors on failure
            ray.get([
                worker.setup.remote(address, i, len(self.workers))
                for i, worker in enumerate(self.workers)
            ])
Example #9
    def __init__(self,
                 model_creator,
                 data_creator,
                 optimizer_creator,
                 loss_creator,
                 train_function=None,
                 validation_function=None,
                 initialization_hook=None,
                 config=None,
                 num_replicas=1,
                 use_gpu=False,
                 batch_size=16,
                 backend="auto"):
        """Sets up the PyTorch trainer.

        Args:
            model_creator (dict -> torch.nn.Module): creates the model
                using the config.
            data_creator (int, dict -> DataLoader, DataLoader): Function that
                takes in (batch_size, config) and returns two Torch DataLoader
                objects.
            optimizer_creator (torch.nn.Module, dict -> optimizer):
                creates the optimizer using the model and the config.
            loss_creator (dict -> loss): Creates the loss function/criterion
                using the config.
            train_function: Trains a model for an epoch. This takes in (
                model, train_dataloader, criterion, optimizer, config), and
                returns a dict of training stats.
            validation_function: Runs validation. This takes in (
                model, val_dataloader, criterion, config) and returns a dict of
                validation stats.
            config (dict): configuration passed to "model_creator",
                "data_creator", "optimizer_creator", and "loss_creator".
            num_replicas (int): the number of workers used in distributed
                training.
            use_gpu (bool): Sets resource allocation for workers to 1 GPU
                if true.
            batch_size (int): batch size for an update.
            backend (string): backend used by distributed PyTorch.
        """
        # TODO: add support for mixed precision
        # TODO: add support for callbacks
        if num_replicas > 1 and not dist.is_available():
            raise ValueError(
                ("Distributed PyTorch is not supported on macOS. "
                 "To run without distributed PyTorch, set 'num_replicas=1'. "
                 "For more information, see "
                 "https://github.com/pytorch/examples/issues/467."))

        self.model_creator = model_creator
        self.data_creator = data_creator
        self.train_function = train_function
        self.optimizer_creator = optimizer_creator
        self.loss_creator = loss_creator
        self.validation_function = validation_function
        self.initialization_hook = initialization_hook
        self.config = {} if config is None else config
        self.optimizer_timer = utils.TimerStat(window_size=1)

        if backend == "auto":
            backend = "nccl" if use_gpu else "gloo"

        logger.info("Using {} as backend.".format(backend))
        self.backend = backend
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.max_replicas = num_replicas
        self.temp_dir = tempfile.mkdtemp(prefix="raysgd")
        self._num_failures = 0
        self._last_resize = float("-inf")
        self._start_workers(self.max_replicas)
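Tying the pieces together, a hedged end-to-end sketch of constructing a trainer with creators that match the docstring in this example. The class name PyTorchTrainer and its import path are assumptions and differ between Ray versions; the toy model and data are illustrative only:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# NOTE: the import path below is an assumption and varies across Ray
# versions (e.g. ray.experimental.sgd.pytorch vs. ray.util.sgd.pytorch).
from ray.util.sgd.pytorch import PyTorchTrainer


def model_creator(config):
    return nn.Linear(1, 1)


def data_creator(batch_size, config):
    # (int, dict) -> (DataLoader, DataLoader), per the docstring above.
    x = torch.randn(256, 1)
    y = 2 * x
    loader = DataLoader(TensorDataset(x, y), batch_size=batch_size)
    return loader, loader


def optimizer_creator(model, config):
    # (torch.nn.Module, dict) -> optimizer only; the criterion comes from
    # loss_creator in this version of the API.
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 0.01))


def loss_creator(config):
    return nn.MSELoss()


trainer = PyTorchTrainer(
    model_creator,
    data_creator,
    optimizer_creator,
    loss_creator,
    num_replicas=2,
    batch_size=32)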