Example No. 1
    def backward(ctx, grad_output):
        # mini batch mean & var are calculated by forward path.
        # mu = 1./N*np.sum(h, axis = 0)
        # var = 1./N*np.sum((h-mu)**2, axis = 0)
        last_input, mean, var = ctx.saved_tensors

        eps = ctx.eps
        grad_input = None
        num_features = mean.size()[0]

        # calculate grad_input
        if ctx.needs_input_grad[0]:
            # dh = gamma * (var + eps)**(-1. / 2.) * (dy - np.mean(dy, axis=0)
            #     - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0))
            mean_dy = grad_output.mean(0)
            mean_dy_xmu = (
                (grad_output * (last_input - mean)).view(-1, num_features).mean(0)
            )
            # If running on a distributed setting, perform mean reduction of tensors over
            # all processes.
            mean_dy = all_reduce_mean(mean_dy)
            mean_dy_xmu = all_reduce_mean(mean_dy_xmu)

            grad_input = (
                grad_output - mean_dy - (last_input - mean) / (var + eps) * mean_dy_xmu
            ) / torch.sqrt(var + eps)

        return grad_input, None
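Every example in this listing goes through an all_reduce_mean helper to average tensors across processes. A minimal sketch of such a helper built on torch.distributed (the helper in the original code base may differ, for instance by routing through a custom communication layer):

import torch
import torch.distributed as dist


def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """Average a tensor over all processes in the default process group.

    Falls back to a no-op for single-process (non-distributed) runs.
    """
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        tensor = tensor / dist.get_world_size()
    return tensor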
Example No. 2
    def synchronize_losses(self):
        """Average the losses across the different replicas"""

        # Average losses across nodes
        losses_tensor = torch.tensor(self.losses)
        synchronized_losses_tensor = all_reduce_mean(losses_tensor)
        self.losses = synchronized_losses_tensor.tolist()
Example No. 3
    def eval_step(self, use_gpu):
        self.last_batch = None

        # Process next sample
        sample = next(self.get_data_iterator())

        assert isinstance(
            sample, dict) and "input" in sample and "target" in sample, (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        target = sample["target"]
        if use_gpu:
            for key, value in sample.items():
                sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

        with torch.no_grad():
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()
            loss = all_reduce_mean(loss)

            self.losses.append(loss.data.cpu().item() * target.size(0))

            self.update_meters(output, sample)

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(loss=loss,
                                        output=output,
                                        target=target,
                                        sample=sample)
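The eval and train steps copy the sample to the GPU with recursive_copy_to_gpu. A minimal sketch of such a helper, assuming samples are nested dicts, lists and tuples of tensors (the original helper may handle more container types):

import torch


def recursive_copy_to_gpu(value, non_blocking=True):
    """Recursively move tensors inside nested dicts/lists/tuples to the GPU.

    Non-tensor leaves are returned unchanged.
    """
    if isinstance(value, torch.Tensor):
        return value.cuda(non_blocking=non_blocking)
    if isinstance(value, list):
        return [recursive_copy_to_gpu(v, non_blocking=non_blocking) for v in value]
    if isinstance(value, tuple):
        return tuple(recursive_copy_to_gpu(v, non_blocking=non_blocking) for v in value)
    if isinstance(value, dict):
        return {
            k: recursive_copy_to_gpu(v, non_blocking=non_blocking)
            for k, v in value.items()
        }
    return value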
Example No. 4
    def train_step(self, use_gpu):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
        """

        self.last_batch = None

        # Process next sample
        sample = next(self.get_data_iterator())

        assert isinstance(
            sample, dict) and "input" in sample and "target" in sample, (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        target = sample["target"]
        if use_gpu:
            for key, value in sample.items():
                sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

        with torch.enable_grad():
            # Forward pass
            output = self.model(sample["input"])

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()
            loss = all_reduce_mean(loss)

            self.losses.append(loss.data.cpu().item() * target.size(0))

            self.update_meters(output, sample)

        # Run backwards pass / update optimizer
        if self.amp_args is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(local_loss,
                                     self.optimizer.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_loss)

        self.check_inf_nan(loss)

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

        self.num_updates += self.get_global_batchsize()

        # Move some data to the task so hooks get a chance to access it
        self.last_batch = LastBatchInfo(loss=loss,
                                        output=output,
                                        target=target,
                                        sample=sample)
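The loss recorded by train_step above is the batch-mean loss multiplied by the local batch size (target.size(0)). An epoch-level average can then be recovered as a sample-weighted mean. A minimal sketch, assuming the per-step batch sizes were tracked alongside self.losses (hypothetical names, not part of the original class):

def epoch_average_loss(losses, batch_sizes):
    """Sample-weighted average of per-step losses accumulated as
    loss.item() * batch_size."""
    return sum(losses) / sum(batch_sizes)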
Example No. 5
    def forward(ctx, input, eps):
        with torch.no_grad():
            local_mean = torch.mean(input, 0)
            local_sqr_mean = torch.pow(input, 2).mean(0)

            # If running on a distributed setting, perform mean reduction of tensors over
            # all processes.
            mean = all_reduce_mean(local_mean)
            sqr_mean = all_reduce_mean(local_sqr_mean)

            # var(x) = E (( x - mean_x ) ** 2)
            #        = 1 / N * sum ( x - mean_x ) ** 2
            #        = 1 / N * sum (x**2) - mean_x**2
            var = sqr_mean - mean.pow(2)

        ctx.save_for_backward(input, mean, var)
        ctx.eps = eps

        return (input - mean) / torch.sqrt(var + eps)
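The forward pass relies on the identity Var(x) = E[x**2] - (E[x])**2, so each process only needs to contribute two cheap reductions: the mean and the mean of squares. The identity is easy to verify numerically (a self-contained sketch):

import torch

x = torch.randn(1024, 8)
mean = x.mean(0)
var_definition = ((x - mean) ** 2).mean(0)       # Var(x) = E[(x - E[x])**2]
var_identity = (x ** 2).mean(0) - mean.pow(2)    # Var(x) = E[x**2] - (E[x])**2
assert torch.allclose(var_definition, var_identity, atol=1e-5)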
Example No. 6
    def sync_memory(self):
        """
        Sync memory across all processes before the first forward pass. Only needed
        in the distributed case.
        After the first forward pass, the update_memory function in NCEAverage
        does a gather over all embeddings, so memory stays in sync. Doing a gather
        over embeddings is O(batch size). Syncing memory is O(num items in memory).
        Generally, batch size << num items in memory, so we prefer doing the syncs
        in update_memory.
        """
        self.nce_average.memory = all_reduce_mean(self.nce_average.memory)
        logging.info(f"Rank: {get_rank()}: Memory synced")
        # Set to True once we are done; the forward pass in nce_average will sync after.
        self.init_sync_memory = True
Example No. 7
    def forward(self, embedding: torch.Tensor) -> torch.Tensor:
        """
        Calculate the loss. Operates on embeddings tensor.

        Args:
            embedding (torch.Tensor):   NxEMBEDDING_DIM
                                        Must contain the concatenated embeddings
                                        of the two image copies:
                                        [emb_img1_0, emb_img2_0, ....., emb_img1_1, emb_img2_1,...]
        """
        assert embedding.ndim == 2 and embedding.shape[1] == int(
            self.embedding_dim
        ), f"Incorrect embedding shape: {embedding.shape} but expected Nx{self.embedding_dim}"

        batch_size = embedding.shape[0]
        assert (
            batch_size % self.num_copies == 0
        ), f"Batch size {batch_size} should be divisible by num_copies ({self.num_copies})."

        # normalize embeddings along the batch dimension
        embedding_normed = SyncNormalizeFunction.apply(embedding, self.eps)

        # split embedding between copies
        embedding_normed_a, embedding_normed_b = torch.split(
            embedding_normed,
            split_size_or_sections=batch_size // self.num_copies,
            dim=0,
        )

        # cross-correlation matrix
        correlation_matrix = torch.mm(
            embedding_normed_a.T,
            embedding_normed_b) / (batch_size / self.num_copies)

        # Reduce cross-correlation matrices from all processes
        correlation_matrix = all_reduce_mean(correlation_matrix)

        # loss
        on_diag = (torch.diagonal(correlation_matrix).add(-1).pow(2).sum().mul(
            self.scale_loss))
        off_diag = (self._off_diagonal(correlation_matrix).pow(2).sum().mul(
            self.scale_loss))
        loss = on_diag + self.lambda_ * off_diag

        return loss
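The loss above also depends on an _off_diagonal helper that extracts the off-diagonal entries of the square cross-correlation matrix. A common way to implement it (a sketch; the method in the original class may differ):

import torch


def _off_diagonal(matrix: torch.Tensor) -> torch.Tensor:
    """Return a flattened view of all off-diagonal elements of a square matrix."""
    n, m = matrix.shape
    assert n == m, "expected a square matrix"
    # Dropping the last element and reshaping to (n-1, n+1) puts every diagonal
    # entry in the first column; slicing it off leaves only off-diagonal entries.
    return matrix.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()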
Example No. 8
    def eval_step(self, use_gpu, local_variables=None):
        if local_variables is None:
            local_variables = {}

        # Process next sample
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]), (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        with torch.no_grad():
            local_variables["output"] = self.model(
                local_variables["sample"]["input"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_forward.name)

            local_variables["local_loss"] = self.compute_loss(
                local_variables["output"], local_variables["sample"])

            local_variables["loss"] = local_variables["local_loss"].detach(
            ).clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            self.update_meters(local_variables["output"],
                               local_variables["sample"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_loss_and_meter.name)
Example No. 9
def standard_train_step(task):
    """
    Single training iteration loop of the model.

    Performs: data read, forward, loss computation, backward, optimizer step, parameter updates.

    Various intermediate steps are also performed:
    - logging the training loss, training eta, LR, etc to loggers
    - logging to tensorboard,
    - performing any self-supervised method specific operations (like in MoCo approach, the
    momentum encoder is updated), computing the scores in swav
    - checkpointing model if user wants to checkpoint in the middle
    of an epoch
    """
    assert isinstance(task, ClassyTask), "task is not instance of ClassyTask"

    # reset the last batch info at every step
    task.last_batch = LastBatchInfo()

    # We'll time train_step and some of its sections, and accumulate the values
    # into perf_stats if it is defined on the task:
    perf_stats = task.perf_stats
    timer_train_step = PerfTimer("train_step_total", perf_stats)
    timer_train_step.start()

    # Process next sample
    with PerfTimer("read_sample", perf_stats):
        sample = next(task.data_iterator)

    sample = construct_sample_for_model(sample, task)

    # Only need gradients during training
    grad_context = torch.enable_grad() if task.train else torch.no_grad()
    ddp_context = (
        task.model.no_sync()
        if task.enable_manual_gradient_reduction
        else contextlib.suppress()
    )
    torch_amp_context = (
        torch.cuda.amp.autocast()
        if task.amp_type == AmpType.PYTORCH
        else contextlib.suppress()
    )

    with grad_context, ddp_context, torch_amp_context:
        # Forward pass of the model
        with PerfTimer("forward", perf_stats):
            if task.enable_manual_gradient_reduction:
                # Manually sync params and buffers for DDP.
                manual_sync_params(task.model)
            model_output = task.model(sample["input"])

        # If the model outputs only one tensor, we take it out of the list.
        if len(model_output) == 1:
            model_output = model_output[0]

        task.last_batch.sample = sample
        task.last_batch.model_output = model_output
        target = sample["target"]

        # Run hooks on forward pass
        task.run_hooks(SSLClassyHookFunctions.on_forward.name)

        # Compute loss
        with PerfTimer("loss_compute", perf_stats):
            local_loss = task.loss(model_output, target)

        # Reduce the loss value across all nodes and gpus.
        with PerfTimer("loss_all_reduce", perf_stats):
            loss = local_loss.detach().clone()
            task.last_batch.loss = all_reduce_mean(loss)

        task.losses.append(task.last_batch.loss.data.cpu().item() * target.size(0))

        # Update meters
        if len(task.meters) > 0 and (
            (task.train and task.config["METERS"]["enable_training_meter"])
            or (not task.train)
        ):
            with PerfTimer("meters_update", perf_stats):
                if isinstance(model_output, list):
                    model_output_cpu = [x.cpu() for x in model_output]
                else:
                    model_output_cpu = model_output.cpu()

                for meter in task.meters:
                    meter.update(model_output_cpu, target.detach().cpu())

        task.last_batch.model_output = model_output
        task.last_batch.target = target

        # Update the iteration number, check loss is not NaN and measure batch time
        # now if it's a test phase since test phase doesn't have update step.
        task.run_hooks(SSLClassyHookFunctions.on_loss_and_meter.name)

    # Run backward now and update the optimizer
    if task.train:
        with PerfTimer("backward", perf_stats):

            task.optimizer.zero_grad()
            if task.amp_type == AmpType.APEX:
                with apex.amp.scale_loss(
                    local_loss, task.optimizer.optimizer
                ) as scaled_loss:
                    scaled_loss.backward()
                    if task.enable_manual_gradient_reduction:
                        manual_gradient_all_reduce(task.model)

            elif task.amp_type == AmpType.PYTORCH:
                task.amp_grad_scaler.scale(local_loss).backward()
                if task.enable_manual_gradient_reduction:
                    manual_gradient_all_reduce(task.model)
            else:
                local_loss.backward()
                if task.enable_manual_gradient_reduction:
                    manual_gradient_all_reduce(task.model)

        task.run_hooks(SSLClassyHookFunctions.on_backward.name)

        # Stepping the optimizer also updates learning rate, momentum etc
        # according to the schedulers (if any).
        with PerfTimer("optimizer_step", perf_stats):
            assert task.where < 1.0, (
                "Optimizer being called with where=1.0. This should not happen, "
                "as where=1.0 means training is already finished. Please debug your "
                "training setup. A common cause is checkpointing the model at every "
                "iteration without using the stateful data sampler, or an issue in "
                "properly resuming the data sampler."
            )
            if task.amp_type == AmpType.PYTORCH:
                task.amp_grad_scaler.step(task.optimizer, where=task.where)
                task.amp_grad_scaler.update()
            else:
                task.optimizer.step(where=task.where)
        task.run_hooks(SSLClassyHookFunctions.on_update.name)
        task.num_updates += task.get_global_batchsize()

    timer_train_step.stop()
    timer_train_step.record()

    return task
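When enable_manual_gradient_reduction is set, DDP's automatic bucketed all-reduce is skipped via model.no_sync() and gradients are reduced explicitly after backward with manual_gradient_all_reduce. A minimal sketch of what such a reduction could look like (the real helper likely flattens or buckets tensors for efficiency):

import torch.distributed as dist


def manual_gradient_all_reduce(model):
    """Average every parameter gradient across all processes."""
    world_size = dist.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
            param.grad.div_(world_size)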
Example No. 10
    def train_step(self, use_gpu, local_variables=None):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
            local_variables: Dict containing intermediate values
                in train_step for access by hooks
        """
        from classy_vision.hooks import ClassyHookFunctions

        if local_variables is None:
            local_variables = {}

        # Process next sample
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]), (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        # Only need gradients during training
        context = torch.enable_grad() if self.train else torch.no_grad()
        with context:
            # Forward pass
            local_variables["output"] = self.model(
                local_variables["sample"]["input"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_forward.name)

            local_variables["local_loss"] = self.compute_loss(
                local_variables["output"], local_variables["sample"])

            # NOTE: This performs an all_reduce_mean() on the losses across the
            # replicas.  The reduce should ideally be weighted by the length of
            # the targets on each replica. This will only be an issue when
            # there are dummy samples present (once an epoch) and will only
            # impact the loss reporting (slightly).
            local_variables["loss"] = local_variables["local_loss"].detach(
            ).clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            self.update_meters(local_variables["output"],
                               local_variables["sample"])

            # After both loss and meters are updated, we run hooks. Among hooks,
            # `LossLrMeterLoggingHook` will log both loss and meter status
            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_loss_and_meter.name)

        num_samples_in_step = self.get_global_batchsize()
        self.num_samples_this_phase += num_samples_in_step

        # For training phases, run backwards pass / update optimizer
        if self.train:
            if self.amp_opt_level is not None:
                self.optimizer.zero_grad()
                with apex.amp.scale_loss(
                        local_variables["local_loss"],
                        self.optimizer.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                self.optimizer.backward(local_variables["local_loss"])

            self.optimizer.update_schedule_on_step(self.where)
            self.optimizer.step()

            self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

            self.num_updates += num_samples_in_step
Example No. 11
    def train_step(self, use_gpu, local_variables=None):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
            local_variables: Dict containing intermediate values
                in train_step for access by hooks
        """
        from classy_vision.hooks import ClassyHookFunctions

        if local_variables is None:
            local_variables = {}

        # We'll time train_step and some of its sections, and accumulate the values
        # into perf_stats if it is defined in local_variables:
        perf_stats = local_variables.get("perf_stats", None)
        timer_train_step = PerfTimer("train_step_total", perf_stats)
        timer_train_step.start()

        # Process next sample
        with PerfTimer("read_sample", perf_stats):
            sample = next(self.get_data_iterator())
            local_variables["sample"] = sample

            assert (
                isinstance(local_variables["sample"], dict)
                and "input" in local_variables["sample"]
                and "target" in local_variables["sample"]
            ), (f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        self.run_hooks(local_variables, ClassyHookFunctions.on_sample.name)

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        # Only need gradients during training
        context = torch.enable_grad() if self.train else torch.no_grad()
        with context:
            # Forward pass
            with PerfTimer("forward", perf_stats):
                local_variables["output"] = self.model(
                    local_variables["sample"]["input"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_forward.name)

            model_output = local_variables["output"]
            target = local_variables["sample"]["target"]
            local_variables["local_loss"] = self.loss(model_output, target)

            # NOTE: This performs an all_reduce_mean() on the losses across the
            # replicas.  The reduce should ideally be weighted by the length of
            # the targets on each replica. This will only be an issue when
            # there are dummy samples present (once an epoch) and will only
            # impact the loss reporting (slightly).
            with PerfTimer("loss_allreduce", perf_stats):
                local_variables["loss"] = local_variables["local_loss"].detach(
                ).clone()
                local_variables["loss"] = all_reduce_mean(
                    local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            model_output_cpu = model_output.cpu() if use_gpu else model_output

            # Update meters
            with PerfTimer("meters_update", perf_stats):
                for meter in self.meters:
                    meter.update(model_output_cpu,
                                 target.detach().cpu(),
                                 is_train=self.train)
            # After both loss and meters are updated, we run hooks. Among hooks,
            # `LossLrMeterLoggingHook` will log both loss and meter status
            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_loss_and_meter.name)

        num_samples_in_step = self.get_global_batchsize()
        self.num_samples_this_phase += num_samples_in_step

        # For training phases, run backwards pass / update optimizer
        if self.train:
            with PerfTimer("backward", perf_stats):
                self.optimizer.backward(local_variables["local_loss"])

            self.run_hooks(local_variables,
                           ClassyHookFunctions.on_backward.name)

            self.optimizer.update_schedule_on_step(self.where)
            with PerfTimer("optimizer_step", perf_stats):
                self.optimizer.step()

            self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

            self.num_updates += num_samples_in_step

        timer_train_step.stop()
        timer_train_step.record()
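PerfTimer is used both as a context manager and through explicit start()/stop()/record() calls. A minimal sketch with that interface, assuming perf_stats behaves like a dict of accumulated seconds (the library's actual PerfStats object may differ):

import time


class PerfTimer:
    """Accumulates elapsed wall-clock time for a named section."""

    def __init__(self, name, perf_stats=None):
        self.name = name
        self.perf_stats = perf_stats  # assumed dict-like accumulator, or None
        self.elapsed = 0.0
        self._start = None

    def start(self):
        self._start = time.perf_counter()

    def stop(self):
        self.elapsed += time.perf_counter() - self._start

    def record(self):
        if self.perf_stats is not None:
            self.perf_stats[self.name] = self.perf_stats.get(self.name, 0.0) + self.elapsed

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()
        self.record()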
Example No. 12
    def train_step(self, use_gpu, local_variables=None):
        """Train step to be executed in train loop

        Args:
            use_gpu: if true, execute training on GPU
            local_variables: Dict containing intermediate values
                in train_step for access by hooks
        """

        if local_variables is None:
            local_variables = {}

        # Process next sample
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]), (
                f"Returned sample [{sample}] is not a map with 'input' and" +
                "'target' keys")

        # Copy sample to GPU
        local_variables["target"] = local_variables["sample"]["target"]
        if use_gpu:
            for key, value in local_variables["sample"].items():
                local_variables["sample"][key] = recursive_copy_to_gpu(
                    value, non_blocking=True)

        with torch.enable_grad():
            # Forward pass
            local_variables["output"] = self.model(
                local_variables["sample"]["input"])

            local_variables["local_loss"] = self.compute_loss(
                local_variables["output"], local_variables["sample"])

            local_variables["loss"] = local_variables["local_loss"].detach(
            ).clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

            self.losses.append(local_variables["loss"].data.cpu().item() *
                               local_variables["target"].size(0))

            self.update_meters(local_variables["output"],
                               local_variables["sample"])

        # Run backwards pass / update optimizer
        if self.amp_opt_level is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(local_variables["local_loss"],
                                     self.optimizer.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_variables["local_loss"])

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

        self.num_updates += self.get_global_batchsize()
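The apex branch above scales the loss before calling backward. For comparison, the PyTorch-native AMP pattern (the AmpType.PYTORCH path in Example No. 9) looks roughly like the following sketch; model, optimizer and loss_fn are placeholders:

import torch

scaler = torch.cuda.amp.GradScaler()


def amp_train_step(model, optimizer, inputs, targets, loss_fn):
    """One optimizer step with PyTorch-native mixed precision."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        output = model(inputs)
        loss = loss_fn(output, targets)
    scaler.scale(loss).backward()   # backward on the scaled loss
    scaler.step(optimizer)          # unscales gradients, then optimizer.step()
    scaler.update()                 # adjusts the scale factor for the next step
    return loss.detach()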