def backward(ctx, grad_output):
    # Mini-batch mean & var are calculated by the forward path:
    #   mu = 1. / N * np.sum(h, axis=0)
    #   var = 1. / N * np.sum((h - mu) ** 2, axis=0)
    last_input, mean, var = ctx.saved_tensors
    eps = ctx.eps
    grad_input = None
    num_features = mean.size()[0]

    # calculate grad_input
    if ctx.needs_input_grad[0]:
        # dh = gamma * (var + eps)**(-1. / 2.) * (dy - np.mean(dy, axis=0)
        #      - (h - mu) * (var + eps)**(-1.0) * np.mean(dy * (h - mu), axis=0))
        mean_dy = grad_output.mean(0)
        mean_dy_xmu = (
            (grad_output * (last_input - mean)).view(-1, num_features).mean(0)
        )
        # If running in a distributed setting, perform mean reduction of
        # tensors over all processes.
        mean_dy = all_reduce_mean(mean_dy)
        mean_dy_xmu = all_reduce_mean(mean_dy_xmu)
        grad_input = (
            grad_output
            - mean_dy
            - (last_input - mean) / (var + eps) * mean_dy_xmu
        ) / torch.sqrt(var + eps)

    return grad_input, None
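# A hand-written backward like the one above can be validated numerically with
# torch.autograd.gradcheck on the full autograd.Function (this backward
# presumably pairs with the forward shown below, via the SyncNormalizeFunction
# class referenced in the loss further down). In a single process,
# all_reduce_mean reduces to a no-op, so the check is local:
#
#   inp = torch.randn(4, 3, dtype=torch.double, requires_grad=True)
#   torch.autograd.gradcheck(
#       lambda x: SyncNormalizeFunction.apply(x, 1e-5), (inp,)
#   )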
def synchronize_losses(self):
    """Average the losses across the different replicas"""
    # Average losses across nodes
    losses_tensor = torch.tensor(self.losses)
    synchronized_losses_tensor = all_reduce_mean(losses_tensor)
    self.losses = synchronized_losses_tensor.tolist()
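# all_reduce_mean is used throughout this section but never defined in it.
# A minimal sketch, assuming torch.distributed is the backend (the real
# helper may differ in detail):
import torch
import torch.distributed as dist


def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """Average a tensor over all processes; a no-op outside distributed runs."""
    if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        tensor.div_(dist.get_world_size())
    return tensor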
def eval_step(self, use_gpu):
    self.last_batch = None

    # Process next sample
    sample = next(self.get_data_iterator())
    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if use_gpu:
        for key, value in sample.items():
            sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

    with torch.no_grad():
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()
        loss = all_reduce_mean(loss)

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss, output=output, target=target, sample=sample
    )
def train_step(self, use_gpu):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
    """
    self.last_batch = None

    # Process next sample
    sample = next(self.get_data_iterator())
    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if use_gpu:
        for key, value in sample.items():
            sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

    with torch.enable_grad():
        # Forward pass
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()
        loss = all_reduce_mean(loss)

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

        # Run backwards pass / update optimizer
        if self.amp_args is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(
                local_loss, self.optimizer.optimizer
            ) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_loss)

        self.check_inf_nan(loss)

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

    self.num_updates += self.get_global_batchsize()

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss, output=output, target=target, sample=sample
    )
def forward(ctx, input, eps):
    with torch.no_grad():
        local_mean = torch.mean(input, 0)
        local_sqr_mean = torch.pow(input, 2).mean(0)

        # If running in a distributed setting, perform mean reduction of
        # tensors over all processes.
        mean = all_reduce_mean(local_mean)
        sqr_mean = all_reduce_mean(local_sqr_mean)

        # var(x) = E((x - mean_x) ** 2)
        #        = 1 / N * sum((x - mean_x) ** 2)
        #        = 1 / N * sum(x ** 2) - mean_x ** 2
        var = sqr_mean - mean.pow(2)

    ctx.save_for_backward(input, mean, var)
    ctx.eps = eps

    return (input - mean) / torch.sqrt(var + eps)
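# A quick numerical check of the identity used above,
# var(x) = E[x ** 2] - E[x] ** 2 (the biased / population variance),
# on a toy tensor in double precision:
x = torch.randn(8, 4, dtype=torch.float64)
assert torch.allclose(
    x.var(0, unbiased=False), x.pow(2).mean(0) - x.mean(0).pow(2)
)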
def sync_memory(self):
    """
    Sync memory across all processes before the first forward pass. Only
    needed in the distributed case. After the first forward pass, the
    update_memory function in NCEAverage does a gather over all embeddings,
    so memory stays in sync. Doing a gather over embeddings is
    O(batch size), while syncing memory is O(num items in memory).
    Generally, batch size << num items in memory, so we prefer doing the
    syncs in update_memory.
    """
    self.nce_average.memory = all_reduce_mean(self.nce_average.memory)
    logging.info(f"Rank: {get_rank()}: Memory synced")
    # Set to True once we are done; the forward pass in nce_average will
    # sync after this.
    self.init_sync_memory = True
def forward(self, embedding: torch.Tensor) -> torch.Tensor:
    """
    Calculate the loss. Operates on embeddings tensor.

    Args:
        embedding (torch.Tensor): NxEMBEDDING_DIM
            Must contain the concatenated embeddings of the two image copies:
            [emb_img1_0, emb_img2_0, ....., emb_img1_1, emb_img2_1, ...]
    """
    assert embedding.ndim == 2 and embedding.shape[1] == int(self.embedding_dim), (
        f"Incorrect embedding shape: {embedding.shape} "
        f"but expected Nx{self.embedding_dim}"
    )

    batch_size = embedding.shape[0]
    assert batch_size % self.num_copies == 0, (
        f"Batch size {batch_size} should be divisible by "
        f"num_copies ({self.num_copies})."
    )

    # Normalize embeddings along the batch dimension
    embedding_normed = SyncNormalizeFunction.apply(embedding, self.eps)

    # Split embedding between copies
    embedding_normed_a, embedding_normed_b = torch.split(
        embedding_normed,
        split_size_or_sections=batch_size // self.num_copies,
        dim=0,
    )

    # Cross-correlation matrix
    correlation_matrix = torch.mm(embedding_normed_a.T, embedding_normed_b) / (
        batch_size / self.num_copies
    )

    # Reduce cross-correlation matrices from all processes
    correlation_matrix = all_reduce_mean(correlation_matrix)

    # Loss: pull diagonal entries toward 1, push off-diagonal entries toward 0
    on_diag = (
        torch.diagonal(correlation_matrix).add(-1).pow(2).sum().mul(self.scale_loss)
    )
    off_diag = (
        self._off_diagonal(correlation_matrix).pow(2).sum().mul(self.scale_loss)
    )
    loss = on_diag + self.lambda_ * off_diag
    return loss
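# _off_diagonal is referenced above but not defined in this section. A minimal
# sketch of a common implementation of this helper (an assumption, not
# necessarily this codebase's exact code):
def _off_diagonal(self, x: torch.Tensor) -> torch.Tensor:
    """Return a flattened view of all off-diagonal elements of a square matrix."""
    n, m = x.shape
    assert n == m, "expected a square matrix"
    # Dropping the last element and viewing the remaining n**2 - 1 entries as
    # (n - 1) x (n + 1) shifts every diagonal element into column 0, so
    # slicing that column off leaves exactly the off-diagonal entries.
    return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()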
def eval_step(self, use_gpu, local_variables=None):
    if local_variables is None:
        local_variables = {}

    # Process next sample
    sample = next(self.get_data_iterator())
    local_variables["sample"] = sample

    assert (
        isinstance(local_variables["sample"], dict)
        and "input" in local_variables["sample"]
        and "target" in local_variables["sample"]
    ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    with torch.no_grad():
        local_variables["output"] = self.model(local_variables["sample"]["input"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_forward.name)

        local_variables["local_loss"] = self.compute_loss(
            local_variables["output"], local_variables["sample"]
        )

        local_variables["loss"] = local_variables["local_loss"].detach().clone()
        local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        self.update_meters(local_variables["output"], local_variables["sample"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_loss_and_meter.name)
def standard_train_step(task):
    """
    Single training iteration loop of the model.

    Performs: data read, forward, loss computation, backward, optimizer step,
    parameter updates.

    Various intermediate steps are also performed:
    - logging the training loss, training eta, LR, etc. to loggers
    - logging to tensorboard
    - performing any self-supervised method specific operations (like in the
      MoCo approach, the momentum encoder is updated), computing the scores
      in SwAV
    - checkpointing the model if the user wants to checkpoint in the middle
      of an epoch
    """
    assert isinstance(task, ClassyTask), "task is not instance of ClassyTask"

    # Reset the last batch info at every step
    task.last_batch = LastBatchInfo()

    # We'll time train_step and some of its sections, and accumulate values
    # into perf_stats if it were defined in local_variables:
    perf_stats = task.perf_stats
    timer_train_step = PerfTimer("train_step_total", perf_stats)
    timer_train_step.start()

    # Process next sample
    with PerfTimer("read_sample", perf_stats):
        sample = next(task.data_iterator)

    sample = construct_sample_for_model(sample, task)

    # Only need gradients during training
    grad_context = torch.enable_grad() if task.train else torch.no_grad()
    ddp_context = (
        task.model.no_sync()
        if task.enable_manual_gradient_reduction
        else contextlib.suppress()
    )
    torch_amp_context = (
        torch.cuda.amp.autocast()
        if task.amp_type == AmpType.PYTORCH
        else contextlib.suppress()
    )

    with grad_context, ddp_context, torch_amp_context:
        # Forward pass of the model
        with PerfTimer("forward", perf_stats):
            if task.enable_manual_gradient_reduction:
                # Manually sync params and buffers for DDP.
                manual_sync_params(task.model)
            model_output = task.model(sample["input"])

        # If the model outputs only one tensor, we take it out of the list.
        if len(model_output) == 1:
            model_output = model_output[0]

        task.last_batch.sample = sample
        task.last_batch.model_output = model_output
        target = sample["target"]

        # Run hooks on forward pass
        task.run_hooks(SSLClassyHookFunctions.on_forward.name)

        # Compute loss
        with PerfTimer("loss_compute", perf_stats):
            local_loss = task.loss(model_output, target)

        # Reduce the loss value across all nodes and gpus.
        with PerfTimer("loss_all_reduce", perf_stats):
            loss = local_loss.detach().clone()
            task.last_batch.loss = all_reduce_mean(loss)

        task.losses.append(task.last_batch.loss.data.cpu().item() * target.size(0))

        # Update meters
        if len(task.meters) > 0 and (
            (task.train and task.config["METERS"]["enable_training_meter"])
            or (not task.train)
        ):
            with PerfTimer("meters_update", perf_stats):
                if isinstance(model_output, list):
                    model_output_cpu = [x.cpu() for x in model_output]
                else:
                    model_output_cpu = model_output.cpu()

                for meter in task.meters:
                    meter.update(model_output_cpu, target.detach().cpu())

        task.last_batch.model_output = model_output
        task.last_batch.target = target

        # Update the iteration number, check that the loss is not NaN, and
        # measure the batch time now if this is a test phase, since a test
        # phase has no update step.
        task.run_hooks(SSLClassyHookFunctions.on_loss_and_meter.name)

    # Run backward now and update the optimizer
    if task.train:
        with PerfTimer("backward", perf_stats):
            task.optimizer.zero_grad()
            if task.amp_type == AmpType.APEX:
                with apex.amp.scale_loss(
                    local_loss, task.optimizer.optimizer
                ) as scaled_loss:
                    scaled_loss.backward()
                    if task.enable_manual_gradient_reduction:
                        manual_gradient_all_reduce(task.model)
            elif task.amp_type == AmpType.PYTORCH:
                task.amp_grad_scaler.scale(local_loss).backward()
                if task.enable_manual_gradient_reduction:
                    manual_gradient_all_reduce(task.model)
            else:
                local_loss.backward()
                if task.enable_manual_gradient_reduction:
                    manual_gradient_all_reduce(task.model)

        task.run_hooks(SSLClassyHookFunctions.on_backward.name)

        # Stepping the optimizer also updates learning rate, momentum etc.
        # according to the schedulers (if any).
        with PerfTimer("optimizer_step", perf_stats):
            assert task.where < 1.0, (
                "Optimizer being called with where=1.0. This should not happen "
                "as where=1.0 means training is already finished. Please debug "
                "your training setup. A common issue is with resuming the data "
                "sampler: you are checkpointing the model at every iteration "
                "but not using the stateful data sampler, OR the data sampler "
                "is not being resumed properly."
            )
            if task.amp_type == AmpType.PYTORCH:
                task.amp_grad_scaler.step(task.optimizer, where=task.where)
                task.amp_grad_scaler.update()
            else:
                task.optimizer.step(where=task.where)

        task.run_hooks(SSLClassyHookFunctions.on_update.name)
        task.num_updates += task.get_global_batchsize()

    timer_train_step.stop()
    timer_train_step.record()

    return task
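# manual_sync_params and manual_gradient_all_reduce are referenced above but
# not defined in this section. A minimal sketch of the gradient reduction,
# assuming its job is to average gradients across workers when DDP's
# automatic reduction is disabled via task.model.no_sync() (an assumption,
# not the actual implementation):
import torch


def manual_gradient_all_reduce(model: torch.nn.Module) -> None:
    """Average every parameter gradient across all processes."""
    for param in model.parameters():
        if param.grad is not None:
            param.grad.data = all_reduce_mean(param.grad.data)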
def train_step(self, use_gpu, local_variables=None):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
        local_variables: Dict containing intermediate values in train_step
            for access by hooks
    """
    from classy_vision.hooks import ClassyHookFunctions

    if local_variables is None:
        local_variables = {}

    # Process next sample
    sample = next(self.get_data_iterator())
    local_variables["sample"] = sample

    assert (
        isinstance(local_variables["sample"], dict)
        and "input" in local_variables["sample"]
        and "target" in local_variables["sample"]
    ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    # Only need gradients during training
    context = torch.enable_grad() if self.train else torch.no_grad()
    with context:
        # Forward pass
        local_variables["output"] = self.model(local_variables["sample"]["input"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_forward.name)

        local_variables["local_loss"] = self.compute_loss(
            local_variables["output"], local_variables["sample"]
        )

        # NOTE: This performs an all_reduce_mean() on the losses across the
        # replicas. The reduce should ideally be weighted by the length of
        # the targets on each replica. This will only be an issue when
        # there are dummy samples present (once an epoch) and will only
        # impact the loss reporting (slightly).
        local_variables["loss"] = local_variables["local_loss"].detach().clone()
        local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        self.update_meters(local_variables["output"], local_variables["sample"])

        # After both loss and meters are updated, we run hooks. Among hooks,
        # `LossLrMeterLoggingHook` will log both loss and meter status
        self.run_hooks(local_variables, ClassyHookFunctions.on_loss_and_meter.name)

        num_samples_in_step = self.get_global_batchsize()
        self.num_samples_this_phase += num_samples_in_step

        # For training phases, run backwards pass / update optimizer
        if self.train:
            if self.amp_opt_level is not None:
                self.optimizer.zero_grad()
                with apex.amp.scale_loss(
                    local_variables["local_loss"], self.optimizer.optimizer
                ) as scaled_loss:
                    scaled_loss.backward()
            else:
                self.optimizer.backward(local_variables["local_loss"])

            self.optimizer.update_schedule_on_step(self.where)
            self.optimizer.step()

            self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

            self.num_updates += num_samples_in_step
def train_step(self, use_gpu, local_variables=None):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
        local_variables: Dict containing intermediate values in train_step
            for access by hooks
    """
    from classy_vision.hooks import ClassyHookFunctions

    if local_variables is None:
        local_variables = {}

    # We'll time train_step and some of its sections, and accumulate values
    # into perf_stats if it were defined in local_variables:
    perf_stats = local_variables.get("perf_stats", None)
    timer_train_step = PerfTimer("train_step_total", perf_stats)
    timer_train_step.start()

    # Process next sample
    with PerfTimer("read_sample", perf_stats):
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]
        ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

        self.run_hooks(local_variables, ClassyHookFunctions.on_sample.name)

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    # Only need gradients during training
    context = torch.enable_grad() if self.train else torch.no_grad()
    with context:
        # Forward pass
        with PerfTimer("forward", perf_stats):
            local_variables["output"] = self.model(
                local_variables["sample"]["input"]
            )

        self.run_hooks(local_variables, ClassyHookFunctions.on_forward.name)

        model_output = local_variables["output"]
        target = local_variables["sample"]["target"]
        local_variables["local_loss"] = self.loss(model_output, target)

        # NOTE: This performs an all_reduce_mean() on the losses across the
        # replicas. The reduce should ideally be weighted by the length of
        # the targets on each replica. This will only be an issue when
        # there are dummy samples present (once an epoch) and will only
        # impact the loss reporting (slightly).
        with PerfTimer("loss_allreduce", perf_stats):
            local_variables["loss"] = local_variables["local_loss"].detach().clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        model_output_cpu = model_output.cpu() if use_gpu else model_output

        # Update meters
        with PerfTimer("meters_update", perf_stats):
            for meter in self.meters:
                meter.update(
                    model_output_cpu, target.detach().cpu(), is_train=self.train
                )

        # After both loss and meters are updated, we run hooks. Among hooks,
        # `LossLrMeterLoggingHook` will log both loss and meter status
        self.run_hooks(local_variables, ClassyHookFunctions.on_loss_and_meter.name)

        num_samples_in_step = self.get_global_batchsize()
        self.num_samples_this_phase += num_samples_in_step

        # For training phases, run backwards pass / update optimizer
        if self.train:
            with PerfTimer("backward", perf_stats):
                self.optimizer.backward(local_variables["local_loss"])

            self.run_hooks(local_variables, ClassyHookFunctions.on_backward.name)

            self.optimizer.update_schedule_on_step(self.where)
            with PerfTimer("optimizer_step", perf_stats):
                self.optimizer.step()

            self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

            self.num_updates += num_samples_in_step

    timer_train_step.stop()
    timer_train_step.record()
def train_step(self, use_gpu, local_variables=None):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
        local_variables: Dict containing intermediate values in train_step
            for access by hooks
    """
    if local_variables is None:
        local_variables = {}

    # Process next sample
    sample = next(self.get_data_iterator())
    local_variables["sample"] = sample

    assert (
        isinstance(local_variables["sample"], dict)
        and "input" in local_variables["sample"]
        and "target" in local_variables["sample"]
    ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    with torch.enable_grad():
        # Forward pass
        local_variables["output"] = self.model(local_variables["sample"]["input"])

        local_variables["local_loss"] = self.compute_loss(
            local_variables["output"], local_variables["sample"]
        )

        local_variables["loss"] = local_variables["local_loss"].detach().clone()
        local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        self.update_meters(local_variables["output"], local_variables["sample"])

        # Run backwards pass / update optimizer
        if self.amp_opt_level is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(
                local_variables["local_loss"], self.optimizer.optimizer
            ) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_variables["local_loss"])

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

    self.num_updates += self.get_global_batchsize()