def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
    data_dict = next(data_generator)
    data = data_dict['data']
    target = data_dict['target']

    data = maybe_to_torch(data)
    target = maybe_to_torch(target)

    if torch.cuda.is_available():
        data = to_cuda(data)
        target = to_cuda(target)

    self.optimizer.zero_grad()

    if self.fp16:
        with autocast():
            # the network computes the CE and Dice terms internally and, if requested,
            # also returns hard tp/fp/fn counts for online evaluation
            ret = self.network(data, target, return_hard_tp_fp_fn=run_online_evaluation)
            if run_online_evaluation:
                ces, tps, fps, fns, tp_hard, fp_hard, fn_hard = ret
                self.run_online_evaluation(tp_hard, fp_hard, fn_hard)
            else:
                ces, tps, fps, fns = ret
            del data, target
            l = self.compute_loss(ces, tps, fps, fns)

        if do_backprop:
            self.amp_grad_scaler.scale(l).backward()
            self.amp_grad_scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
            self.amp_grad_scaler.step(self.optimizer)
            self.amp_grad_scaler.update()
    else:
        ret = self.network(data, target, return_hard_tp_fp_fn=run_online_evaluation)
        if run_online_evaluation:
            ces, tps, fps, fns, tp_hard, fp_hard, fn_hard = ret
            self.run_online_evaluation(tp_hard, fp_hard, fn_hard)
        else:
            ces, tps, fps, fns = ret
        del data, target
        l = self.compute_loss(ces, tps, fps, fns)

        if do_backprop:
            l.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
            self.optimizer.step()

    return l.detach().cpu().numpy()
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
    """
    Identical to the standard training iteration, except that gradients are clipped
    (max norm 12) before the optimizer step, which improves training stability.
    :param data_generator:
    :param do_backprop:
    :param run_online_evaluation:
    :return:
    """
    data_dict = next(data_generator)
    data = data_dict['data']
    target = data_dict['target']

    data = maybe_to_torch(data)
    target = maybe_to_torch(target)

    if torch.cuda.is_available():
        data = to_cuda(data)
        target = to_cuda(target)

    self.optimizer.zero_grad()

    if self.fp16:
        with autocast():
            output = self.network(data)
            del data
            l = self.loss(output, target)

        if do_backprop:
            self.amp_grad_scaler.scale(l).backward()
            # gradients must be unscaled before clipping, otherwise the norm would be
            # computed on the loss-scaled gradients
            self.amp_grad_scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
            self.amp_grad_scaler.step(self.optimizer)
            self.amp_grad_scaler.update()
    else:
        output = self.network(data)
        del data
        l = self.loss(output, target)

        if do_backprop:
            l.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
            self.optimizer.step()

    if run_online_evaluation:
        self.run_online_evaluation(output, target)

    del target

    return l.detach().cpu().numpy()
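# The snippet below is a minimal, self-contained sketch (not part of this trainer) of the
# torch.cuda.amp pattern used in the fp16 branch above: scale the loss, backprop, unscale
# the gradients, clip, then step and update. clip_grad_norm_ has to see the unscaled
# gradients, otherwise the max norm of 12 would be compared against gradients multiplied by
# the (typically large) loss scale. The model, optimizer, data and loss_fn arguments here
# are hypothetical stand-ins, not names from this codebase.
import torch
from torch.cuda.amp import GradScaler, autocast


def amp_clipped_step(model, optimizer, scaler, data, target, loss_fn, max_norm=12):
    optimizer.zero_grad()
    with autocast():
        output = model(data)
        loss = loss_fn(output, target)
    scaler.scale(loss).backward()        # gradients are now multiplied by the loss scale
    scaler.unscale_(optimizer)           # bring gradients back to their true magnitude
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)               # skips the step if inf/nan gradients were found
    scaler.update()
    return loss.detach()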
def _internal_maybe_mirror_and_pred_2D(self, x: Union[np.ndarray, torch.Tensor], mirror_axes: tuple,
                                        do_mirroring: bool = True,
                                        mult: Union[np.ndarray, torch.Tensor] = None) -> torch.Tensor:
    # everything in here takes place on the GPU. If x and mult are not yet on GPU this will be taken care of here
    # we now return a cuda tensor! Not numpy array!
    assert len(x.shape) == 4, 'x must be (b, c, x, y)'

    x = to_cuda(maybe_to_torch(x), gpu_id=self.get_device())
    result_torch = torch.zeros([x.shape[0], self.num_classes] + list(x.shape[2:]),
                               dtype=torch.float).cuda(self.get_device(), non_blocking=True)

    if mult is not None:
        mult = to_cuda(maybe_to_torch(mult), gpu_id=self.get_device())

    if do_mirroring:
        mirror_idx = 4
        num_results = 2 ** len(mirror_axes)
    else:
        mirror_idx = 1
        num_results = 1

    # accumulate each (back-flipped) prediction with weight 1 / num_results so that
    # result_torch ends up being the average over all mirror variants
    for m in range(mirror_idx):
        if m == 0:
            pred = self.inference_apply_nonlin(self(x))
            result_torch += 1 / num_results * pred

        if m == 1 and (1 in mirror_axes):
            pred = self.inference_apply_nonlin(self(torch.flip(x, (3,))))
            result_torch += 1 / num_results * torch.flip(pred, (3,))

        if m == 2 and (0 in mirror_axes):
            pred = self.inference_apply_nonlin(self(torch.flip(x, (2,))))
            result_torch += 1 / num_results * torch.flip(pred, (2,))

        if m == 3 and (0 in mirror_axes) and (1 in mirror_axes):
            pred = self.inference_apply_nonlin(self(torch.flip(x, (3, 2))))
            result_torch += 1 / num_results * torch.flip(pred, (3, 2))

    if mult is not None:
        result_torch[:, :] *= mult

    return result_torch
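# Minimal sketch of the same mirroring idea in isolation (not part of this class). It assumes
# a hypothetical `model` mapping (b, c, x, y) inputs to (b, num_classes, x, y) logits and a
# `nonlin` such as a softmax over dim 1: run the network on each requested spatial flip, flip
# the prediction back, and average over all mirror variants.
import torch


def predict_with_mirroring_2d(model, nonlin, x, mirror_axes=(0, 1)):
    # spatial axis 0 -> tensor dim 2, spatial axis 1 -> tensor dim 3
    flip_variants = [()]
    if 1 in mirror_axes:
        flip_variants.append((3,))
    if 0 in mirror_axes:
        flip_variants.append((2,))
    if 0 in mirror_axes and 1 in mirror_axes:
        flip_variants.append((2, 3))

    result = None
    with torch.no_grad():
        for dims in flip_variants:
            inp = torch.flip(x, dims) if dims else x
            pred = nonlin(model(inp))
            pred = torch.flip(pred, dims) if dims else pred
            result = pred if result is None else result + pred
    return result / len(flip_variants)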
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
    data_dict = next(data_generator)
    data = data_dict['data']
    target = data_dict['target']

    data = maybe_to_torch(data)
    target = maybe_to_torch(target)

    if torch.cuda.is_available():
        data = to_cuda(data)
        target = to_cuda(target)

    self.optimizer.zero_grad()

    if self.fp16:
        with autocast():
            output = self.network(data)
            del data
            l = self.loss(output, target)

        if do_backprop:
            self.amp_grad_scaler.scale(l).backward()
            self.amp_grad_scaler.step(self.optimizer)
            self.amp_grad_scaler.update()
    else:
        output = self.network(data)
        del data
        l = self.loss(output, target)

        if do_backprop:
            l.backward()
            self.optimizer.step()

    if run_online_evaluation:
        self.run_online_evaluation(output, target)

    del target

    return l.detach().cpu().numpy()
def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False):
    # this trainer has not been ported to torch.cuda.amp yet; the body below is therefore
    # unreachable and kept for reference only
    raise NotImplementedError("this class has not been changed to work with pytorch amp yet!")

    data_dict = next(data_generator)
    data = data_dict['data']
    target = data_dict['target']

    data = maybe_to_torch(data)
    target = maybe_to_torch(target)

    if torch.cuda.is_available():
        data = to_cuda(data, gpu_id=None)
        target = to_cuda(target, gpu_id=None)

    self.optimizer.zero_grad()

    output = self.network(data)
    del data

    # deep supervision: accumulate the weighted CE + Dice loss over all output resolutions
    total_loss = None
    for i in range(len(output)):
        axes = tuple(range(2, len(output[i].size())))

        # the network outputs raw logits; apply sigmoid before computing the Dice terms
        output_softmax = torch.sigmoid(output[i])

        # get the tp, fp and fn terms we need
        tp, fp, fn, _ = get_tp_fp_fn_tn(output_softmax, target[i], axes, mask=None)

        # for dice, compute nominator and denominator so that we have to accumulate only 2 instead of 3 variables
        # do_bg=False in tuframeworkTrainer -> [:, 1:]
        nominator = 2 * tp[:, 1:]
        denominator = 2 * tp[:, 1:] + fp[:, 1:] + fn[:, 1:]

        if self.batch_dice:
            # for DDP we need to gather all nominator and denominator terms from all GPUs to do proper batch dice
            nominator = awesome_allgather_function.apply(nominator)
            denominator = awesome_allgather_function.apply(denominator)
            nominator = nominator.sum(0)
            denominator = denominator.sum(0)
        else:
            pass

        ce_loss = self.ce_loss(output[i], target[i])

        # we smooth by 1e-5 to penalize false positives if tp is 0
        dice_loss = (-(nominator + 1e-5) / (denominator + 1e-5)).mean()

        if total_loss is None:
            total_loss = self.ds_loss_weights[i] * (ce_loss + dice_loss)
        else:
            total_loss += self.ds_loss_weights[i] * (ce_loss + dice_loss)

    if run_online_evaluation:
        with torch.no_grad():
            # evaluate only the highest-resolution output
            output = output[0]
            target = target[0]
            out_sigmoid = torch.sigmoid(output)
            out_sigmoid = (out_sigmoid > 0.5).float()

            if self.threeD:
                axes = (2, 3, 4)
            else:
                axes = (2, 3)

            tp, fp, fn, _ = get_tp_fp_fn_tn(out_sigmoid, target, axes=axes)

            tp_hard = awesome_allgather_function.apply(tp)
            fp_hard = awesome_allgather_function.apply(fp)
            fn_hard = awesome_allgather_function.apply(fn)

            self.run_online_evaluation(tp_hard.detach().cpu().numpy().sum(0),
                                       fp_hard.detach().cpu().numpy().sum(0),
                                       fn_hard.detach().cpu().numpy().sum(0))

    del target

    if do_backprop:
        if not self.fp16 or amp is None or not torch.cuda.is_available():
            total_loss.backward()
        else:
            with amp.scale_loss(total_loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        _ = clip_grad_norm_(self.network.parameters(), 12)
        self.optimizer.step()

    return total_loss.detach().cpu().numpy()
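# Minimal sketch of the Dice arithmetic used above, restated outside the trainer. The
# cross-rank all-gather is omitted here; in the DDP trainer, `awesome_allgather_function`
# pools the nominator and denominator over all GPUs before the division. `soft_pred` and
# `onehot_target` are hypothetical (b, c, x, y[, z]) tensors with the nonlinearity already
# applied to the prediction.
import torch


def soft_dice_loss(soft_pred, onehot_target, batch_dice=False, smooth=1e-5):
    axes = tuple(range(2, soft_pred.ndim))            # reduce over spatial axes only
    tp = (soft_pred * onehot_target).sum(axes)
    fp = (soft_pred * (1 - onehot_target)).sum(axes)
    fn = ((1 - soft_pred) * onehot_target).sum(axes)

    # drop the background channel and keep only nominator/denominator
    # (2 accumulators instead of 3)
    nominator = 2 * tp[:, 1:]
    denominator = 2 * tp[:, 1:] + fp[:, 1:] + fn[:, 1:]

    if batch_dice:
        # treat the whole (gathered) batch as one large sample
        nominator = nominator.sum(0)
        denominator = denominator.sum(0)

    # smoothing keeps the term defined (and penalizes false positives) when tp == 0
    return (-(nominator + smooth) / (denominator + smooth)).mean()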