def __init__(self) -> None:
    self.prev = torch.is_anomaly_enabled()
    # Note: this variant only records the previous state and warns; the actual
    # toggle happens outside __init__ (in PyTorch's detect_anomaly it is done
    # in __enter__ via torch.set_anomaly_enabled(True)).
    warnings.warn(
        'Anomaly Detection has been enabled. '
        'This mode will increase the runtime '
        'and should only be enabled for debugging.',
        stacklevel=2)

def _setup_gpus(self, seed: float, detect_anomaly: bool):
    utils.setup_cuda(seed, self.local_rank)

    torch.autograd.set_detect_anomaly(detect_anomaly)
    self._log_info({'set_detect_anomaly': detect_anomaly,
                    'is_anomaly_enabled': torch.is_anomaly_enabled()})

    self._log_info({'gpu_names': utils.cuda_device_names(),
                    'gpu_count': torch.cuda.device_count(),
                    'CUDA_VISIBLE_DEVICES': os.environ.get('CUDA_VISIBLE_DEVICES', 'NotSet'),
                    'cudnn.enabled': cudnn.enabled,
                    'cudnn.benchmark': cudnn.benchmark,
                    'cudnn.deterministic': cudnn.deterministic,
                    'cudnn.version': cudnn.version()})

    self._log_info({'memory': str(psutil.virtual_memory())})
    self._log_info({'CPUs': str(psutil.cpu_count())})

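# utils.setup_cuda is referenced above but not shown. A minimal sketch of what
# such a helper might do, assuming it seeds every RNG and offsets the seed by
# rank so distributed workers draw different streams (the body below is an
# assumption, not the project's actual helper):
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn


def setup_cuda(seed: float, local_rank: int) -> None:
    seed = int(seed) + local_rank    # assumed per-process offset
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)          # seeds CPU and all CUDA devices
    cudnn.benchmark = True           # faster conv autotuning, not deterministic
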
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # DEBUG(rjbruin)
        if len(targets) == 0:
            raise ValueError("There are still samples with zero targets!")

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        # DEBUG: only catch NaN loss if we don't have anomaly detection enabled
        if not torch.is_anomaly_enabled() and not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

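# A hedged usage sketch for train_one_epoch, assuming the torchvision detection
# reference setup; the model choice, hyperparameters, and `dataset` object are
# illustrative assumptions, not taken from the code above:
import torch
import torchvision

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None,
                                                             num_classes=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Detection batches hold variable numbers of boxes per image, so collate into
# tuples instead of stacking tensors; `dataset` is assumed to yield
# (image, target) pairs.
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch)))

for epoch in range(10):
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
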
def __init__(self, mode: bool) -> None:
    self.prev = torch.is_anomaly_enabled()
    self.mode = mode
    torch.set_anomaly_enabled(mode)
    if self.mode:
        warnings.warn(
            'Anomaly Detection has been enabled. '
            'This mode will increase the runtime '
            'and should only be enabled for debugging.',
            stacklevel=2)

    self.stream = io.BytesIO()
    # The original fd stderr points to. Usually 2 on POSIX systems.
    self.stderr_fd_origin = sys.stderr.fileno()
    # Make a copy of the original stderr fd in stderr_fd_copy
    self.stderr_fd_copy = os.dup(self.stderr_fd_origin)
    # Create a temporary file and redirect stderr to it
    self.tfile = tempfile.TemporaryFile(mode='w+b')
    self._redirect_stderr(self.tfile.fileno())

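# _redirect_stderr is referenced above but not shown. A plausible sketch,
# assuming it simply points the stderr file descriptor at the given fd with
# os.dup2 (this body is an assumption, not the original helper):
def _redirect_stderr(self, to_fd: int) -> None:
    sys.stderr.flush()                     # don't lose buffered Python output
    os.dup2(to_fd, self.stderr_fd_origin)  # fd 2 now writes to the temp file,
                                           # including output from C extensions
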
def __init__(self, mode):
    self.prev = torch.is_anomaly_enabled()
    torch.set_anomaly_enabled(mode)

def __init__(self):
    self.prev = torch.is_anomaly_enabled()

def __init__(self, mode: bool) -> None:
    self.prev = torch.is_anomaly_enabled()
    torch.set_anomaly_enabled(mode)

def __init__(self, mode: bool, check_nan: bool = True) -> None:
    self.prev = torch.is_anomaly_enabled()
    self.prev_check_nan = torch.is_anomaly_check_nan_enabled()
    torch.set_anomaly_enabled(mode, check_nan)

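# The check_nan flag (available in newer PyTorch releases alongside
# torch.is_anomaly_check_nan_enabled) controls whether anomaly mode also raises
# when backward produces NaN gradients. A small illustration:
import torch

with torch.autograd.detect_anomaly(check_nan=True):
    x = torch.tensor([0.0], requires_grad=True)
    y = torch.sqrt(x)      # forward is fine: sqrt(0) == 0
    z = (0.0 * y).sum()    # backward through sqrt at 0 yields 0/0 == NaN
    z.backward()           # raises: SqrtBackward0 returned nan values
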
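# All of the __init__ variants above belong to context managers in the style of
# torch.autograd.set_detect_anomaly: they record the previous state so __exit__
# can restore it. A minimal sketch of the full protocol (the class name is
# illustrative):
import torch


class SetDetectAnomaly:
    def __init__(self, mode: bool) -> None:
        self.prev = torch.is_anomaly_enabled()
        torch.set_anomaly_enabled(mode)  # flipped eagerly, as in the variants above

    def __enter__(self) -> None:
        pass  # nothing to do; __init__ already switched the mode

    def __exit__(self, *args) -> None:
        torch.set_anomaly_enabled(self.prev)  # restore the previous setting


# Anomaly detection is active only inside the block:
with SetDetectAnomaly(True):
    loss = (torch.randn(3, requires_grad=True) ** 2).sum()
    loss.backward()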