Example #1
 def __init__(self) -> None:
     self.prev = torch.is_anomaly_enabled()
     warnings.warn(
         'Anomaly Detection has been enabled. '
         'This mode will increase the runtime '
         'and should only be enabled for debugging.',
         stacklevel=2)
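
This constructor matches torch.autograd.detect_anomaly, which records the prior state here and toggles the mode in __enter__/__exit__. A minimal usage sketch (assuming anomaly mode starts disabled; the computation is a placeholder):

import torch

x = torch.randn(4, requires_grad=True)
with torch.autograd.detect_anomaly():
    # Enabled only inside the context.
    assert torch.is_anomaly_enabled()
    (x * 2).sum().backward()
# Restored to the recorded previous state on exit.
assert not torch.is_anomaly_enabled()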
Example #2
    def _setup_gpus(self, seed: float, detect_anomaly: bool):
        utils.setup_cuda(seed, self.local_rank)

        torch.autograd.set_detect_anomaly(detect_anomaly)
        self._log_info({
            'set_detect_anomaly': detect_anomaly,
            'is_anomaly_enabled': torch.is_anomaly_enabled()
        })

        self._log_info({
            'gpu_names': utils.cuda_device_names(),
            'gpu_count': torch.cuda.device_count(),
            'CUDA_VISIBLE_DEVICES': os.environ.get('CUDA_VISIBLE_DEVICES',
                                                   'NotSet'),
            'cudnn.enabled': cudnn.enabled,
            'cudnn.benchmark': cudnn.benchmark,
            'cudnn.deterministic': cudnn.deterministic,
            'cudnn.version': cudnn.version()
        })
        self._log_info({'memory': str(psutil.virtual_memory())})
        self._log_info({'CPUs': str(psutil.cpu_count())})
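
Unlike the context manager in Example #1, torch.autograd.set_detect_anomaly flips the mode globally, which is what this setup helper relies on. A minimal sketch of the same pattern, with --detect-anomaly as a hypothetical CLI flag:

import argparse

import torch

parser = argparse.ArgumentParser()
parser.add_argument('--detect-anomaly', action='store_true')  # hypothetical flag
args = parser.parse_args()

# Global toggle: stays in effect until explicitly switched off.
torch.autograd.set_detect_anomaly(args.detect_anomaly)
print({'is_anomaly_enabled': torch.is_anomaly_enabled()})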
Example #3
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # DEBUG(rjbruin)
        if len(targets) == 0:
            raise ValueError("There are still samples with zero targets!")

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        # DEBUG: only catch NaN loss if we don't have anomaly detection enabled
        if not torch.is_anomaly_enabled() and not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
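
The guard above is skipped when anomaly detection is on because in that mode the backward pass itself raises on the first operation that produces NaN gradients, pointing at the offending forward op. A minimal sketch of that behavior (sqrt at zero is a deliberately bad op: its backward computes 0 / 0):

import torch

torch.autograd.set_detect_anomaly(True)
x = torch.zeros(1, requires_grad=True)
y = (x.sqrt() * 0).sum()  # backward of sqrt at 0 yields 0 / 0 = NaN
try:
    y.backward()  # raises RuntimeError naming SqrtBackward as the source
except RuntimeError as err:
    print(err)
torch.autograd.set_detect_anomaly(False)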
Example #4
 def __init__(self, mode: bool) -> None:
     self.prev = torch.is_anomaly_enabled()
     self.mode = mode
     torch.set_anomaly_enabled(mode)
     if self.mode:
         warnings.warn(
             'Anomaly Detection has been enabled. '
             'This mode will increase the runtime '
             'and should only be enabled for debugging.',
             stacklevel=2)
         self.stream = io.BytesIO()
         # The original fd stderr points to. Usually 2 on POSIX systems.
         self.stderr_fd_origin = sys.stderr.fileno()
         # Make a copy of the original stderr fd in stderr_fd_copy
         self.stderr_fd_copy = os.dup(self.stderr_fd_origin)
         # Create a temporary file and redirect stderr to it
         self.tfile = tempfile.TemporaryFile(mode='w+b')
         self._redirect_stderr(self.tfile.fileno())
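
_redirect_stderr is not shown in this excerpt; a plausible sketch, based on the os.dup/os.dup2 bookkeeping the constructor sets up (this helper body is an assumption, not the original source):

 def _redirect_stderr(self, to_fd: int) -> None:
     # Hypothetical helper: point the process-level stderr fd at to_fd.
     sys.stderr.flush()                     # drain Python-side buffers first
     os.dup2(to_fd, self.stderr_fd_origin)  # fd 2 now refers to the temp file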
Example #5
 def __init__(self, mode):
     self.prev = torch.is_anomaly_enabled()
     torch.set_anomaly_enabled(mode)
Example #6
 def __init__(self):
     self.prev = torch.is_anomaly_enabled()
Example #7
 def __init__(self, mode: bool) -> None:
     self.prev = torch.is_anomaly_enabled()
     torch.set_anomaly_enabled(mode)
Example #8
 def __init__(self, mode: bool, check_nan: bool = True) -> None:
     self.prev = torch.is_anomaly_enabled()
     self.prev_check_nan = torch.is_anomaly_check_nan_enabled()
     torch.set_anomaly_enabled(mode, check_nan)
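
Recent PyTorch builds split the NaN assertion out of anomaly mode, which is what the extra check_nan argument above controls: the enhanced backward tracebacks can be kept while the NaN check is skipped. A minimal sketch (assumes a PyTorch version exposing check_nan, as this example does):

import torch

# Record forward traces for backward errors, but do not raise on NaN.
torch.set_anomaly_enabled(True, False)       # mode=True, check_nan=False
print(torch.is_anomaly_enabled())            # True
print(torch.is_anomaly_check_nan_enabled())  # False
torch.set_anomaly_enabled(False)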