def test_recursive_copy_to_gpu(self):
    tensor_a = get_mock_tensor()
    tensor_b = get_mock_tensor()

    valid_gpu_copy_value = tensor_a
    gpu_value = util.recursive_copy_to_gpu(valid_gpu_copy_value)
    self.assertTrue(gpu_value.is_cuda)

    valid_recursive_copy_value = [[tensor_a]]
    gpu_value = util.recursive_copy_to_gpu(valid_recursive_copy_value)
    self.assertTrue(gpu_value[0][0].is_cuda)

    valid_gpu_copy_collections = [
        (tensor_a, tensor_b),
        [tensor_a, tensor_b],
        {"tensor_a": tensor_a, "tensor_b": tensor_b},
    ]
    for value in valid_gpu_copy_collections:
        gpu_value = util.recursive_copy_to_gpu(value)
        if isinstance(value, dict):
            self.assertTrue(gpu_value["tensor_a"].is_cuda)
            self.assertTrue(gpu_value["tensor_b"].is_cuda)
        else:
            self.assertEqual(len(gpu_value), 2)
            self.assertTrue(gpu_value[0].is_cuda)
            self.assertTrue(gpu_value[1].is_cuda)

    value = {"a": "b"}
    self.assertEqual(value, util.recursive_copy_to_gpu(value))
def test_recursive_copy_to_gpu(self):
    tensor_a = get_mock_tensor()
    tensor_b = get_mock_tensor()

    valid_gpu_copy_value = tensor_a
    gpu_value = util.recursive_copy_to_gpu(valid_gpu_copy_value)
    self.assertTrue(gpu_value.is_cuda)

    valid_recursive_copy_value = [[tensor_a]]
    gpu_value = util.recursive_copy_to_gpu(valid_recursive_copy_value)
    self.assertTrue(gpu_value[0][0].is_cuda)

    valid_gpu_copy_collections = [
        (tensor_a, tensor_b),
        [tensor_a, tensor_b],
        {"tensor_a": tensor_a, "tensor_b": tensor_b},
    ]
    for value in valid_gpu_copy_collections:
        gpu_value = util.recursive_copy_to_gpu(value)
        if isinstance(value, dict):
            self.assertTrue(gpu_value["tensor_a"].is_cuda)
            self.assertTrue(gpu_value["tensor_b"].is_cuda)
        else:
            self.assertEqual(len(gpu_value), 2)
            self.assertTrue(gpu_value[0].is_cuda)
            self.assertTrue(gpu_value[1].is_cuda)

    invalid_gpu_copy_values = [1234, True, 1.0]
    for value in invalid_gpu_copy_values:
        with self.assertRaises(AttributeError):
            gpu_value = util.recursive_copy_to_gpu(value)

    invalid_gpu_copy_depth = [
        ((((tensor_a, tensor_b), tensor_b), tensor_b), tensor_b),
        {"tensor_map_a": {"tensor_map_b": {"tensor_map_c": {"tensor": tensor_a}}}},
        [[[[tensor_a, tensor_b], tensor_b], tensor_b], tensor_b],
        "abcd",  # Strings are sequences, including single-char strings
    ]
    for value in invalid_gpu_copy_depth:
        with self.assertRaises(ValueError):
            gpu_value = util.recursive_copy_to_gpu(value, max_depth=3)
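# -----------------------------------------------------------------------------
# For reference, a minimal sketch of a `recursive_copy_to_gpu` that would
# satisfy the expectations encoded in the test above: dispatch through the
# `.cuda` attribute (so plain scalars raise AttributeError), recurse into
# sequences and mappings, and guard against overly deep nesting with a
# `max_depth` counter that raises ValueError. This is a reconstruction from
# the test, not the library's actual implementation; note that other
# snapshots in this file pass strings through unchanged instead of recursing
# into them.
# -----------------------------------------------------------------------------
from collections.abc import Mapping, Sequence
from typing import Any


def recursive_copy_to_gpu(
    value: Any, non_blocking: bool = True, max_depth: int = 3, curr_depth: int = 0
) -> Any:
    if curr_depth >= max_depth:
        raise ValueError("Depth of value being copied exceeds max_depth")

    try:
        # Tensors (and anything else exposing `.cuda`) are copied directly
        return value.cuda(non_blocking=non_blocking)
    except AttributeError:
        if isinstance(value, Sequence):
            # str is a Sequence whose elements are themselves strings, so
            # strings recurse until the depth check trips
            gpu_value = [
                recursive_copy_to_gpu(val, non_blocking, max_depth, curr_depth + 1)
                for val in value
            ]
            return gpu_value if isinstance(value, list) else tuple(gpu_value)
        if isinstance(value, Mapping):
            return {
                key: recursive_copy_to_gpu(val, non_blocking, max_depth, curr_depth + 1)
                for key, val in value.items()
            }
        # Plain scalars (int, float, bool) land here: no `.cuda` attribute
        raise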
def eval_step(self, use_gpu):
    self.last_batch = None

    # Process next sample
    sample = next(self.get_data_iterator())
    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if use_gpu:
        for key, value in sample.items():
            sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

    with torch.no_grad():
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()
        loss = all_reduce_mean(loss)

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(loss=loss, output=output, target=target, sample=sample)
def train_step(self):
    """Train step to be executed in train loop."""

    self.last_batch = None

    # Process next sample
    with Timer() as timer:
        sample = next(self.data_iterator)

    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if self.use_gpu:
        sample = recursive_copy_to_gpu(sample, non_blocking=True)

    if self.mixup_transform is not None:
        sample = self.mixup_transform(sample)

    # Optional Pytorch AMP context
    torch_amp_context = (
        torch.cuda.amp.autocast()
        if self.amp_type == AmpType.PYTORCH
        else contextlib.suppress()
    )

    # Only sync with DDP when we need to perform an optimizer step;
    # an optimizer step can be skipped if gradient accumulation is enabled
    do_step = self._should_do_step()
    ctx_mgr_model = (
        self.distributed_model.no_sync()
        if self.distributed_model is not None and not do_step
        else contextlib.suppress()
    )
    ctx_mgr_loss = (
        self.distributed_loss.no_sync()
        if self.distributed_loss is not None and not do_step
        else contextlib.suppress()
    )

    with ctx_mgr_model, ctx_mgr_loss:
        # Forward pass
        with torch.enable_grad(), torch_amp_context:
            output = self.compute_model(sample)

            local_loss = self.compute_loss(output, sample)

            loss = local_loss.detach().clone()

            self.losses.append(loss.data.cpu().item())

            self.update_meters(output, sample)

        # Backwards pass + optimizer step
        self.run_optimizer(local_loss)

    self.num_updates += self.get_global_batchsize()

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss,
        output=output,
        target=target,
        sample=sample,
        step_data={"sample_fetch_time": timer.elapsed_time},
    )
def train_step(self):
    """Train step to be executed in train loop."""

    self.last_batch = None

    # Process next sample
    with Timer() as timer:
        sample = next(self.get_data_iterator())

    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if self.use_gpu:
        sample = recursive_copy_to_gpu(sample, non_blocking=True)

    if self.mixup_transform is not None:
        sample = self.mixup_transform(sample)

    with torch.enable_grad():
        # Forward pass
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

    # Run backwards pass / update optimizer
    if self.amp_args is not None:
        self.optimizer.zero_grad()
        with apex.amp.scale_loss(local_loss, self.optimizer.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        self.optimizer.backward(local_loss)

    self.check_inf_nan(loss)

    self.optimizer.update_schedule_on_step(self.where)
    self.optimizer.step()

    self.num_updates += self.get_global_batchsize()

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss,
        output=output,
        target=target,
        sample=sample,
        step_data={"sample_fetch_time": timer.elapsed_time},
    )
def __next__(self) -> Any:
    result = None

    with torch.cuda.stream(self.stream):
        if self.cache is not None:
            # Make sure that an ongoing transfer is done
            torch.cuda.current_stream().wait_stream(self.stream)
            result = self.cache
        else:
            result = recursive_copy_to_gpu(next(self._iter))

        # Look ahead and start the next upload
        try:
            self.cache = recursive_copy_to_gpu(next(self._iter))
        except StopIteration:
            self.cache = None

    assert result is not None
    return result
def train_step(self, use_gpu):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
    """

    self.last_batch = None

    # Process next sample
    sample = next(self.get_data_iterator())
    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    # Copy sample to GPU
    target = sample["target"]
    if use_gpu:
        for key, value in sample.items():
            sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

    with torch.enable_grad():
        # Forward pass
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()
        loss = all_reduce_mean(loss)

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

    # Run backwards pass / update optimizer
    if self.amp_opt_level is not None:
        self.optimizer.zero_grad()
        with apex.amp.scale_loss(local_loss, self.optimizer.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        self.optimizer.backward(local_loss)

    self.optimizer.update_schedule_on_step(self.where)
    self.optimizer.step()

    self.num_updates += self.get_global_batchsize()

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(loss=loss, output=output, target=target, sample=sample)
def preload(self):
    # Get data from the iterator
    try:
        self.cache_next = next(self._iter)

        # Copy to the device, in a parallel CUDA stream
        with torch.cuda.stream(self.stream):
            self.cache = recursive_copy_to_gpu(self.cache_next, non_blocking=True)
    except StopIteration:
        self.cache = None
        return
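# -----------------------------------------------------------------------------
# A hypothetical consumer for the `preload()` above, sketching how the
# prefetched batch would be handed out (compare the `__next__` snippet with a
# lookahead cache earlier in this file). The `wait_stream()` call is the
# crucial step: the consuming stream must synchronize with the side copy
# stream before the cached batch is safe to read. This assumes `preload()`
# was called once up front (e.g. in `__init__` or `__iter__`); it is a sketch,
# not the library's actual implementation.
# -----------------------------------------------------------------------------
def __next__(self) -> Any:
    if self.cache is None:
        # preload() hit StopIteration, so the underlying iterator is drained
        raise StopIteration

    # Block the consuming stream until the asynchronous copy kicked off in
    # preload() has finished
    torch.cuda.current_stream().wait_stream(self.stream)
    result = self.cache

    # Kick off fetching and uploading the next batch right away
    self.preload()
    return result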
def test_recursive_copy_to_gpu(self):
    tensor_a = get_mock_tensor()
    tensor_b = get_mock_tensor()

    valid_gpu_copy_value = tensor_a
    gpu_value = util.recursive_copy_to_gpu(valid_gpu_copy_value)
    self.assertTrue(gpu_value.is_cuda)

    valid_recursive_copy_value = [[tensor_a]]
    gpu_value = util.recursive_copy_to_gpu(valid_recursive_copy_value)
    self.assertTrue(gpu_value[0][0].is_cuda)

    valid_gpu_copy_collections = [
        (tensor_a, tensor_b),
        [tensor_a, tensor_b],
        {"tensor_a": tensor_a, "tensor_b": tensor_b},
    ]
    for value in valid_gpu_copy_collections:
        gpu_value = util.recursive_copy_to_gpu(value)
        if isinstance(value, dict):
            self.assertTrue(gpu_value["tensor_a"].is_cuda)
            self.assertTrue(gpu_value["tensor_b"].is_cuda)
        else:
            self.assertEqual(len(gpu_value), 2)
            self.assertTrue(gpu_value[0].is_cuda)
            self.assertTrue(gpu_value[1].is_cuda)

    invalid_gpu_copy_depth = [
        ((((tensor_a, tensor_b), tensor_b), tensor_b), tensor_b),
        {"tensor_map_a": {"tensor_map_b": {"tensor_map_c": {"tensor": tensor_a}}}},
        [[[[tensor_a, tensor_b], tensor_b], tensor_b], tensor_b],
    ]
    for value in invalid_gpu_copy_depth:
        with self.assertRaises(ValueError):
            gpu_value = util.recursive_copy_to_gpu(value, max_depth=3)

    value = {"a": "b"}
    self.assertEqual(value, util.recursive_copy_to_gpu(value))
def eval_step(self, use_gpu, local_variables=None):
    if local_variables is None:
        local_variables = {}

    # Process next sample
    sample = next(self.get_data_iterator())
    local_variables["sample"] = sample

    assert (
        isinstance(local_variables["sample"], dict)
        and "input" in local_variables["sample"]
        and "target" in local_variables["sample"]
    ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    with torch.no_grad():
        local_variables["output"] = self.model(local_variables["sample"]["input"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_forward.name)

        local_variables["local_loss"] = self.compute_loss(
            local_variables["output"], local_variables["sample"]
        )

        local_variables["loss"] = local_variables["local_loss"].detach().clone()
        local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        self.update_meters(local_variables["output"], local_variables["sample"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_loss_and_meter.name)
def eval_step(self):
    self.last_batch = None

    # Process next sample
    with Timer() as timer:
        sample = next(self.data_iterator)

    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    target = sample["target"]
    if self.use_gpu:
        sample = recursive_copy_to_gpu(sample, non_blocking=True)

    # Optional Pytorch AMP context
    torch_amp_context = (
        torch.cuda.amp.autocast()
        if self.amp_type == AmpType.PYTORCH
        else contextlib.suppress()
    )

    with torch.no_grad(), torch_amp_context:
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()

        self.check_inf_nan(loss)

        self.losses.append(loss.data.cpu().item())

        self.update_meters(output, sample)

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss,
        output=output,
        target=target,
        sample=sample,
        step_data={"sample_fetch_time": timer.elapsed_time},
    )
def eval_step(self):
    self.last_batch = None

    # Process next sample
    with Timer() as timer:
        sample = next(self.get_data_iterator())

    assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
        f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"
    )

    target = sample["target"]
    if self.use_gpu:
        sample = recursive_copy_to_gpu(sample, non_blocking=True)

    with torch.no_grad():
        output = self.model(sample["input"])

        local_loss = self.compute_loss(output, sample)

        loss = local_loss.detach().clone()

        self.check_inf_nan(loss)

        self.losses.append(loss.data.cpu().item() * target.size(0))

        self.update_meters(output, sample)

    # Move some data to the task so hooks get a chance to access it
    self.last_batch = LastBatchInfo(
        loss=loss,
        output=output,
        target=target,
        sample=sample,
        step_data={"sample_fetch_time": timer.elapsed_time},
    )
def _get_iterator(data_iter, use_gpu):
    for elem in data_iter:
        if use_gpu:
            elem = recursive_copy_to_gpu(elem, non_blocking=True)
        yield elem["input"]
def __next__(self) -> Any:
    # Get data from the iterator and move to GPU.
    # This can raise `StopIteration`.
    return recursive_copy_to_gpu(next(self._iter), non_blocking=True)
def _get_iterator(cache, use_gpu):
    for elem in cache:
        if use_gpu:
            elem = recursive_copy_to_gpu(elem, non_blocking=True)
        yield elem
def train_step(self, use_gpu, local_variables=None):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
        local_variables: Dict containing intermediate values in train_step
            for access by hooks
    """
    from classy_vision.hooks import ClassyHookFunctions

    if local_variables is None:
        local_variables = {}

    # Process next sample
    sample = next(self.get_data_iterator())
    local_variables["sample"] = sample

    assert (
        isinstance(local_variables["sample"], dict)
        and "input" in local_variables["sample"]
        and "target" in local_variables["sample"]
    ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    # Only need gradients during training
    context = torch.enable_grad() if self.train else torch.no_grad()
    with context:
        # Forward pass
        local_variables["output"] = self.model(local_variables["sample"]["input"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_forward.name)

        local_variables["local_loss"] = self.compute_loss(
            local_variables["output"], local_variables["sample"]
        )

        # NOTE: This performs an all_reduce_mean() on the losses across the
        # replicas. The reduce should ideally be weighted by the length of
        # the targets on each replica. This will only be an issue when
        # there are dummy samples present (once an epoch) and will only
        # impact the loss reporting (slightly).
        local_variables["loss"] = local_variables["local_loss"].detach().clone()
        local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        self.update_meters(local_variables["output"], local_variables["sample"])

    # After both loss and meters are updated, we run hooks. Among hooks,
    # `LossLrMeterLoggingHook` will log both loss and meter status
    self.run_hooks(local_variables, ClassyHookFunctions.on_loss_and_meter.name)

    num_samples_in_step = self.get_global_batchsize()
    self.num_samples_this_phase += num_samples_in_step

    # For training phases, run backwards pass / update optimizer
    if self.train:
        if self.amp_opt_level is not None:
            self.optimizer.zero_grad()
            with apex.amp.scale_loss(
                local_variables["local_loss"], self.optimizer.optimizer
            ) as scaled_loss:
                scaled_loss.backward()
        else:
            self.optimizer.backward(local_variables["local_loss"])

        self.optimizer.update_schedule_on_step(self.where)
        self.optimizer.step()

        self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

        self.num_updates += num_samples_in_step
def train_step(self, use_gpu, local_variables=None):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
        local_variables: Dict containing intermediate values in train_step
            for access by hooks
    """
    if local_variables is None:
        local_variables = {}

    # Process next sample
    sample = next(self.get_data_iterator())
    local_variables["sample"] = sample

    assert (
        isinstance(local_variables["sample"], dict)
        and "input" in local_variables["sample"]
        and "target" in local_variables["sample"]
    ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    with torch.enable_grad():
        # Forward pass
        local_variables["output"] = self.model(local_variables["sample"]["input"])

        local_variables["local_loss"] = self.compute_loss(
            local_variables["output"], local_variables["sample"]
        )

        local_variables["loss"] = local_variables["local_loss"].detach().clone()
        local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        self.update_meters(local_variables["output"], local_variables["sample"])

    # Run backwards pass / update optimizer
    if self.amp_opt_level is not None:
        self.optimizer.zero_grad()
        with apex.amp.scale_loss(
            local_variables["local_loss"], self.optimizer.optimizer
        ) as scaled_loss:
            scaled_loss.backward()
    else:
        self.optimizer.backward(local_variables["local_loss"])

    self.optimizer.update_schedule_on_step(self.where)
    self.optimizer.step()

    self.num_updates += self.get_global_batchsize()
def train_step(self, use_gpu, local_variables=None):
    """Train step to be executed in train loop

    Args:
        use_gpu: if true, execute training on GPU
        local_variables: Dict containing intermediate values in train_step
            for access by hooks
    """
    from classy_vision.hooks import ClassyHookFunctions

    if local_variables is None:
        local_variables = {}

    # We'll time train_step and some of its sections, and accumulate values
    # into perf_stats if it is defined in local_variables
    perf_stats = local_variables.get("perf_stats", None)
    timer_train_step = PerfTimer("train_step_total", perf_stats)
    timer_train_step.start()

    # Process next sample
    with PerfTimer("read_sample", perf_stats):
        sample = next(self.get_data_iterator())
        local_variables["sample"] = sample

        assert (
            isinstance(local_variables["sample"], dict)
            and "input" in local_variables["sample"]
            and "target" in local_variables["sample"]
        ), f"Returned sample [{sample}] is not a map with 'input' and 'target' keys"

    self.run_hooks(local_variables, ClassyHookFunctions.on_sample.name)

    # Copy sample to GPU
    local_variables["target"] = local_variables["sample"]["target"]
    if use_gpu:
        for key, value in local_variables["sample"].items():
            local_variables["sample"][key] = recursive_copy_to_gpu(
                value, non_blocking=True
            )

    # Only need gradients during training
    context = torch.enable_grad() if self.train else torch.no_grad()
    with context:
        # Forward pass
        with PerfTimer("forward", perf_stats):
            local_variables["output"] = self.model(local_variables["sample"]["input"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_forward.name)

        model_output = local_variables["output"]
        target = local_variables["sample"]["target"]
        local_variables["local_loss"] = self.loss(model_output, target)

        # NOTE: This performs an all_reduce_mean() on the losses across the
        # replicas. The reduce should ideally be weighted by the length of
        # the targets on each replica. This will only be an issue when
        # there are dummy samples present (once an epoch) and will only
        # impact the loss reporting (slightly).
        with PerfTimer("loss_allreduce", perf_stats):
            local_variables["loss"] = local_variables["local_loss"].detach().clone()
            local_variables["loss"] = all_reduce_mean(local_variables["loss"])

        self.losses.append(
            local_variables["loss"].data.cpu().item()
            * local_variables["target"].size(0)
        )

        model_output_cpu = model_output.cpu() if use_gpu else model_output

        # Update meters
        with PerfTimer("meters_update", perf_stats):
            for meter in self.meters:
                meter.update(
                    model_output_cpu, target.detach().cpu(), is_train=self.train
                )

    # After both loss and meters are updated, we run hooks. Among hooks,
    # `LossLrMeterLoggingHook` will log both loss and meter status
    self.run_hooks(local_variables, ClassyHookFunctions.on_loss_and_meter.name)

    num_samples_in_step = self.get_global_batchsize()
    self.num_samples_this_phase += num_samples_in_step

    # For training phases, run backwards pass / update optimizer
    if self.train:
        with PerfTimer("backward", perf_stats):
            self.optimizer.backward(local_variables["local_loss"])

        self.run_hooks(local_variables, ClassyHookFunctions.on_backward.name)

        self.optimizer.update_schedule_on_step(self.where)
        with PerfTimer("optimizer_step", perf_stats):
            self.optimizer.step()

        self.run_hooks(local_variables, ClassyHookFunctions.on_update.name)

        self.num_updates += num_samples_in_step

    timer_train_step.stop()
    timer_train_step.record()