def test_parameter_list_dict_replica(self):
    """Check the DataParallel warning for ParameterList / ParameterDict members.

    A pass-through module holding each container type is wrapped in
    ``dp.DataParallel`` and run once on a CUDA batch; the forward call is
    expected to emit a ``UserWarning`` matching the container-specific regex.
    """

    class MyMod(torch.nn.Module):
        """Identity module that only carries ``data`` as an attribute."""

        def __init__(self, data):
            super().__init__()
            self.data = data

        def forward(self, inp):
            return inp

    p1 = torch.nn.Parameter(torch.rand(10))
    p2 = torch.nn.Parameter(torch.rand(10))

    def expect_warning(container, pattern):
        # Wrap the container-holding module, then run one forward pass
        # under the warning assertion.
        wrapped = dp.DataParallel(MyMod(container).cuda())
        batch = torch.randn((8, 8), device="cuda")
        with self.assertWarnsRegex(UserWarning, pattern):
            wrapped(batch)

    expect_warning(
        torch.nn.ParameterList([p1, p2]),
        r"nn\.ParameterList is being used with DataParallel but this",
    )
    expect_warning(
        torch.nn.ParameterDict({"0": p1, "1": p2}),
        r"nn\.ParameterDict is being used with DataParallel but this",
    )
def test_autocast(self):
    """Check that an autocast-decorated ``forward`` yields float16 under DataParallel.

    The model and the input are both float32; the output dtype is asserted
    to be ``torch.float16``.
    """

    class Model(torch.nn.Linear):
        """8x8 linear layer whose forward runs under CUDA autocast."""

        def __init__(self):
            super().__init__(8, 8)

        @torch.cuda.amp.autocast()
        def forward(self, x):
            return super().forward(x)

    net = Model().cuda().to(dtype=torch.float32)
    wrapped = dp.DataParallel(net)
    batch = torch.randn((8, 8), dtype=torch.float32, device="cuda")
    result = wrapped(batch)
    self.assertTrue(result.dtype is torch.float16)
def test_parameter_list_dict_replica(self):
    """Check ParameterList / ParameterDict contents as seen inside DataParallel forward.

    A pass-through module invokes a check callback from ``forward``. The
    callback asserts that the container entries equal the original
    parameters, require grad, and have a non-None ``grad_fn``.
    """

    class MyMod(torch.nn.Module):
        """Identity module that runs ``check_fn`` on itself in forward."""

        def __init__(self, data, check_fn):
            super().__init__()
            self.data = data
            self.check_fn = check_fn

        def forward(self, inp):
            # check_fn is a plain function stored on the instance, so the
            # module is passed explicitly.
            self.check_fn(self)
            return inp

    p1 = torch.nn.Parameter(torch.rand(10))
    p2 = torch.nn.Parameter(torch.rand(10))

    def make_check(first_key, second_key):
        """Build a check bound to the keys used to index the container."""
        expected_by_key = ((p1, first_key), (p2, second_key))

        def check(self_):
            for expected, key in expected_by_key:
                self.assertEqual(expected, self_.data[key])
            for _, key in expected_by_key:
                self.assertTrue(self_.data[key].requires_grad)
            for _, key in expected_by_key:
                self.assertIsNotNone(self_.data[key].grad_fn)

        return check

    def run(container, check):
        wrapped = dp.DataParallel(MyMod(container, check).cuda())
        batch = torch.randn((8, 8), device="cuda")
        # Runs the check function via forward
        wrapped(batch)

    # ParameterList entries are addressed by integer index.
    run(torch.nn.ParameterList([p1, p2]), make_check(0, 1))
    # ParameterDict entries are addressed by string key.
    run(torch.nn.ParameterDict({"0": p1, "1": p2}), make_check("0", "1"))
def _setup(self) -> None:
    """Prepare the model and the bound prediction callable.

    Enters ``torch.no_grad()`` (and, for torch 1.9.*, ``torch.inference_mode()``)
    on an ``ExitStack`` stored in ``self._no_grad_context``, runs
    ``self._configure()``, loads the model and puts it in eval mode, wraps it
    in ``parallel.DataParallel`` when ``self._on_gpu`` is set, and stores a
    ``functools.partial`` of the configured predict method in
    ``self._predict_fn``.
    """
    stack = contextlib.ExitStack()
    self._no_grad_context = stack
    stack.enter_context(torch.no_grad())
    if get_pkg_version("torch").startswith("1.9"):
        # inference mode is required for PyTorch version 1.9.*
        stack.enter_context(torch.inference_mode())

    self._configure()
    loaded = self._load_model()
    loaded.eval()

    if not self._on_gpu:
        self._model = loaded
    else:
        self._model = parallel.DataParallel(loaded)
        torch.cuda.empty_cache()

    # The predict function is looked up on the (possibly wrapped) model.
    predict = getattr(self._model, self._predict_fn_name)
    self._predict_fn = functools.partial(predict, **self._partial_kwargs)
def test_zero_grad(self):
    """Check that ``zero_grad()`` called inside a DataParallel forward warns."""
    # zero_grad should warn about using gradients inside forward
    expected_warning = (
        r"Calling \.zero_grad\(\) from a module created with nn\.DataParallel\(\) has no effect."
    )

    class Net(torch.nn.Module):
        """Identity module that calls ``zero_grad()`` on itself in forward."""

        def __init__(self, testcase):
            super().__init__()
            # Keep the test case so forward can assert on the warning.
            self._testcase = testcase

        def forward(self, x):
            with self._testcase.assertWarnsRegex(UserWarning, expected_warning):
                self.zero_grad()
            return x

    dpm = dp.DataParallel(Net(self).cuda())
    dpm(torch.rand(4, 3, 6, 5))
def test_zero_grad(self):
    """Check that ``zero_grad()`` called inside a DataParallel forward warns.

    NOTE(review): ``assertWarnsRegex`` is called here as ``(callable, regex)``,
    which is not the stdlib ``unittest`` signature; presumably the enclosing
    TestCase provides this form -- confirm.
    """
    # zero_grad should warn about using gradients inside forward
    expected_warning = (
        "The parameters in data parallel modules are copied from the original module."
    )

    class Net(torch.nn.Module):
        """Identity module that calls ``zero_grad()`` on itself in forward."""

        def __init__(self, testcase):
            super().__init__()
            # Keep the test case so forward can assert on the warning.
            self._testcase = testcase

        def forward(self, x):
            self._testcase.assertWarnsRegex(self.zero_grad, expected_warning)
            return x

    dpm = dp.DataParallel(Net(self).cuda())
    dpm(torch.rand(4, 3, 6, 5))
def _decorate_model(self, parallel_decorate=True):
    """Move ``self.model`` to ``self.device`` and optionally wrap it for parallelism.

    Calls ``self.model.half()`` first when ``self.setting.fp16`` is set. With
    ``parallel_decorate`` true, the model is wrapped in
    ``para.DistributedDataParallel`` when ``self.in_distributed_mode()``
    returns true, in ``para.DataParallel`` when ``self.n_gpu > 1``, and left
    unwrapped otherwise. Each step is reported through ``self.logging``.

    Args:
        parallel_decorate: When false, no parallel wrapper is applied.
    """
    rule = '=' * 20
    self.logging(rule + 'Decorate Model' + rule)

    if self.setting.fp16:
        self.model.half()

    self.model.to(self.device)
    self.logging('Set model device to {}'.format(str(self.device)))

    if not parallel_decorate:
        return

    if self.in_distributed_mode():
        self.model = para.DistributedDataParallel(
            self.model,
            device_ids=[self.setting.local_rank],
            output_device=self.setting.local_rank,
        )
        self.logging('Wrap distributed data parallel')
        return

    if self.n_gpu > 1:
        self.model = para.DataParallel(self.model)
        self.logging('Wrap data parallel')
        return

    self.logging('Do not wrap parallel layers')
def test_strided_grad_layout(self):
    """Compare gradient layout and values of a ConvNet with its DataParallel copy.

    For every combination of per-layer memory formats and dtypes, a plain
    model and a DataParallel-wrapped deep copy are each run through two
    forward/backward/optimizer steps on the same input. After each backward
    pass the weight gradients of both are asserted to be contiguous in the
    layer's memory format, and all parameter gradients are asserted equal
    within tolerance.
    """

    class ConvNet(nn.Module):
        """Four stacked 2x2 convolutions with per-layer layout and dtype."""

        def __init__(self, layouts, dtypes):
            super().__init__()
            self.dtypes = dtypes

            def conv(idx, c_in, c_out):
                return torch.nn.Conv2d(c_in, c_out, (2, 2)).to(
                    memory_format=layouts[idx], dtype=dtypes[idx])

            self.conv0 = conv(0, 8, 16)
            self.conv1 = conv(1, 16, 32)
            self.conv2 = conv(2, 32, 16)
            self.conv3 = conv(3, 16, 8)

        def forward(self, x):
            # Cast to each layer's dtype right before applying that layer.
            out = self.conv0(x.to(self.dtypes[0]))
            out = self.conv1(out.to(self.dtypes[1]))
            out = self.conv2(out.to(self.dtypes[2]))
            return self.conv3(out.to(self.dtypes[3]))

    layer_formats = (
        [torch.contiguous_format] * 4,
        [torch.channels_last] * 2 + [torch.contiguous_format] * 2,
        [torch.channels_last] * 4,
    )
    layer_dtypes = (
        [torch.float] * 4,
        [torch.float] * 2 + [torch.half] * 2,
        [torch.half] * 4,
    )

    ndevs = torch.cuda.device_count()
    batch = torch.randn(ndevs * 8, 8, 8, 8, device="cuda:0", dtype=torch.float)
    target = torch.randn(ndevs * 8, 8, 4, 4, device="cuda:0", dtype=torch.float)
    device_ids = list(range(ndevs))

    with torch.backends.cudnn.flags(enabled=True, deterministic=True, benchmark=False):
        for formats, dtypes in product(layer_formats, layer_dtypes):
            model_msg = "formats = {} dtypes = {}".format(formats, dtypes)
            try:
                m = ConvNet(formats, dtypes).cuda(device="cuda:0")
                m_dp = dp.DataParallel(deepcopy(m), device_ids=device_ids)
                opt = torch.optim.SGD(m.parameters(), lr=0.1)
                opt_dp = torch.optim.SGD(m_dp.parameters(), lr=0.1)
                # Use the looser tolerance as soon as any parameter is half.
                if any(p.dtype is torch.half for p in m.parameters()):
                    tol = 1.e-3
                else:
                    tol = 1.e-5
            except BaseException:
                # Prints case-specific debugging info to narrow down failing case.
                print("Caught exception during model creation for " + model_msg, flush=True)
                raise

            # 2 iters: First iter creates grads, second iter tries zeroed grads.
            for it in range(2):
                iter_msg = "iter = {} ".format(it) + model_msg
                named_msg = iter_msg
                try:
                    F.mse_loss(m(batch).float(), target).backward()
                    F.mse_loss(m_dp(batch).float(), target).backward()

                    layer_pairs = zip(m.named_children(), m_dp.module.children())
                    for i, ((layer_name, m_child), m_dp_child) in enumerate(layer_pairs):
                        layout = formats[i]
                        named_msg = layer_name + ".weight " + iter_msg
                        self.assertTrue(
                            m_child.weight.grad.is_contiguous(memory_format=layout),
                            named_msg)
                        self.assertTrue(
                            m_dp_child.weight.grad.is_contiguous(memory_format=layout),
                            named_msg)

                        param_pairs = zip(m_child.named_parameters(), m_dp_child.parameters())
                        for (param_name, p), p_dp in param_pairs:
                            named_msg = layer_name + "." + param_name + " " + iter_msg
                            self.assertEqual(p.grad, p_dp.grad, rtol=tol, atol=tol)

                    opt.step()
                    opt_dp.step()
                    opt.zero_grad()
                    opt_dp.zero_grad()
                except BaseException:
                    # Makes sure we still get info if an error occurred somewhere other than the asserts.
                    print("Caught exception during iterations at " + named_msg, flush=True)
                    raise
def __get_train_net(self):
    """Return the network to train.

    The stored net is wrapped in ``parallel.DataParallel`` when the CUDA
    flag is set; otherwise it is returned as is.
    """
    if not self.__use_cuda:
        return self.__net
    return parallel.DataParallel(self.__net)
def train_on_cifar(model, schedule, batch_size=128, name=None, show_test=False,
                   force_multi_gpu=False, half_precision=False, loss_scale=1):
    """Train ``model`` on CIFAR-10 with ``schedule`` and record the results.

    Loads the train and test splits, carves dev / small-dev / test subsets
    out of the test split, optionally moves the model to CUDA, converts it
    to half precision and wraps it in ``parallel.DataParallel``, then
    delegates training to ``schedule.train``. Accuracy on the small dev
    subset is plotted to visdom at each epoch start. Afterwards dev and
    train accuracy (and test accuracy when ``show_test``) are printed and a
    summary line is appended to ``experiments.txt``.

    Args:
        model: Network to train.
        schedule: Object providing ``total_duration()`` and ``train(...)``.
        batch_size: Training batch size; values >= 1024 also trigger the
            DataParallel wrapper.
        name: Optional experiment name, used in prints, the plot title and
            the log line.
        show_test: When true, evaluate on the test subset as well;
            otherwise test accuracy is recorded as -1.0.
        force_multi_gpu: Wrap in DataParallel regardless of batch size.
        half_precision: Convert the model with ``network_to_half`` and
            forward the flag to ``schedule.train``.
        loss_scale: Forwarded to ``schedule.train``.
    """
    torch.backends.cudnn.benchmark = True
    if name:
        print(f'Training: {name}')
    start_time = datetime.now()

    train_set = load_cifar10(train=True)
    held_out = load_cifar10(train=False)
    test_set = SublistDataset(held_out, 1000, 10000)
    dev_set = SublistDataset(held_out, 0, 1000)
    dev_small = SublistDataset(held_out, 0, 200)

    criterion = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        model = model.cuda()
    # NOTE(review): the next two conditions are treated as independent of the
    # CUDA check above -- confirm against the intended nesting.
    if half_precision:
        model = network_to_half(model)
    if batch_size >= 1024 or force_multi_gpu:
        model = parallel.DataParallel(model)

    train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=2)
    metrics = union_metric(accuracy_metric(model, dev_small),
                           loss_metric(model, dev_small, criterion))

    epoch_counter = 0
    # workaround for https://github.com/tqdm/tqdm/issues/481
    tqdm.monitor_interval = 0
    bar = tqdm(total=schedule.total_duration() * len(train_loader), ncols=120)
    viz = visdom.Visdom()
    visdom_win = None

    def on_epoch_start(step, epoch):
        nonlocal epoch_counter, visdom_win
        postfix = collections.OrderedDict()
        postfix['step'] = f'{step["name"]}'
        postfix['epoch'] = f'{epoch}/{step["duration"]}'

        # Evaluate in eval mode on the small dev subset.
        model.train(False)
        epoch_metrics = metrics()
        postfix.update(epoch_metrics)
        bar.set_postfix(ordered_dict=postfix)

        acc = epoch_metrics['accuracy']
        if visdom_win:
            viz.line(np.array([acc]), np.array([epoch_counter]),
                     win=visdom_win, update='append')
        else:
            # First point: create the plot window.
            visdom_win = viz.line(np.array([acc]), np.array([0]),
                                  opts=dict(title=name or 'Accuracy'))
        epoch_counter += 1

    def on_step():
        bar.update(1)

    schedule.train(model, criterion, train=train_loader, dev=dev_set,
                   on_step=on_step, on_epoch_start=on_epoch_start,
                   half_precision=half_precision, loss_scale=loss_scale)
    bar.close()
    total_time = datetime.now() - start_time

    def get_accuracy_on(dataset):
        return accuracy_metric(model, dataset)()['accuracy']

    def print_acc(name, accuracy):
        print(f'{name} accuracy: {accuracy:.3f}')

    dev_accuracy = get_accuracy_on(dev_set)
    print_acc('Dev', dev_accuracy)
    train_accuracy = get_accuracy_on(train_set)
    print_acc('Train', train_accuracy)
    if not show_test:
        test_accuracy = -1.0
    else:
        test_accuracy = get_accuracy_on(test_set)
        print_acc('Test', test_accuracy)

    print(f'It took {total_time} to train')

    with open('experiments.txt', mode='a') as f:
        parameters = f'batch_size={batch_size}'
        experiment_data = f'{datetime.now()}: {name}[{parameters}]. ' \
                          f'Time={total_time}. Train={train_accuracy:.3f}. Dev={dev_accuracy:.3f}. Test={test_accuracy:.3f}\n'
        f.write(experiment_data)
def train(**kwargs):
    """Train ``MyNet`` as configured by ``cfg`` and checkpoint it periodically.

    ``kwargs`` are handed to ``cfg._parse`` before anything else. The model
    optionally loads weights from ``cfg.load_model_path``, is wrapped in
    ``parallel.DataParallel`` when ``cfg.multi_gpu`` is set, and is moved to
    CUDA when ``cfg.use_gpu`` is set. Training uses SGD with momentum; the
    learning rate is multiplied by ``cfg.lr_decay`` whenever the epoch's
    average loss exceeds the previous epoch's.

    Every ``cfg.save_freq`` epochs the underlying network's ``state_dict``
    is written to ``./checkpoints/last.pth``. The network is unwrapped from
    DataParallel only when it was actually wrapped, so saving also works
    when ``cfg.multi_gpu`` is off.

    Args:
        **kwargs: Configuration overrides forwarded to ``cfg._parse``.
    """
    # 1. configure model
    cfg._parse(kwargs)
    model = MyNet()
    if cfg.load_model_path:
        # The model is still on the CPU here, so map the checkpoint to CPU;
        # this also lets GPU-saved checkpoints load on CPU-only machines.
        model.load_state_dict(torch.load(cfg.load_model_path, map_location='cpu'))
    if cfg.multi_gpu:
        model = parallel.DataParallel(model)
    if cfg.use_gpu:
        model.cuda()

    # 2. prepare data
    train_data = SN(root=cfg.train_data_root, crop_size=cfg.crop_size)
    train_loader = DataLoader(train_data, batch_size=cfg.batch_size, shuffle=True)

    # 3. criterion (already imported) and optimizer
    lr = cfg.lr
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=cfg.momentum)

    # 4. meters
    loss_meter = meter.AverageValueMeter()
    previous_loss = 1e10

    # train
    for epoch in range(cfg.max_epoch):
        print('epoch %s: ===========================' % epoch)
        loss_meter.reset()

        # Wrap the loader (not the enumerate object) so tqdm knows the total.
        for ii, (data, label_group) in enumerate(tqdm(train_loader)):
            # train model
            if cfg.use_gpu:
                data = data.cuda()
                label_group = [label.cuda() for label in label_group]
            data = Variable(data).float()
            label_group = [Variable(label) for label in label_group]

            optimizer.zero_grad()
            score = model(data)
            loss = criterion(score, label_group, batch_size=cfg.batch_size,
                             neg_pos_ratio=cfg.neg_pos_ratio)
            loss.backward()
            optimizer.step()

            # meters update and print
            loss_meter.add(loss.item())
            if (ii + 1) % cfg.print_freq == 0:
                print(loss_meter.value()[0])

        if (epoch + 1) % cfg.save_freq == 0:
            # Only a DataParallel wrapper has ``.module``; a bare model is
            # saved directly.
            net = model.module if isinstance(model, parallel.DataParallel) else model
            torch.save(net.state_dict(), './checkpoints/last.pth')

        # update learning rate
        if loss_meter.value()[0] > previous_loss:
            lr = lr * cfg.lr_decay
            # Second way to lower the learning rate: update the param groups
            # in place so momentum and other optimizer state are not lost.
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]