def memory_after_forward(device, context=None):
    """Return memory consumed by the forward pass of an extended model."""
    memory_init = pytorch_current_memory_usage()

    torch.manual_seed(0)

    # MNIST dummy
    B = 256
    X = torch.rand(B, 1, 28, 28).to(device)
    y = classification_targets((B,), 10).to(device)

    model = torch.nn.Sequential(
        torch.nn.Flatten(),
        torch.nn.Linear(784, 10),
    ).to(device)
    model = extend(model)

    lossfunc = torch.nn.CrossEntropyLoss().to(device)
    lossfunc = extend(lossfunc)

    if context is None:
        context = nullcontext

    with context():
        lossfunc(model(X), y)

    return pytorch_current_memory_usage() - memory_init
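# Usage sketch (added for illustration, not part of the original source): compare the
# memory footprint of the extended forward pass with and without BackPACK's IO tracking.
# Assumes `disable` is imported from backpack and `memory_after_forward` above is in scope.
device = torch.device("cpu")
mem_tracked = memory_after_forward(device)            # default: BackPACK stores module IO
mem_disabled = memory_after_forward(device, disable)  # IO tracking switched off
print("tracked: {} B, disabled: {} B".format(mem_tracked, mem_disabled))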
def setup(device):
    """Load MNIST batch, create extended CNN and loss function. Load to device.

    Args:
        device (torch.device): Device that all objects are transferred to.

    Returns:
        inputs, labels, model, loss function
    """
    X, y = load_one_batch_mnist(batch_size=64)
    X, y = X.to(device), y.to(device)

    model = extend(
        Sequential(
            Conv2d(1, 128, 3, padding=1),
            ReLU(),
            MaxPool2d(3, stride=2),
            Conv2d(128, 256, 3, padding=1),
            ReLU(),
            MaxPool2d(3, padding=1, stride=2),
            Conv2d(256, 64, 3, padding=1),
            ReLU(),
            MaxPool2d(3, stride=2),
            Conv2d(64, 32, 3, padding=1),
            ReLU(),
            MaxPool2d(3, stride=2),
            Flatten(),
            Linear(32, 10),
        ).to(device)
    )

    lossfunc = extend(CrossEntropyLoss().to(device))

    return X, y, model, lossfunc
def backpack_individual_gradients(X, y, model, loss_func):
    """Individual gradients with BackPACK.

    Args:
        X (torch.Tensor): Mini-batch of shape `(N, *)`
        y (torch.Tensor): Labels for `X`
        model (torch.nn.Module): Model for forward pass
        loss_func (torch.nn.Module): Loss function for model prediction

    Returns:
        [torch.Tensor]: Individual gradients for samples in the mini-batch
            with respect to the model parameters. Arranged in the same order
            as `model.parameters()`.
    """
    model = extend(model)
    loss_func = extend(loss_func)

    loss = loss_func(model(X), y)

    with backpack(extensions.BatchGrad()):
        loss.backward()

    individual_gradients = [p.grad_batch for p in model.parameters()]

    return individual_gradients
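# Usage sketch (added for illustration, not from the original source): pair the `setup`
# helper above with `backpack_individual_gradients`. Each returned tensor carries the
# batch dimension in front, i.e. shape `(N, *p.shape)` for parameter `p`.
X, y, model, lossfunc = setup(torch.device("cpu"))
grads = backpack_individual_gradients(X, y, model, lossfunc)
for g, p in zip(grads, model.parameters()):
    assert g.shape == (X.shape[0], *p.shape)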
def __init__(self, obs_shape, num_actions, base_kwargs=None, extra_kwargs=None):
    super(Policy, self).__init__()

    self.use_backpack = extra_kwargs['use_backpack']
    self.recurrent_hidden_state_size = 1

    num_outputs = num_actions
    hidden_size = 512

    conv_init_ = lambda m: init(
        m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0),
        nn.init.calculate_gain('relu'))
    lin_init_ = lambda m: init(
        m, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))

    self.model = nn.Sequential(
        conv_init_(nn.Conv2d(obs_shape[0], 32, 8, stride=4)), nn.ReLU(),
        conv_init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(),
        conv_init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(),
        conv_init_(nn.Linear(32 * 7 * 7, hidden_size)), nn.ReLU(),
        lin_init_(nn.Linear(hidden_size, num_outputs)))

    if self.use_backpack:
        extend(self.model)

    self.model.train()
def __init__(self, arg_dict):
    BaseAlg.__init__(self, arg_dict)

    #! changed the user and item dimensions
    self.learner = extend(
        NeuMF(user_dim=100, item_dim=100, mf_dim=32, mlp_dim=[32, 16, 8],
              lr=1e-3).cuda())
    self.lossfunc = extend(torch.nn.BCELoss())

    #! changed the user embedding filename
    self.path = './Dataset/delicious_100.dat'
    self.user_feature = []
    with open(self.path, 'r') as f:
        for line in f:
            if line.strip():
                self.user_feature.append(
                    torch.from_numpy(
                        np.genfromtxt(io.StringIO(line), delimiter=" ")
                    ).to(dtype=torch.float).cuda())
            else:
                self.user_feature.append(None)
    print(len(self.user_feature))

    self.data = DataLoader()
    self.cnt = 0
    self.batch = 100

    torch.set_num_threads(8)
    torch.set_num_interop_threads(8)

    self.lamdba = 1
    self.nu = 1
    self.U = self.lamdba * torch.ones((self.learner.total_param), dtype=torch.float).cuda()
    self.U1 = torch.zeros((self.learner.total_param), dtype=torch.float).cuda()
    self.g = None
    self.reg = None
    self.t1 = time.time()
def test_convolutions_stride_issue_30(params):
    """
    https://github.com/f-dangel/backpack/issues/30

    The gradient for the convolution is wrong when `stride` is not a multiple of
    `D + 2*padding - dilation*(kernel-1) - 1`.
    """
    torch.manual_seed(0)

    mod = torch.nn.Conv2d(
        in_channels=params["C_in"],
        out_channels=params["C_out"],
        kernel_size=params["K"],
        stride=params["S"],
        padding=params["pad"],
        dilation=params["dil"],
    )
    backpack.extend(mod)

    x = torch.randn(size=(params["N"], params["C_in"], params["W"], params["H"]))

    with backpack.backpack(backpack.extensions.BatchGrad()):
        loss = torch.sum(mod(x))
        loss.backward()

    for p in mod.parameters():
        assert torch.allclose(p.grad, p.grad_batch.sum(0), rtol=1e-04, atol=1e-04)
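# Illustrative fixture (an assumption, not part of the original test file): one `params`
# dictionary the regression test above could be parametrized with. The keys mirror the
# dictionary accesses in the test body.
example_params = {
    "N": 4,      # batch size
    "C_in": 3,   # input channels
    "C_out": 6,  # output channels
    "K": 3,      # kernel size
    "S": 2,      # stride
    "pad": 1,    # padding
    "dil": 1,    # dilation
    "W": 8,      # input width
    "H": 8,      # input height
}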
def test_compute_hessians(network, dataset, data_type, network_type, x, y):
    network.train()

    batch_size, num_iterations = get_batch_type(data_type)
    batch_sampler = BatchSampler(dataset, num_iterations, batch_size)
    # train by iteration, not epoch
    data_loader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=4)

    network_seq = get_seq_network(network_type)
    network_seq = copy_network(network, network_seq)

    criterion = nn.CrossEntropyLoss()
    criterion = extend(criterion)
    network_seq = extend(network_seq).cuda()

    hessians = None
    x = x.cuda()
    y = y.cuda()
    if data_type == 'mnist':
        x = x.view(len(x), -1)

    out = network_seq(x)
    loss = criterion(out, y)
    with backpack(DiagHessian()):
        loss.backward()
    hessians = get_hessians(network_seq, hessians)

    return hessians, x, y
def Diag_second_order(model, train_loader, prec0=10, device='cpu'):
    """Diagonal second-order (Laplace) approximation of the last linear layer via BackPACK's DiagHessian."""
    W = list(model.parameters())[-2]
    b = list(model.parameters())[-1]
    m, n = W.shape
    print("n: {} inputs to linear layer with m: {} classes".format(n, m))

    lossfunc = torch.nn.CrossEntropyLoss()

    var0 = 1 / prec0

    extend(lossfunc, debug=False)
    extend(model.linear, debug=False)

    with backpack(DiagHessian()):
        max_len = len(train_loader)
        weights_cov = torch.zeros(max_len, m, n, device=device)
        biases_cov = torch.zeros(max_len, m, device=device)

        for batch_idx, (x, y) in enumerate(train_loader):
            if device == 'cuda':
                x, y = x.cuda(), y.cuda()

            model.zero_grad()
            lossfunc(model(x), y).backward()

            with torch.no_grad():
                # Hessian of weight
                W_ = W.diag_h
                b_ = b.diag_h

                # add_prior: since it will be flattened later we can just add the prior like that
                W_ += var0 * torch.ones(W_.size(), device=device)
                b_ += var0 * torch.ones(b_.size(), device=device)

            weights_cov[batch_idx] = W_
            biases_cov[batch_idx] = b_

            print("Batch: {}/{}".format(batch_idx, max_len))

        print(len(weights_cov))
        C_W = torch.mean(weights_cov, dim=0)
        C_b = torch.mean(biases_cov, dim=0)

    # Predictive distribution
    with torch.no_grad():
        M_W_post = W.t()
        M_b_post = b

        C_W_post = C_W
        C_b_post = C_b

    print("M_W_post size: ", M_W_post.size())
    print("M_b_post size: ", M_b_post.size())
    print("C_W_post size: ", C_W_post.size())
    print("C_b_post size: ", C_b_post.size())

    return (M_W_post, M_b_post, C_W_post, C_b_post)
def KFLP_second_order(model, batch_size, train_loader, var0=10, device='cpu'):
    """Kronecker-factored (KFAC) Laplace approximation of the last layer `model.fc` via BackPACK."""
    W = list(model.parameters())[-2]
    b = list(model.parameters())[-1]
    m, n = W.shape

    lossfunc = torch.nn.CrossEntropyLoss()

    tau = 1 / var0

    extend(lossfunc, debug=False)
    extend(model.fc, debug=False)

    with backpack(KFAC()):
        U, V = torch.zeros(m, m, device=device), torch.zeros(n, n, device=device)
        B = torch.zeros(m, m, device=device)

        max_len = int(np.ceil(len(train_loader.dataset) / batch_size))

        for batch_idx, (x, y) in enumerate(train_loader):
            if device == 'cuda':
                x, y = x.cuda(), y.cuda()

            model.zero_grad()
            lossfunc(model(x), y).backward()

            with torch.no_grad():
                # Hessian of weight
                U_, V_ = W.kfac
                B_ = b.kfac[0]

                U_ = np.sqrt(batch_size) * U_ + np.sqrt(tau) * torch.eye(m, device=device)
                V_ = np.sqrt(batch_size) * V_ + np.sqrt(tau) * torch.eye(n, device=device)
                B_ = batch_size * B_ + tau * torch.eye(m, device=device)

                rho = min(1 - 1 / (batch_idx + 1), 0.95)

                U = rho * U + (1 - rho) * U_
                V = rho * V + (1 - rho) * V_
                B = rho * B + (1 - rho) * B_

            print("Batch: {}/{}".format(batch_idx, max_len))

    # Predictive distribution
    with torch.no_grad():
        M_W_post = W.t()
        M_b_post = b

        # Covariances for Laplace
        U_post = torch.inverse(V)  # Interchanged since W is transposed
        V_post = torch.inverse(U)
        B_post = torch.inverse(B)

    return (M_W_post, M_b_post, U_post, V_post, B_post)
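# Usage sketch (an assumption, not from the original source): fit the KFAC Laplace factors
# for a classifier whose last layer is exposed as `model.fc`, as the function above
# requires. `model` and `train_loader` are placeholders for the caller's objects.
M_W, M_b, U_post, V_post, B_post = KFLP_second_order(
    model, batch_size=128, train_loader=train_loader, var0=10, device='cuda')
# U_post and V_post are the Kronecker factors of the weight covariance, B_post the bias covariance.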
def __init__(
    self,
    tproblem,
    logpath,
    track_interval=1,
    quantities=None,
    plot=True,
    plot_schedule=None,
    secondary_screen=False,
):
    """Initialize the Cockpit.

    Args:
        tproblem (deepobs.pytorch.testproblem): A DeepOBS testproblem.
            Alternatively, it could also be a general PyTorch net.
        logpath (str): Path to the log file.
        track_interval (int, optional): Tracking rate. Defaults to 1,
            meaning every iteration is tracked.
        quantities (list, optional): List of quantities (classes or instances)
            that should be tracked. Defaults to None, which would use all
            implemented ones.
        plot (bool, optional): Whether results should be plotted.
        plot_schedule (callable): Function that maps an iteration to a boolean
            which determines if a plot should be created and tracked data
            output should be written.
        secondary_screen (bool): Whether to plot other experimental quantities
            on a secondary screen.
    """
    # Store all parameters as attributes
    self.tproblem = tproblem
    self.logpath = logpath
    self.track_interval = track_interval
    self.quantities = quantities

    self.create_graph = False
    self.output = defaultdict(dict)

    # Collect quantities
    self.quantities = self._collect_quantities(quantities, track_interval)

    # Extend testproblem
    if isinstance(tproblem, TestProblem):
        extend_with_access_unreduced_loss(tproblem, detach=True)
    else:
        model, lossfunc = tproblem
        extend(model)
        extend(lossfunc)

    # Prepare logpath
    self._prepare_logpath(logpath)

    # Create a Cockpit Plotter instance
    self._plot_schedule = plot_schedule
    self._enable_plotting = plot
    if self._enable_plotting:
        self.cockpit_plotter = CockpitPlotter(
            self.logpath, secondary_screen=secondary_screen)
def test_network_diag_ggn(model_and_input):
    """Test whether the given module can compute diag_ggn.

    This test is placed here, because some models are too big to run with PyTorch.
    Thus, a full diag_ggn comparison with PyTorch is impossible.
    This test just checks whether it runs on BackPACK without errors.
    Additionally, it checks whether the forward pass is identical to the original model.
    Finally, a small number of elements of DiagGGN are compared.

    Args:
        model_and_input: module to test

    Raises:
        NotImplementedError: if loss_fn is not MSELoss or CrossEntropyLoss
    """
    model_original, x, loss_fn = model_and_input
    model_original = model_original.eval()
    output_compare = model_original(x)
    if isinstance(loss_fn, MSELoss):
        y = regression_targets(output_compare.shape)
    elif isinstance(loss_fn, CrossEntropyLoss):
        y = classification_targets(
            (output_compare.shape[0], *output_compare.shape[2:]),
            output_compare.shape[1],
        )
    else:
        raise NotImplementedError(f"test cannot handle loss_fn = {type(loss_fn)}")

    num_params = sum(p.numel() for p in model_original.parameters() if p.requires_grad)
    num_to_compare = 10
    idx_to_compare = linspace(0, num_params - 1, num_to_compare, dtype=int32)
    diag_ggn_exact_to_compare = autograd_diag_ggn_exact(
        x, y, model_original, loss_fn, idx=idx_to_compare)

    model_extended = extend(model_original, use_converter=True, debug=True)
    output = model_extended(x)
    assert allclose(output, output_compare)

    loss = extend(loss_fn)(output, y)
    with backpack(DiagGGNExact()):
        loss.backward()

    diag_ggn_exact_vector = cat([
        p.diag_ggn_exact.flatten()
        for p in model_extended.parameters()
        if p.requires_grad
    ])

    for idx, element in zip(idx_to_compare, diag_ggn_exact_to_compare):
        assert allclose(element, diag_ggn_exact_vector[idx], atol=1e-5)
def get_posterior(model, train_loader, var0, mnist=False, batch_size=128):
    """KFAC Laplace posterior over the model's last layer (`model.fc2` for MNIST, `model.linear` otherwise)."""
    W = list(model.parameters())[-2]
    b = list(model.parameters())[-1]
    m, n = W.shape

    lossfunc = torch.nn.CrossEntropyLoss()

    tau = 1 / var0

    extend(lossfunc, debug=False)
    extend(model.linear if not mnist else model.fc2, debug=False)

    with backpack(KFAC()):
        U, V = torch.zeros(m, m, device='cuda'), torch.zeros(n, n, device='cuda')
        B = torch.zeros(m, m, device='cuda')

        # for i, (x, y) in tqdm(enumerate(train_loader)):
        for i, (x, y) in enumerate(train_loader):
            x, y = x.cuda(), y.cuda()

            model.zero_grad()
            lossfunc(model(x), y).backward()

            with torch.no_grad():
                # Hessian of weight
                U_, V_ = W.kfac
                B_ = b.kfac[0]

                # U_ = sqrt(batch_size)*U_ + sqrt(tau)*torch.eye(m, device='cuda')
                # V_ = sqrt(batch_size)*V_ + sqrt(tau)*torch.eye(n, device='cuda')
                # B_ = batch_size*B_ + tau*torch.eye(m, device='cuda')

                rho = min(1 - 1/(i+1), 0.95)

                U = rho*U + (1-rho)*U_
                V = rho*V + (1-rho)*V_
                B = rho*B + (1-rho)*B_

    # Predictive distribution
    with torch.no_grad():
        M_W_post = W.t()
        M_b_post = b

        # Add priors
        n_data = len(train_loader.dataset)
        U = sqrt(n_data)*U + sqrt(tau)*torch.eye(m, device='cuda')
        V = sqrt(n_data)*V + sqrt(tau)*torch.eye(n, device='cuda')
        B = n_data*B + tau*torch.eye(m, device='cuda')

        # Covariances for Laplace
        U_post = torch.inverse(V)  # Interchanged since W is transposed
        V_post = torch.inverse(U)
        B_post = torch.inverse(B)

    return M_W_post, M_b_post, U_post, V_post, B_post
def _preprocess(self, tproblem, backpack_debug):
    """Make model and loss function BackPACKable."""
    extend(tproblem.net, debug=backpack_debug)

    tproblem._old_loss = tproblem.loss_function

    def hotfix_lossfunc(reduction="mean"):
        return extend(tproblem._old_loss(reduction=reduction), debug=backpack_debug)

    tproblem.loss_function = hotfix_lossfunc

    return tproblem
def data_prep_cifar10_small(use_sigmoid=False):
    model = extend(net_cifar10_3c3d_small(use_sigmoid)).to(device)
    lossfunc = extend(nn.CrossEntropyLoss())
    dataset = datasets.CIFAR10(
        './data', train=True, download=True, transform=cifar_transform
    )
    return model, lossfunc, make_loader_for_dataset(dataset)
def make_small_linear_classification_problem():
    Ds = [32, 16, 4]
    model = torch.nn.Sequential(
        extend(torch.nn.Linear(Ds[0], Ds[1])),
        extend(torch.nn.Sigmoid()),
        extend(torch.nn.Linear(Ds[1], Ds[2])),
    )
    N = 32
    X = torch.randn(size=(N, Ds[0]))
    Y = torch.randint(high=Ds[-1], size=(N,))
    lossfunc = extend(torch.nn.CrossEntropyLoss())

    return TestProblem(X, Y, model, lossfunc)
def dummy_forward_pass_conv():
    N, C, H, W = 2, 3, 4, 4
    X = torch.randn(N, C, H, W)
    Y = torch.randint(high=5, size=(N,))

    conv = Conv2d(3, 2, 2)
    lin = Linear(18, 5)
    model = extend(Sequential(conv, Flatten(), lin))
    loss = extend(CrossEntropyLoss())

    def forward():
        return loss(model(X), Y)

    return forward, (conv.weight, lin.weight), (conv.bias, lin.bias)
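# Usage sketch (added for illustration, not from the original source): run the returned
# forward closure inside a `backpack` context so the exposed weights and biases receive
# per-sample gradients. Assumes `backpack` and `extensions` are imported from the
# backpack package, as in the other snippets here.
forward, weights, biases = dummy_forward_pass_conv()
with backpack(extensions.BatchGrad()):
    forward().backward()
for param in weights + biases:
    print(param.grad_batch.shape)  # leading dimension is the batch size N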
def test_extension_hook_executes_on_custom_module():
    """Cockpit's extension hook is only skipped for known containers like Sequential.

    It will thus execute on custom containers and lead to crashes whenever a quantity
    that uses extension hooks is used.
    """
    manual_seed(0)
    N, D_in, D_out = 2, 3, 1

    # NOTE Inheriting from Sequential passes
    class CustomModule(Module):
        """Custom container that is not skipped by the extension hook."""

        def __init__(self):
            super().__init__()
            self.linear = Linear(D_in, D_out)
            self.relu = ReLU()

        def forward(self, x: Tensor) -> Tensor:
            return self.relu(self.linear(x))

    uses_extension_hook = GradHist1d(linear(interval=1))
    config = [uses_extension_hook]

    model = extend(CustomModule())
    cockpit = Cockpit(model.parameters(), quantities=config)

    opt = SGD(model.parameters(), lr=0.1)

    loss_fn = extend(MSELoss(reduction="mean"))
    individual_loss_fn = MSELoss(reduction="none")

    global_step = 0
    inputs, labels = rand(N, D_in), rand(N, D_out)

    # forward pass
    outputs = model(inputs)
    loss = loss_fn(outputs, labels)
    losses = individual_loss_fn(outputs, labels)

    # backward pass
    with cockpit(
        global_step,
        info={
            "batch_size": N,
            "individual_losses": losses,
            "loss": loss,
            "optimizer": opt,
        },
    ):
        loss.backward(create_graph=cockpit.create_graph(global_step))
def test_no_io():
    """Check IO is not tracked."""
    torch.manual_seed(0)

    input = torch.rand(3, 5)
    module = torch.nn.Linear(5, 2)
    extend(module)

    with disable():
        module(input)
        assert not hasattr(module, "input0")
        assert not hasattr(module, "output")

    module(input)
    assert hasattr(module, "input0")
    assert hasattr(module, "output")
def __init__(self, X, Y, model, lossfunc, device=DEVICE_CPU):
    """A traditional machine learning test problem: loss(model(X), Y).

    X: [N x D_X]
    Y: [N x D_Y]
    model: [N x D_X] -> [N x D_out]
    loss: [N x D_out] x [N x D_y] -> scalar
    """
    self.X = X
    self.Y = Y
    self.model = extend(model)
    self.lossfunc = extend(lossfunc)

    self.device = device
    self.to(device)

    self.N = self.X.shape[0]
def convlayer2(conv_cls, settings):
    return extend(
        conv_cls(
            in_channels=settings["in_features"][0],
            out_channels=settings["out_channels"],
            kernel_size=settings["kernel_size"],
            padding=settings["padding"],
            bias=settings["bias"]))
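# Illustrative settings (an assumption, not from the original source): one dictionary
# with the keys `convlayer2` above reads, following the `TEST_SETTINGS` layout used by
# the other convlayer helpers in this collection. Values are placeholders.
example_settings = {
    "in_features": (3, 8, 8),  # (channels, height, width) of the input
    "out_channels": 4,
    "kernel_size": 3,
    "padding": 1,
    "bias": True,
}
layer = convlayer2(torch.nn.Conv2d, example_settings)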
def test_for_loop_replace() -> None:
    """Application of retain_graph: replace an outer for-loop.

    This test is based on issue #220 opened by Romain3Ch216.
    It computes per-component individual gradients of a tensor-valued output
    with a for loop over components, rather than over samples and components.
    """
    manual_seed(0)
    B = 5
    M = 3
    h = 2

    x = randn(B, h)
    fc = extend(Linear(h, M))
    A = fc(x)

    grad_autograd = zeros(B, M, *fc.weight.shape)
    for b in range(B):
        for m in range(M):
            with backpack(retain_graph=True):
                grads = autograd.grad(A[b, m], fc.weight, retain_graph=True)
            grad_autograd[b, m] = grads[0]

    grad_backpack = zeros(B, M, *fc.weight.shape)
    for i in range(M):
        with backpack(BatchGrad(), retain_graph=True):
            A[:, i].backward(ones_like(A[:, i]), retain_graph=True)
        grad_backpack[:, i] = fc.weight.grad_batch

    check_sizes_and_values(grad_backpack, grad_autograd)
def train_model_for_label(ar, label):
    img_shape = (ar.channels, ar.img_size, ar.img_size)
    device = pt.device("cuda" if pt.cuda.is_available() else "cpu")

    # Initialize generator and discriminator
    gen = Generator(ar.latent_dim, img_shape).to(device)
    dis = Discriminator(img_shape).to(device)

    if ar.dp_noise > 0.:
        dis = extend(dis)

    dataloader, n_data = get_single_label_dataloader(ar.batch_size, label, ar.data_key)

    # Optimizers
    gen_opt = pt.optim.RMSprop(gen.parameters(), lr=ar.lr)
    dis_opt = pt.optim.RMSprop(dis.parameters(), lr=ar.lr)

    batches_done = 0
    for epoch in range(ar.n_epochs):
        for idx, (real_imgs, _) in enumerate(dataloader):
            train_gen = batches_done % ar.n_critic == 0
            is_final_batch = epoch + 1 == ar.n_epochs and idx + 1 == len(dataloader)

            log_vals = train_batch(real_imgs, device, dis_opt, gen_opt, dis, gen,
                                   ar.clip_value, train_gen, ar.dp_clip, ar.dp_noise)
            log_progress(log_vals, batches_done, len(dataloader), epoch, ar, label,
                         is_final_batch)
            batches_done += 1

    if ar.synth_data:
        make_synth_data(gen, n_data, device, ar.log_name, label)
def convlayer():
    return extend(
        torch.nn.Conv2d(
            in_channels=TEST_SETTINGS["in_features"][0],
            out_channels=TEST_SETTINGS["out_channels"],
            kernel_size=TEST_SETTINGS["kernel_size"],
            padding=TEST_SETTINGS["padding"],
            bias=TEST_SETTINGS["bias"]))
def problem(device, request) -> Tuple[Module, Tensor, str]:
    """Return extended nested sequential with loss from a forward pass.

    Args:
        device: available device
        request: pytest request

    Yields:
        model, loss and problem_string

    Raises:
        NotImplementedError: if the problem_string is unknown
    """
    problem_string = request.param
    manual_seed(0)

    B = 2
    X = rand(B, 4).to(device)
    y = classification_targets((B,), 2).to(device)

    if problem_string == NESTED_SEQUENTIAL:
        model = Sequential(
            Linear(4, 3, bias=False),
            Sequential(
                Linear(3, 2, bias=False),
            ),
        )
    elif problem_string == CUSTOM_CONTAINER:

        class _MyCustomModule(Module):
            def __init__(self):
                super().__init__()
                self.linear1 = Linear(4, 3, bias=False)
                self.linear2 = Linear(3, 2, bias=False)

            def forward(self, x):
                x = self.linear1(x)
                x = self.linear2(x)
                return x

        model = _MyCustomModule()
    else:
        raise NotImplementedError(
            f"problem={problem_string} but no test setting for this.")

    model = extend(model.to(device))
    lossfunc = extend(CrossEntropyLoss(reduction="mean").to(device))
    loss = lossfunc(model(X), y)
    yield model, loss, problem_string
def backpack_ea_jac_t_mat_jac_prod(layer, input, mat):
    layer = extend(layer)
    derivative = derivative_from_layer(layer)

    # forward pass to initialize backpack buffers
    _ = layer(input)

    return derivative.ea_jac_t_mat_jac_prod(layer, None, None, mat)
def make_classification_problem(pooling_cls):
    model = torch.nn.Sequential(convlayer(), pooling(pooling_cls), Flatten())

    Y = torch.randint(high=X.shape[1], size=(model(X).shape[0],))

    lossfunc = extend(torch.nn.CrossEntropyLoss())

    return TestProblem(X, Y, model, lossfunc)
def backpack_sum_hessian(layer, input, targets):
    layer = extend(layer)
    derivative = derivative_from_layer(layer)

    # forward pass to initialize backpack buffers
    _ = layer(input, targets)

    sum_hessian = derivative.sum_hessian(layer, None, None)
    return sum_hessian
def data():
    N = 5
    Ds = [20, 10, 3]

    X = randn(N, Ds[0])
    Y = randint(high=Ds[-1], size=(N,))

    manual_seed(0)
    model1 = Sequential(
        extend(Linear(Ds[0], Ds[1])),
        extend(Linear(Ds[1], Ds[2])))

    manual_seed(0)
    model2 = Sequential(
        extend(LinearConcat(Ds[0], Ds[1])),
        extend(LinearConcat(Ds[1], Ds[2])))

    loss = CrossEntropyLoss()

    return X, Y, model1, model2, loss
def convlayer2(join_params):
    conv_cls = Conv2dConcat if join_params else Conv2d
    return extend(
        conv_cls(
            in_channels=TEST_SETTINGS["in_features"][0],
            out_channels=TEST_SETTINGS["out_channels"],
            kernel_size=TEST_SETTINGS["kernel_size"],
            padding=TEST_SETTINGS["padding"],
            bias=TEST_SETTINGS["bias"]))
def make_regression_problem(pooling_cls):
    model = torch.nn.Sequential(convlayer(), pooling(pooling_cls),
                                torch.nn.Flatten(), linearlayer())

    Y = torch.randn(size=(model(X).shape[0], 1))

    lossfunc = extend(torch.nn.MSELoss())

    return TestProblem(X, Y, model, lossfunc)