def test_resnet_resnet101(self):
    with _test_eager_guard():
        model = resnet101(pretrained=False)
        egr_data = paddle.to_tensor(self.data)
        egr_data.stop_gradient = False
        egr_out = model(egr_data)
        egr_preds = paddle.argmax(egr_out, axis=1)
        egr_label_onehot = paddle.nn.functional.one_hot(
            paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1])
        egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1)

        egr_g = paddle.grad(outputs=egr_target, inputs=egr_out)[0]
        egr_g_numpy = egr_g.numpy()
        self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape))

    model = resnet101(pretrained=False)
    data = paddle.to_tensor(self.data)
    data.stop_gradient = False
    out = model(data)
    preds = paddle.argmax(out, axis=1)
    label_onehot = paddle.nn.functional.one_hot(
        paddle.to_tensor(preds), num_classes=out.shape[1])
    target = paddle.sum(out * label_onehot, axis=1)

    g = paddle.grad(outputs=target, inputs=out)[0]
    g_numpy = g.numpy()
    self.assertEqual(list(g_numpy.shape), list(out.shape))

    self.assertTrue(np.array_equal(egr_out, out))
    self.assertTrue(np.array_equal(egr_g_numpy, g_numpy))
def jacobian(self, coordinates):
    new_coordinates = self.warp_coordinates(coordinates)
    # Paddle cannot use new_coordinates[..., 0] here, so slice explicitly.
    assert len(new_coordinates.shape) == 3
    grad_x = paddle.grad(new_coordinates[:, :, 0].sum(),
                         coordinates,
                         create_graph=True)
    grad_y = paddle.grad(new_coordinates[:, :, 1].sum(),
                         coordinates,
                         create_graph=True)
    jacobian = paddle.concat(
        [grad_x[0].unsqueeze(-2), grad_y[0].unsqueeze(-2)], axis=-2)
    return jacobian
def finetunning(self, x_spt, y_spt, x_qry, y_qry):
    # assert len(x_spt.shape) == 4
    query_size = x_qry.shape[0]
    correct_list = [0 for _ in range(self.update_step_test + 1)]

    new_net = deepcopy(self.net)
    y_hat = new_net(x_spt)
    loss = F.cross_entropy(y_hat, y_spt)
    grad = paddle.grad(loss, new_net.parameters())
    fast_weights = list(
        map(lambda p: p[1] - self.base_lr * p[0],
            zip(grad, new_net.parameters())))

    # Evaluate on the query set and compute accuracy.
    # This step uses the parameters before the update.
    with paddle.no_grad():
        y_hat = new_net(x_qry,
                        params=new_net.parameters(),
                        bn_training=True)
        pred_qry = F.softmax(y_hat, axis=1).argmax(axis=1)  # size = (75)
        correct = paddle.equal(pred_qry, y_qry).numpy().sum().item()
        correct_list[0] += correct

    # Evaluate on the query set with the updated parameters.
    with paddle.no_grad():
        y_hat = new_net(x_qry, params=fast_weights, bn_training=True)
        pred_qry = F.softmax(y_hat, axis=1).argmax(axis=1)  # size = (75)
        correct = paddle.equal(pred_qry, y_qry).numpy().sum().item()
        correct_list[1] += correct

    for k in range(1, self.update_step_test):
        y_hat = new_net(x_spt, params=fast_weights, bn_training=True)
        loss = F.cross_entropy(y_hat, y_spt)
        grad = paddle.grad(loss, fast_weights)
        fast_weights = list(
            map(lambda p: p[1] - self.base_lr * p[0],
                zip(grad, fast_weights)))

        y_hat = new_net(x_qry, fast_weights, bn_training=True)

        with paddle.no_grad():
            pred_qry = F.softmax(y_hat, axis=1).argmax(axis=1)
            correct = paddle.equal(pred_qry, y_qry).numpy().sum().item()
            correct_list[k + 1] += correct

    del new_net
    accs = np.array(correct_list) / query_size
    return accs
def _kl_expfamily_expfamily(p, q):
    """Compute KL divergence using `Bregman divergences
    <https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf>`_
    """
    if not type(p) == type(q):
        raise NotImplementedError

    p_natural_params = []
    for param in p._natural_parameters:
        param = param.detach()
        param.stop_gradient = False
        p_natural_params.append(param)

    q_natural_params = q._natural_parameters

    p_log_norm = p._log_normalizer(*p_natural_params)

    try:
        if _non_static_mode():
            p_grads = paddle.grad(p_log_norm,
                                  p_natural_params,
                                  create_graph=True)
        else:
            p_grads = paddle.static.gradients(p_log_norm, p_natural_params)
    except RuntimeError as e:
        raise TypeError(
            "Can't compute kl_divergence({cls_p}, {cls_q}) using Bregman "
            "divergence. Please register_kl({cls_p}, {cls_q}).".format(
                cls_p=type(p).__name__, cls_q=type(q).__name__)) from e

    kl = q._log_normalizer(*q_natural_params) - p_log_norm
    for p_param, q_param, p_grad in zip(p_natural_params, q_natural_params,
                                        p_grads):
        term = (q_param - p_param) * p_grad
        kl -= _sum_rightmost(term, len(q.event_shape))

    return kl
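# The function above relies on the Bregman-divergence identity
# KL(p || q) = F(eta_q) - F(eta_p) - <eta_q - eta_p, grad F(eta_p)>,
# where F is the log-normalizer and eta the natural parameters.
# Below is a minimal, hedged sketch (not part of the library code): the helpers
# normal_natural_params and normal_log_normalizer are illustrative, written for a
# univariate Normal, and the result is checked against the closed-form Normal KL.
import numpy as np
import paddle


def normal_natural_params(mu, sigma):
    # Natural parameters of N(mu, sigma^2): eta1 = mu / sigma^2, eta2 = -1 / (2 sigma^2).
    return mu / sigma**2, -1.0 / (2.0 * sigma**2)


def normal_log_normalizer(eta1, eta2):
    # Log-normalizer F(eta) = -eta1^2 / (4 eta2) - 0.5 * log(-2 eta2).
    return -eta1 * eta1 / (4.0 * eta2) - 0.5 * paddle.log(-2.0 * eta2)


mu_p, sigma_p, mu_q, sigma_q = 0.3, 1.2, -0.5, 0.8
eta_p = [paddle.to_tensor(v, stop_gradient=False)
         for v in normal_natural_params(mu_p, sigma_p)]
eta_q = [paddle.to_tensor(v) for v in normal_natural_params(mu_q, sigma_q)]

log_norm_p = normal_log_normalizer(*eta_p)
grads_p = paddle.grad(log_norm_p, eta_p)  # grad F(eta_p) = E_p[T(x)]

kl = normal_log_normalizer(*eta_q) - log_norm_p
for qp, pp, g in zip(eta_q, eta_p, grads_p):
    kl -= (qp - pp) * g

# Closed-form KL between the two Normals, for comparison.
kl_ref = (np.log(sigma_q / sigma_p)
          + (sigma_p**2 + (mu_p - mu_q)**2) / (2.0 * sigma_q**2) - 0.5)
print(float(kl), kl_ref)  # the two values should match closely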
def entropy(self):
    """Calculate entropy using the `Bregman divergence
    <https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf>`_
    """
    entropy_value = -self._mean_carrier_measure

    natural_parameters = []
    for parameter in self._natural_parameters:
        parameter = parameter.detach()
        parameter.stop_gradient = False
        natural_parameters.append(parameter)

    log_norm = self._log_normalizer(*natural_parameters)

    if in_dygraph_mode():
        grads = paddle.grad(log_norm.sum(),
                            natural_parameters,
                            create_graph=True)
    else:
        grads = paddle.static.gradients(log_norm.sum(), natural_parameters)

    entropy_value += log_norm
    for p, g in zip(natural_parameters, grads):
        entropy_value -= p * g

    return entropy_value
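# The entropy computed above follows the exponential-family identity
# H(p) = -E_p[log h(x)] + F(eta) - <eta, grad F(eta)>, with grad F obtained via
# paddle.grad. A short hedged sketch for a univariate Normal (names illustrative,
# not from the library); the result should match 0.5 * log(2 * pi * e * sigma^2).
import math
import paddle

mu, sigma = 0.7, 1.5
eta = [paddle.to_tensor(mu / sigma**2, stop_gradient=False),
       paddle.to_tensor(-1.0 / (2.0 * sigma**2), stop_gradient=False)]
# Log-normalizer F(eta) of the Normal family and its gradient via paddle.grad.
log_norm = -eta[0] * eta[0] / (4.0 * eta[1]) - 0.5 * paddle.log(-2.0 * eta[1])
grads = paddle.grad(log_norm, eta)

# -E_p[log h(x)] = 0.5 * log(2 * pi) for the Normal base measure h(x) = 1/sqrt(2*pi).
entropy = 0.5 * math.log(2.0 * math.pi) + float(log_norm)
entropy -= sum(float(p) * float(g) for p, g in zip(eta, grads))
print(entropy, 0.5 * math.log(2.0 * math.pi * math.e * sigma**2))  # values should agree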
def test_hook_in_double_grad(self):
    def double_print_hook(grad):
        grad = grad * 2
        print(grad)
        return grad

    x = paddle.ones(shape=[1], dtype='float32')
    x.stop_gradient = False

    # The hook only works in backward: for the forward var x, the gradient
    # generated by paddle.grad will not be processed by the hook.
    x.register_hook(double_print_hook)

    y = x * x

    # Since y = x * x, dx = 2 * x.
    dx = paddle.grad(outputs=[y],
                     inputs=[x],
                     create_graph=True,
                     retain_graph=True)[0]

    z = y + dx
    self.assertTrue(x.grad is None)

    # If create_graph = True, the gradient of dx would be backpropagated.
    # Therefore, z = x * x + dx = x * x + 2 * x, and
    # x.gradient() = 2 * x + 2 = 4.0; after being doubled by the hook: 8.0.
    z.backward()
    self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.])))
def test_create_graph_false(self):
    def func(x):
        return paddle.matmul(x * x, self.weight)[:, 0:1]

    numerical_hessian = _compute_numerical_batch_hessian(
        func, self.x, self.numerical_delta, self.np_dtype)

    self.x.stop_gradient = False
    hessian = paddle.autograd.batch_hessian(func, self.x)
    assert hessian.stop_gradient == True
    assert np.allclose(hessian.numpy(), numerical_hessian, self.rtol,
                       self.atol)
    try:
        paddle.grad(hessian, self.x)
    except RuntimeError as e:
        error_msg = cpt.get_exception_message(e)
        assert error_msg.find("has no gradient") > 0
def run_gelu_op(approximate):
    with dg.guard():
        x = paddle.to_tensor(x_np)
        x.stop_gradient = False
        y = F.gelu(x, approximate=approximate)
        x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
        return y.numpy(), x_grad.numpy()
def test_check_grad(self):
    paddle.disable_static(place=self.place)
    shape = (4, 5)
    x_np = np.random.uniform(-1, 1, shape).astype(np.float64)
    x_np[0, :] = np.nan
    x_np[1, :3] = np.nan
    x_np[2, 3:] = np.nan
    x_np_sorted = np.sort(x_np)
    nan_counts = np.count_nonzero(np.isnan(x_np).astype(np.int32), axis=1)
    np_grad = np.zeros((shape))
    for i in range(shape[0]):
        valid_cnts = shape[1] - nan_counts[i]
        if valid_cnts == 0:
            continue

        mid = int(valid_cnts / 2)
        targets = [x_np_sorted[i, mid]]
        is_odd = valid_cnts % 2
        if not is_odd and mid > 0:
            targets.append(x_np_sorted[i, mid - 1])
        for j in range(shape[1]):
            if x_np[i, j] in targets:
                np_grad[i, j] = 1 if is_odd else 0.5

    x_tensor = paddle.to_tensor(x_np, stop_gradient=False)
    y = paddle.nanmedian(x_tensor, axis=1, keepdim=True)
    dx = paddle.grad(y, x_tensor)[0].numpy()
    self.assertTrue(np.allclose(np_grad, dx, equal_nan=True))
def adapt_gradient_descent(self, model, lr, loss, approximate=True, memo=None):
    # copy the function from paddlefsl.utils.gradient_descent
    # Maps original data_ptr to the cloned tensor.
    # Useful when a model uses parameters from another model.
    memo = set() if memo is None else set(memo)
    # Do gradient descent on parameters
    gradients = []
    if len(model.layers.parameters()) != 0:
        gradients = paddle.grad(loss,
                                model.layers.parameters(),
                                retain_graph=not approximate,
                                create_graph=not approximate,
                                allow_unused=True)
    update_values = [
        -lr * grad if grad is not None else None for grad in gradients
    ]
    for param, update in zip(model.layers.parameters(), update_values):
        if update is not None:
            param_ptr = id(param)
            if param_ptr in memo:
                param.set_value(param.add(update))
def get_eager_triple_grad(func,
                          x_init=None,
                          dy_init=None,
                          place=None,
                          return_mid_result=False):
    """
    Get triple grad result of dygraph.

    Args:
        func: A wrapped dygraph function whose logic is equal to the static program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        return_mid_result (bool): If set True, the intermediate results are also returned.
    Returns:
        A list of numpy arrays that stores the third derivative result calculated by dygraph.
    """
    dd_y, dd_x = get_eager_double_grad(func,
                                       x_init,
                                       dy_init,
                                       place,
                                       return_mid_result=True)

    # calculate third derivative
    dddys = []
    for dd_yi in dd_y:
        dd_yi.stop_gradient = False
        dddy = paddle.ones(shape=dd_yi.shape, dtype=dd_yi.dtype)
        dddy.stop_gradient = False
        dddys.append(dddy)
    ddd_inputs = paddle.grad(outputs=dd_y, inputs=dd_x, grad_outputs=dddys)
    return [ddd_input.numpy() for ddd_input in ddd_inputs]
def test_vjp_i1o1_no_create_graph(self):
    test_cases = [
        [reduce, 'A'],  #noqa
        [reduce_dim, 'A'],  #noqa
    ]  #noqa
    for f, inputs in test_cases:
        vjp, grad = self.gen_test_pairs(f, inputs)
        vjp_result, grad_result = vjp(), grad()
        self.check_results(grad_result, vjp_result)
def test_vjp_nested_no_create_graph(self):
    x = self.gen_input('a')
    test_cases = [
        [nested(x), 'a'],  #noqa
    ]
    for f, inputs in test_cases:
        vjp, grad = self.gen_test_pairs(f, inputs)
        vjp_result, grad_result = vjp(), grad()
        self.check_results(grad_result, vjp_result)
def forward(self, x):
    x.stop_gradient = False
    tmp = x + x
    for i in range(10):
        tmp = self.linear(tmp)

    out = tmp
    dx = paddle.grad([out], [x],
                     None,
                     create_graph=True,
                     allow_unused=False)[0]
    return dx
def test_vjp_i2o2_omitting_v_no_create_graph(self):
    test_cases = [
        [o2, ['A', 'A']],  #noqa
    ]  #noqa
    for f, inputs in test_cases:
        inputs = self.gen_inputs(inputs)
        vjp, grad = self.gen_test_pairs(f, inputs)
        vjp_result, grad_result = vjp(), grad()
        self.check_results(grad_result, vjp_result)
def test_vjp_i2o1_no_create_graph(self):
    test_cases = [
        [matmul, ['A', 'B']],  #noqa
        [mul, ['b', 'c']],  #noqa
    ]  #noqa
    for f, inputs in test_cases:
        vjp, grad = self.gen_test_pairs(f, inputs)
        vjp_result, grad_result = vjp(), grad()
        self.check_results(grad_result, vjp_result)
def test_create_graph_false(self):
    def func(x, y):
        return x * y

    numerical_jacobian = _compute_numerical_batch_jacobian(
        func, [self.x, self.y], self.numerical_delta, self.np_dtype)

    self.x.stop_gradient = False
    self.y.stop_gradient = False
    jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y])

    for j in range(len(jacobian)):
        assert jacobian[j].stop_gradient == True
        assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j],
                           self.rtol, self.atol)
    try:
        paddle.grad(jacobian[0], [self.x, self.y])
    except RuntimeError as e:
        error_msg = cpt.get_exception_message(e)
        assert error_msg.find("has no gradient") > 0
def test_sample_reparameterized(self):
    mean = paddle.ones([2, 3])
    logstd = paddle.ones([2, 3])
    mean.stop_gradient = False
    logstd.stop_gradient = False

    norm_rep = Normal(mean=mean, logstd=logstd)
    samples = norm_rep.sample()
    mean_grads, logstd_grads = paddle.grad(outputs=[samples],
                                           inputs=[mean, logstd],
                                           allow_unused=True)
    self.assertTrue(mean_grads is not None)
    self.assertTrue(logstd_grads is not None)

    norm_no_rep = Normal(mean=mean, logstd=logstd, is_reparameterized=False)
    samples = norm_no_rep.sample()
    mean_grads, logstd_grads = paddle.grad(outputs=[samples],
                                           inputs=[mean, logstd],
                                           allow_unused=True)
    self.assertEqual(mean_grads, None)
    self.assertEqual(logstd_grads, None)
def grad_test():
    nonlocal v
    xs = self.gen_inputs(inputs)
    if v is not None:
        v = self.gen_inputs(v)
    outputs = func(*xs)
    if v is not None:
        inputs_grad = grad(outputs,
                           xs,
                           v,
                           create_graph=create_graph,
                           allow_unused=allow_unused)
    else:
        inputs_grad = grad(outputs,
                           xs,
                           create_graph=create_graph,
                           allow_unused=allow_unused)
    return outputs, inputs_grad
def predict_fn(data, labels):
    if isinstance(data, tuple):
        probs = self.paddle_model(*data)
    else:
        probs = self.paddle_model(data)

    labels_onehot = paddle.nn.functional.one_hot(
        paddle.to_tensor(labels), num_classes=probs.shape[1])
    target = paddle.sum(probs * labels_onehot, axis=1)

    gradients = paddle.grad(outputs=[target], inputs=[self._embedding])[0]
    return gradients.numpy(), probs.numpy(), self._embedding.numpy()
def test_checkout_grad(self):
    place = core.CUDAPlace(0)
    if core.is_float16_supported(place):
        with fluid.dygraph.guard():
            x_np = np.random.random((10, 10)).astype(self.dtype)
            x = paddle.to_tensor(x_np)
            x.stop_gradient = False
            y = fluid.layers.mean(x)
            dx = paddle.grad(y, x)[0].numpy()
            dx_expected = self.dtype(1.0 / np.prod(x_np.shape)) * np.ones(
                x_np.shape).astype(self.dtype)
            self.assertTrue(np.array_equal(dx, dx_expected))
def test_create_graph_false(self):
    def func(x):
        return paddle.sum(F.sigmoid(x))

    numerical_func_output = func(self.x).numpy()
    numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx,
                                           self.numerical_delta,
                                           self.np_dtype)

    self.x.stop_gradient = False
    func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx)
    assert np.allclose(func_output.numpy(), numerical_func_output, self.rtol,
                       self.atol)
    assert vhp[0].stop_gradient == True
    assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, self.atol)
    try:
        paddle.grad(vhp, self.x)
    except RuntimeError as e:
        error_msg = cpt.get_exception_message(e)
        assert error_msg.find("has no gradient") > 0
def r1_reg(d_out, x_in):
    # zero-centered gradient penalty for real images
    batch_size = x_in.shape[0]
    grad_dout = paddle.grad(outputs=d_out.sum(),
                            inputs=x_in,
                            create_graph=True,
                            retain_graph=True,
                            only_inputs=True)[0]
    grad_dout2 = grad_dout.pow(2)
    assert (grad_dout2.shape == x_in.shape)
    reg = 0.5 * paddle.reshape(grad_dout2, (batch_size, -1)).sum(1).mean(0)
    return reg
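# A minimal, hedged usage sketch (not from the original code) showing how r1_reg
# typically enters a discriminator update. The tiny convolutional "disc" and the
# penalty weight of 1.0 below are illustrative placeholders only.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

disc = nn.Sequential(nn.Conv2D(3, 8, 3, stride=2), nn.ReLU(),
                     nn.Flatten(), nn.Linear(8 * 31 * 31, 1))
real_images = paddle.randn([4, 3, 64, 64])
real_images.stop_gradient = False  # r1_reg differentiates w.r.t. the input images
d_out = disc(real_images)
# Non-saturating real loss plus the zero-centered R1 penalty computed above.
loss_d = F.softplus(-d_out).mean() + 1.0 * r1_reg(d_out, real_images)
loss_d.backward()  # create_graph=True inside r1_reg lets the penalty backpropagate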
def predict_fn(data, labels):
    data = paddle.to_tensor(data)
    data.stop_gradient = False
    out = self.paddle_model(data)
    out = paddle.nn.functional.softmax(out, axis=1)
    preds = paddle.argmax(out, axis=1)
    if labels is None:
        labels = preds.numpy()
    labels_onehot = paddle.nn.functional.one_hot(
        paddle.to_tensor(labels), num_classes=out.shape[1])
    target = paddle.sum(out * labels_onehot, axis=1)
    gradients = paddle.grad(outputs=[target], inputs=[data])[0]
    return gradients.numpy(), labels
def generate_gradients(self, targets, inputs):
    if not isinstance(targets, list):
        if len(self._ones_like_targets) == 0:
            ones_like_targets = paddle.ones_like(targets)
            self._ones_like_targets.append(ones_like_targets)
        else:
            ones_like_targets = self._ones_like_targets[0]
    else:
        ones_like_targets = None

    gradients = paddle.grad(outputs=targets,
                            inputs=inputs,
                            grad_outputs=ones_like_targets)
    return gradients
def check_resnet(self):
    data = np.random.rand(1, 3, 224, 224).astype(np.float32)
    data = paddle.to_tensor(data)
    data.stop_gradient = False
    out = self.model(data)
    preds = paddle.argmax(out, axis=1)
    label_onehot = paddle.nn.functional.one_hot(
        paddle.to_tensor(preds), num_classes=out.shape[1])
    target = paddle.sum(out * label_onehot, axis=1)

    g = paddle.grad(outputs=target, inputs=out)[0]
    g_numpy = g.numpy()
    self.assertEqual(list(g_numpy.shape), list(out.shape))
def test_create_graph_true(self):
    def func(x):
        return paddle.sum(F.sigmoid(x))

    numerical_hessian = _compute_numerical_hessian(func, self.x,
                                                   self.numerical_delta,
                                                   self.np_dtype)
    self.x.stop_gradient = False
    hessian = paddle.autograd.hessian(func, self.x, create_graph=True)
    assert hessian.stop_gradient == False
    assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol,
                       self.atol)
    triple_grad = paddle.grad(hessian, self.x)
    assert triple_grad is not None
def test_create_graph_true(self):
    def func(x):
        return paddle.matmul(x * x, self.weight)[:, 0:1]

    numerical_hessian = _compute_numerical_batch_hessian(
        func, self.x, self.numerical_delta, self.np_dtype)
    self.x.stop_gradient = False
    hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True)
    assert hessian.stop_gradient == False
    assert np.allclose(hessian.numpy(), numerical_hessian, self.rtol,
                       self.atol)
    triple_grad = paddle.grad(hessian, self.x)
    assert triple_grad is not None
def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
    paddle.set_device(device)

    t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)

    out = func(t) if use_func else paddle.nn.functional.relu(t)
    out.stop_gradient = False

    dx = paddle.grad(outputs=[out],
                     inputs=[t],
                     create_graph=True,
                     retain_graph=True)

    dx[0].backward()

    assert dx[0].grad is not None
    return dx[0].numpy(), dx[0].grad.numpy()
def grad(self,
         outputs,
         inputs,
         grad_outputs=None,
         no_grad_vars=None,
         retain_graph=None,
         create_graph=False,
         allow_unused=False):
    return paddle.grad(outputs=outputs,
                       inputs=inputs,
                       grad_outputs=grad_outputs,
                       no_grad_vars=no_grad_vars,
                       retain_graph=retain_graph,
                       create_graph=create_graph,
                       allow_unused=allow_unused)