def layerwise_relevance_zclip(self, out, use_bias=False, **kwargs):
    """Propagate relevance ``out`` to the logged input, split by output sign.

    Two gradient passes are made through ``self._forward``: one with the
    weights clipped to their positive part, applied where the logged output
    ``self._out`` is positive, and one with the weights clipped to their
    negative part, applied where it is negative.

    Parameters
    ----------
    out : NDArray
        Relevance arriving at this layer's output.
    use_bias : bool
        When true, the (sign-clipped) bias takes part in both passes;
        otherwise ``bias=None`` is passed to ``self._forward``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    NDArray
        ``a * (cplus - cminus)`` where ``a`` is the logged input.

    Raises
    ------
    RuntimeError
        If no input has been logged yet (``self._in is None``).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    a = self._in[0]
    z = self._out
    weight = self.weight.data(ctx=a.context)
    wplus = nd.maximum(0., weight)
    wminus = nd.minimum(0., weight)

    bplus = None
    bminus = None
    # `use_bias` is a boolean flag; the former `is not None` test was always
    # true for the default `False`, so the bias could never be switched off.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bplus = nd.maximum(0., bias)
        bminus = nd.minimum(0., bias)

    alpha = z > 0.
    beta = z < 0.

    a.attach_grad()
    with autograd.record():
        zplus = self._forward(data=a, weight=wplus, bias=bplus)
    # `(zplus == 0.)` adds 1 exactly where the denominator would be zero.
    cplus, = autograd.grad(zplus, a, head_grads=alpha * R / (zplus + (zplus == 0.)))

    with autograd.record():
        zminus = self._forward(data=a, weight=wminus, bias=bminus)
    cminus, = autograd.grad(zminus, a, head_grads=beta * R / (zminus + (zminus == 0.)))

    return a * (cplus - cminus)
def layerwise_relevance_zclip(self, out, use_bias=False, **kwargs):
    """Redistribute relevance ``out`` onto the logged input (sign-split rule).

    The positive part of the weights is used where the logged output
    ``self._out`` is positive, the negative part where it is negative; each
    contribution is obtained as a gradient of ``self._forward`` with respect
    to the logged input.

    Parameters
    ----------
    out : NDArray
        Relevance arriving at this layer's output.
    use_bias : bool
        Include the sign-clipped bias in the forward passes when true;
        pass ``bias=None`` otherwise.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    NDArray
        ``a * (cplus - cminus)`` with ``a`` the logged input.

    Raises
    ------
    RuntimeError
        If ``self._in`` is ``None`` (nothing has been logged).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    a = self._in[0]
    z = self._out
    weight = self.weight.data(ctx=a.context)
    wplus = nd.maximum(0., weight)
    wminus = nd.minimum(0., weight)

    bplus = None
    bminus = None
    # Fixed: `use_bias is not None` was true even for the default `False`.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bplus = nd.maximum(0., bias)
        bminus = nd.minimum(0., bias)

    alpha = z > 0.
    beta = z < 0.

    a.attach_grad()
    with autograd.record():
        zplus = self._forward(data=a, weight=wplus, bias=bplus)
    # Zero denominators are replaced by one via the `== 0.` mask.
    cplus, = autograd.grad(zplus, a, head_grads=alpha*R/(zplus + (zplus == 0.)))

    with autograd.record():
        zminus = self._forward(data=a, weight=wminus, bias=bminus)
    cminus, = autograd.grad(zminus, a, head_grads=beta*R/(zminus + (zminus == 0.)))

    return a*(cplus - cminus)
def test_dense_backward_no_flatten():
    """Compare autograd first/second-order gradients of ``Dense(flatten=False)``
    with closed-form matrix products, for every array from ``NDArrayGenerator(5, 3)``.
    """
    print("2nd order gradient for Fully Connected, flatten=False")
    for x in NDArrayGenerator(5, 3):
        hidden = random.randrange(1, 4)
        net = gluon.nn.Sequential()
        with net.name_scope():
            net.add(gluon.nn.Dense(hidden, flatten=False))
        net.initialize(mxnet.initializer.Constant(.5))
        x.attach_grad()

        with autograd.record():
            y = net.forward(x)
            o_y = arange_shape_like(y)  # head gradient of y
            params = [p.data() for p in net.collect_params().values()]
            w = params[0]
            b = params[1]
            print("Checking y ({}) = x({}) * w^T({}) + b({})".format(
                y.shape, x.shape, w.shape, b.shape))

            # d/dw of (dy/dx weighted by o_x_grad)
            x_grad = autograd.grad(heads=y, variables=x, head_grads=o_y,
                                   create_graph=True, retain_graph=True)[0]
            o_x_grad = arange_shape_like(x_grad)
            w_grad_grad = autograd.grad(heads=x_grad, variables=w,
                                        head_grads=o_x_grad,
                                        create_graph=False)[0]

            # d/dx of (dy/dw weighted by o_w_grad)
            w_grad = autograd.grad(heads=y, variables=w, head_grads=o_y,
                                   create_graph=True, retain_graph=True)[0]
            o_w_grad = arange_shape_like(w_grad)
            x_grad_grad = autograd.grad(heads=w_grad, variables=x,
                                        head_grads=o_w_grad,
                                        create_graph=False)[0]

        # Expected results, computed on 2-D views of the operands.
        o_y = flatten2d_left(o_y)
        x = flatten2d_left(x)
        o_x_grad = flatten2d_left(o_x_grad)
        o_w_grad = flatten2d_left(o_w_grad)
        w_grad_e = nd.dot(o_y, x, transpose_a=True)
        w_grad_grad_e = nd.dot(o_y, o_x_grad, transpose_a=True)
        x_grad_e = nd.dot(o_y, w)
        x_grad_grad_e = nd.dot(o_y, o_w_grad)

        def matches(actual, expected):
            return same(flatten2d_left(actual), flatten2d_left(expected))

        # All comparisons are evaluated before any of them is asserted.
        checks = {
            'w_grad': matches(w_grad, w_grad_e),
            'w_grad_grad': matches(w_grad_grad, w_grad_grad_e),
            'x_grad': matches(x_grad, x_grad_e),
            'x_grad_grad': matches(x_grad_grad, x_grad_grad_e),
        }
        for key in ('x_grad', 'w_grad', 'x_grad_grad', 'w_grad_grad'):
            ok_(checks[key])
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the quantized dense op ``y = x . w^T + b``.

    Input, weight and bias are quantized with ``self.int_quantize`` /
    ``self.int_quantize_double``, the forward product is replayed under
    autograd, and the resulting gradients are divided by ``2**shift`` before
    being written to ``in_grad``.

    Each gradient slot is written with its own request mode ``req[i]``
    (previously ``req[0]`` was used for all three slots).
    """
    dy = out_grad[0]
    x, w, b = in_data[0], in_data[1], in_data[2]
    y = out_data[0]

    int_x, x_shift_bit = self.int_quantize(x)
    int_w, int_b, w_shift_bit = self.int_quantize_double(w, b)

    for arr in (int_x, int_w, int_b):
        arr.attach_grad()
    with autograd.record():
        y[:] = mx.nd.add(mx.nd.dot(int_x, int_w.T), int_b)
    dx, dw, db = autograd.grad(y, [int_x, int_w, int_b], dy, retain_graph=True)

    # Weight and bias share the shift returned by int_quantize_double.
    self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
    self.assign(in_grad[1], req[1], dw / (2**w_shift_bit))
    self.assign(in_grad[2], req[2], db / (2**w_shift_bit))
def get_gradient(crit, real, fake, epsilon):
    """Return the gradient of the critic's scores w.r.t. a real/fake mixture.

    The mixture is ``epsilon * real + (1 - epsilon) * fake``; the critic is
    evaluated on it under ``autograd.record()``.
    """
    blended = epsilon * real + (1 - epsilon) * fake
    blended.attach_grad()
    with autograd.record():
        scores = crit(blended)
    grads = autograd.grad(scores, [blended], retain_graph=True)
    return grads[0]
def check_second_order_unary(x, op, grad_grad_op, rtol=None, atol=None):
    """Assert that autograd's second-order gradient of ``op`` matches ``grad_grad_op``.

    Random head gradients are used for both differentiation steps, and the
    expected value is ``grad_grad_op(x)`` scaled by both of them.
    """
    x = nd.array(x)
    analytic = grad_grad_op(x)
    x.attach_grad()

    # Head gradients for the first and the second backward pass.
    y_grad = nd.random.normal(shape=x.shape)
    head_grad_grads = nd.random.normal(shape=x.shape)

    with autograd.record():
        y = op(x)
        first = autograd.grad(heads=y, variables=x, head_grads=y_grad,
                              create_graph=True, retain_graph=True)
        x_grad = first[0]
    x_grad.backward(head_grad_grads)

    expected_grad_grad = (analytic.asnumpy() * head_grad_grads.asnumpy()
                          * y_grad.asnumpy())

    assert_almost_equal(expected_grad_grad, x.grad.asnumpy(),
                        rtol=rtol, atol=atol)
def jacobian_autograd(x, y):
    """Stack the gradients of every column ``y[:, i]`` w.r.t. ``x`` along axis 1.

    Each gradient is taken with ``create_graph=True``, gets a new axis 1 and
    the pieces are concatenated along that axis.
    """
    columns = []
    for col in range(y.shape[1]):
        with autograd.record():
            y_col = y[:, col]
        d_col = autograd.grad(y_col, [x], create_graph=True)[0]
        columns.append(nd.expand_dims(d_col, 1))
    return nd.concatenate(columns, 1)
def get_gradient(crit, real, fake, epsilon):
    """Return a differentiable gradient of the critic w.r.t. a real/fake mixture.

    No recording scope is opened here.
    NOTE(review): presumably the caller is already inside
    ``autograd.record()`` -- confirm.
    """
    blended = epsilon * real + (1 - epsilon) * fake
    blended.attach_grad()
    scores = crit(blended)
    grads = autograd.grad(
        scores,
        [blended],
        retain_graph=True,
        create_graph=True,
        head_grads=nd.ones_like(scores),
    )
    return grads[0]
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the quantized BatchNorm op.

    Gamma/beta are folded with the batch mean/variance of ``x``, scaled by
    ``2**last_shift_bit`` (``in_data[8]``) and quantized with
    ``self.int_quantize_double``. A BatchNorm with zero mean / unit variance
    statistics is then replayed under autograd and its gradients are
    rescaled by the shift bits.

    Writes:
      * ``in_grad[0..2]`` -- gradients, each with its own ``req[i]``
        (previously ``req[0]`` was used for all of them);
      * ``in_data[5]``, ``in_data[6]``, ``in_data[7]`` -- quantized gamma,
        quantized beta and the output shift bit.
    """
    x = in_data[0]
    gamma = in_data[1]
    beta = in_data[2]
    last_shift_bit = in_data[8]
    dy = out_grad[0]

    # Batch statistics over every axis except the channel axis (axis 1).
    mean = nd.mean(x, axis=(0, 2, 3))
    var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))

    quan_gamma = gamma / (nd.sqrt(var + self.eps))
    quan_beta = beta - mean * gamma / nd.sqrt(var + self.eps)
    quan_gamma = quan_gamma * (2**last_shift_bit)
    quan_gamma, quan_beta, gamma_shift_bit = self.int_quantize_double(
        quan_gamma, quan_beta)

    for arr in (x, quan_gamma, quan_beta):
        arr.attach_grad()
    with autograd.record():
        y = nd.BatchNorm(x,
                         gamma=quan_gamma,
                         beta=quan_beta,
                         moving_mean=nd.zeros(shape=mean.shape),
                         moving_var=nd.ones(shape=var.shape),
                         eps=self.eps,
                         momentum=self.momentum,
                         fix_gamma=False,
                         name=self.name)
        # NOTE(review): kept inside the recording scope because the gradient
        # below is taken w.r.t. the quantized `y` -- confirm against the
        # original layout.
        y, y_shift_bit = self.int_quantize(y)
    dx, dgamma, dbeta = autograd.grad(y, [x, quan_gamma, quan_beta], dy,
                                      retain_graph=True)

    self.assign(in_grad[0], req[0], dx / 2**y_shift_bit)
    self.assign(in_grad[1], req[1],
                dgamma / 2**(gamma_shift_bit + last_shift_bit))
    self.assign(in_grad[2], req[2], dbeta / 2**gamma_shift_bit)
    # State written back into the op's inputs; still uses req[0] as before.
    self.assign(in_data[5], req[0], quan_gamma)
    self.assign(in_data[6], req[0], quan_beta)
    self.assign(in_data[7], req[0], y_shift_bit)
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the quantized convolution op.

    Input, weight and (unless ``self.no_bias``) bias are quantized with
    ``self.int_quantize``; the convolution is replayed under autograd and
    each gradient is divided by ``2**shift`` of its own operand.

    Each gradient slot is written with its own request mode ``req[i]``
    (previously ``req[0]`` was used for every slot).
    """
    dy = out_grad[0]
    y = out_data[0]
    x, x_shift_bit = self.int_quantize(in_data[0])
    w, w_shift_bit = self.int_quantize(in_data[1])

    # Arguments shared by both the bias and the no-bias convolution.
    conv_kwargs = dict(kernel=self.kernel,
                       num_filter=self.num_filter,
                       stride=self.stride,
                       pad=self.pad,
                       no_bias=self.no_bias,
                       workspace=self.workspace,
                       name=self.name)

    if self.no_bias:
        x.attach_grad()
        w.attach_grad()
        with autograd.record():
            y[:] = nd.Convolution(data=x, weight=w, **conv_kwargs)
        dx, dw = autograd.grad(y, [x, w], dy, retain_graph=True)
    else:
        b, b_shift_bit = self.int_quantize(in_data[2])
        x.attach_grad()
        w.attach_grad()
        b.attach_grad()
        with autograd.record():
            y[:] = nd.Convolution(data=x, weight=w, bias=b, **conv_kwargs)
        dx, dw, db = autograd.grad(y, [x, w, b], dy, retain_graph=True)

    self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
    self.assign(in_grad[1], req[1], dw / (2**w_shift_bit))
    if not self.no_bias:
        self.assign(in_grad[2], req[2], db / (2**b_shift_bit))
def check_second_order_unary(x, op, grad_grad_op):
    """Assert that differentiating ``op`` twice via autograd gives ``grad_grad_op(x)``.

    Default head gradients are used for both differentiation steps.
    """
    x = nd.array(x)
    analytic = grad_grad_op(x)
    x.attach_grad()
    with autograd.record():
        y = op(x)
        first_order = autograd.grad(y, x, create_graph=True,
                                    retain_graph=True)
        y_grad = first_order[0]
    y_grad.backward()
    assert_almost_equal(analytic.asnumpy(), x.grad.asnumpy())
def compute_gradients(self,
                      elbo: nd.NDArray,
                      data_batch: mx.io.DataBatch = None,
                      log_q_sum: nd.NDArray = None,
                      mode: str = 'train') -> None:
    """Compute gradients and assign them to variational parameters.

    Args:
      elbo: evidence lower bound that we maximize
      data_batch: minibatch of data with data indices as labels
      log_q_sum: sum of log probs of samples from variational distributions q.
      mode: point-mass parameter gradients are only computed when this is
        'train'.
    """
    cfg = self.gradient_config
    estimator = cfg['estimator']

    if estimator == 'pathwise':
        # Every child that declares `is_reparam` must be reparameterizable.
        for block in self.sequential._children:
            for child_block in block._children:
                if hasattr(child_block, 'is_reparam'):
                    assert child_block.is_reparam == True  # noqa: E712

    if len(self._point_mass_params) > 0 and mode == 'train':
        variables = [p.data() for p in self._point_mass_params]
        assert elbo.shape[-1] == cfg['batch_size']
        loss = nd.mean(-elbo, -1)
        point_mass_grads = autograd.grad(loss, variables, retain_graph=True)
        _assign_grads(self._point_mass_params, point_mass_grads)

    if estimator == 'pathwise':
        (-elbo).backward()
        return
    if estimator != 'score_function':
        return

    variables = [param.repeated for param in self._score_params]
    score_functions = autograd.grad(log_q_sum, variables)
    mx.autograd.set_recording(False)
    score_grads = []
    for param, score_function in zip(self._score_params, score_functions):
        grad = _leave_one_out_gradient_estimator(score_function, -elbo)
        if 'emb' in param.name:
            # Embedding gradients are scattered to their rows with a dense
            # one-hot product (a sparse variant was reported as not faster).
            one_hot = nd.one_hot(data_batch[1], depth=self.n_data)
            grad = nd.dot(one_hot, grad, transpose_a=True)
        score_grads.append(grad)
    _assign_grads(self._score_params, score_grads)
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the shift-scaled convolution op.

    The output gradient is multiplied by ``2**out_data[1]`` and the input is
    divided by ``2**in_data[3]`` before the convolution is replayed under
    autograd. Gradients are written to ``in_grad`` with the matching
    ``req[i]``.

    The bias branch previously referenced an undefined ``b_int`` and
    differentiated w.r.t. ``b`` without attaching a gradient to it; it now
    uses ``b = in_data[2]`` consistently.
    """
    dy = out_grad[0] * 2**out_data[1]
    x = in_data[0] / 2**in_data[3]
    w = in_data[1]
    x.attach_grad()
    w.attach_grad()

    # Arguments shared by both the bias and the no-bias convolution.
    conv_kwargs = dict(kernel=self.kernel,
                       num_filter=self.num_filter,
                       stride=self.stride,
                       pad=self.pad,
                       no_bias=self.no_bias,
                       workspace=self.workspace,
                       name=self.name)

    if self.no_bias:
        with autograd.record():
            y = nd.Convolution(data=x, weight=w, **conv_kwargs)
        dx, dw = autograd.grad(y, [x, w], dy, retain_graph=True)
    else:
        b = in_data[2]
        b.attach_grad()
        with autograd.record():
            y = nd.Convolution(data=x, weight=w, bias=b, **conv_kwargs)
        dx, dw, db = autograd.grad(y, [x, w, b], dy, retain_graph=True)

    self.assign(in_grad[0], req[0], dx)
    self.assign(in_grad[1], req[1], dw)
    if not self.no_bias:
        self.assign(in_grad[2], req[2], db)
def test_autograd():
    """Exploratory check: differentiate a float loss using a head gradient
    derived from the loss on 8-bit-scaled (x256, floored) operands.

    Nothing is asserted; the gradients are computed and discarded.
    """
    a = nd.ones((1, 1)) * 0.05
    b = nd.ones((1, 1))
    q_a = (a * 256).floor()
    q_b = (b * 256).floor()

    a.attach_grad()
    b.attach_grad()
    c = a * b
    c.attach_grad()
    with autograd.record():
        loss = (1 - c)**2 / 2

    # Quantized counterpart of the loss; only used as head gradient.
    q_c = q_a * q_b / 256 / 256
    q_loss = (1 - q_c)**2 / 2

    da, db, dc = autograd.grad(loss, [a, b, c], 1 - q_loss,
                               retain_graph=True)
def layerwise_relevance_wsquare(self, out, use_bias=False, **kwargs):
    """Propagate relevance ``out`` using squared weights and an all-ones input.

    Parameters
    ----------
    out : NDArray
        Relevance arriving at this layer's output.
    use_bias : bool
        When true, the squared bias takes part in the forward pass;
        otherwise ``bias=None`` is passed to ``self._forward``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    NDArray
        Gradient of the squared-weight forward pass w.r.t. the all-ones
        input, with head gradient ``out / z``.

    Raises
    ------
    RuntimeError
        If no input has been logged yet (``self._in is None``).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    # Only the shape/context of the logged input is used.
    a = self._in[0].ones_like()
    weight = self.weight.data(ctx=a.context)
    wsquare = weight**2

    bsquare = None
    # `use_bias` is a boolean flag; the former `is not None` test was always
    # true for the default `False`.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bsquare = bias**2

    a.attach_grad()
    with autograd.record():
        z = self._forward(data=a, weight=wsquare, bias=bsquare)
    # `(z == 0.)` adds 1 exactly where the denominator would be zero.
    c, = autograd.grad(z, a, head_grads=R / (z + (z == 0.)))
    return c
def layerwise_relevance_zplus(self, out, use_bias=False, **kwargs):
    """Propagate relevance ``out`` to the logged input using positive weights only.

    Parameters
    ----------
    out : NDArray
        Relevance arriving at this layer's output.
    use_bias : bool
        When true, the positive part of the bias takes part in the forward
        pass; otherwise ``bias=None`` is passed to ``self._forward``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    NDArray
        ``a * c`` where ``a`` is the logged input and ``c`` the gradient of
        the positive-weight forward pass with head gradient ``out / z``.

    Raises
    ------
    RuntimeError
        If no input has been logged yet (``self._in is None``).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    a = self._in[0]
    weight = self.weight.data(ctx=a.context)
    wplus = nd.maximum(0., weight)

    bplus = None
    # `use_bias` is a boolean flag; the former `is not None` test was always
    # true for the default `False`.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bplus = nd.maximum(0., bias)

    a.attach_grad()
    with autograd.record():
        z = self._forward(data=a, weight=wplus, bias=bplus)
    # `(z == 0.)` adds 1 exactly where the denominator would be zero.
    c, = autograd.grad(z, a, head_grads=R/(z + (z == 0.)))
    return a*c
def layerwise_relevance_zplus(self, out, use_bias=False, **kwargs):
    """Redistribute relevance ``out`` onto the logged input (positive-weight rule).

    Parameters
    ----------
    out : NDArray
        Relevance arriving at this layer's output.
    use_bias : bool
        Include the positive part of the bias when true; pass ``bias=None``
        to ``self._forward`` otherwise.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    NDArray
        The logged input multiplied by the gradient of the positive-weight
        forward pass taken with head gradient ``out / z``.

    Raises
    ------
    RuntimeError
        If ``self._in`` is ``None`` (nothing has been logged).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    a = self._in[0]
    weight = self.weight.data(ctx=a.context)
    wplus = nd.maximum(0., weight)

    bplus = None
    # Fixed: `use_bias is not None` was true even for the default `False`.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bplus = nd.maximum(0., bias)

    a.attach_grad()
    with autograd.record():
        z = self._forward(data=a, weight=wplus, bias=bplus)
    # Zero denominators are replaced by one via the `== 0.` mask.
    c, = autograd.grad(z, a, head_grads=R / (z + (z == 0.)))
    return a * c
def layerwise_relevance_wsquare(self, out, use_bias=False, **kwargs):
    """Redistribute relevance ``out`` with squared weights on an all-ones input.

    Parameters
    ----------
    out : NDArray
        Relevance arriving at this layer's output.
    use_bias : bool
        Include the squared bias when true; pass ``bias=None`` to
        ``self._forward`` otherwise.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    NDArray
        Gradient of the squared-weight forward pass w.r.t. the all-ones
        input, taken with head gradient ``out / z``.

    Raises
    ------
    RuntimeError
        If ``self._in`` is ``None`` (nothing has been logged).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    # Only the shape/context of the logged input is used.
    a = self._in[0].ones_like()
    weight = self.weight.data(ctx=a.context)
    wsquare = weight**2

    bsquare = None
    # Fixed: `use_bias is not None` was true even for the default `False`.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bsquare = bias**2

    a.attach_grad()
    with autograd.record():
        z = self._forward(data=a, weight=wsquare, bias=bsquare)
    # Zero denominators are replaced by one via the `== 0.` mask.
    c, = autograd.grad(z, a, head_grads=R/(z + (z == 0.)))
    return c
def get_crit_loss(gen, crit, real, batch_size, z_dim, ctx):
    """Return the critic loss, including a gradient penalty on real/fake mixtures.

    Fake images come from ``gen`` on random noise and are detached. The
    penalty is built from the gradient of the critic's scores w.r.t. a random
    per-sample mixture of ``real`` and the fake images.

    No recording scope is opened here.
    NOTE(review): presumably the caller is already inside
    ``autograd.record()`` -- confirm.
    """
    noise = nd.random.randn(batch_size, z_dim, 1, 1, ctx=ctx)
    fake = gen(noise).detach()

    n_real = real.shape[0]
    y_pred_fake = crit(fake).reshape(n_real, -1)
    y_pred_real = crit(real).reshape(n_real, -1)

    # One mixing coefficient per sample, broadcast over the trailing axes.
    epsilon = nd.array(np.random.rand(len(real), 1, 1, 1), ctx=ctx)
    blended = epsilon * real + (1 - epsilon) * fake
    blended.attach_grad()
    blended_scores = crit(blended)
    grad = autograd.grad(
        blended_scores,
        [blended],
        retain_graph=True,
        create_graph=True,
        head_grads=nd.ones_like(blended_scores),
    )[0]

    gp = gradient_penalty(grad)
    return crit_loss_fn(y_pred_fake, y_pred_real, gp, C_LAMBDA)
def calc_gradient_penalty(netD, real_data, fake_data, LAMBDA, ctx):
    """Return the gradient penalty of ``netD`` on random real/fake interpolates.

    Parameters
    ----------
    netD : callable
        Discriminator applied to the interpolated batch.
    real_data, fake_data : NDArray
        Batches to interpolate between; both are moved to ``ctx``.
    LAMBDA : float
        Penalty weight.
    ctx : Context
        Device on which every intermediate array is created.

    Returns
    -------
    NDArray
        ``mean((||grad||_2 - 1)**2) * LAMBDA``, with the norm taken per sample
        over the flattened gradient.
    """
    real_data = real_data.as_in_context(ctx)
    fake_data = fake_data.as_in_context(ctx)
    b_s = real_data.shape[0]

    # One mixing coefficient per sample, expanded to the full data shape.
    alpha = nd.random.uniform(0, 1, shape=(b_s, 1, 1, 1), ctx=ctx)
    alpha = alpha.broadcast_to(real_data.shape)

    interpolates = alpha * real_data + ((1 - alpha) * fake_data)
    # Fresh copy on `ctx`; without an explicit ctx, nd.array places the copy
    # on the default context, which breaks on a non-default device.
    interpolates = nd.array(interpolates, ctx=ctx)
    interpolates.attach_grad()

    disc_interpolates = netD(interpolates)
    gradients = autograd.grad(heads=disc_interpolates,
                              variables=interpolates,
                              head_grads=nd.ones(
                                  shape=disc_interpolates.shape, ctx=ctx),
                              create_graph=True,
                              retain_graph=True)[0]
    gradients = gradients.reshape((gradients.shape[0], -1))
    gradient_penalty = (
        (gradients.norm(2, axis=1, keepdims=True) - 1)**2).mean() * LAMBDA
    return gradient_penalty
def wasser_penalty(dis_model, real, fake, penalty_rate, ctx=None):
    """Return the gradient penalty of ``dis_model`` on real/fake interpolates.

    Parameters
    ----------
    dis_model : callable
        Discriminator applied to the interpolated batch.
    real, fake : NDArray
        Batches to interpolate between (both are detached first).
    penalty_rate : float
        Penalty weight.
    ctx : Context, optional
        When given, the mixing coefficients and the result are placed on it.

    Returns
    -------
    NDArray
        ``mean((||grad||_2 - 1)**2) * penalty_rate``, with the norm taken per
        sample over the flattened gradient.
    """
    from mxnet import autograd

    with autograd.pause():
        alpha = mx.nd.random_uniform(shape=real.shape)
        if ctx:
            # as_in_context returns the moved array; the result was
            # previously discarded, leaving alpha on the default context.
            alpha = alpha.as_in_context(ctx)
        interpolates = alpha * real.detach() + ((1 - alpha) * fake.detach())
        interpolates = interpolates.detach()
        interpolates.attach_grad()

    z = dis_model(interpolates)
    gradients = autograd.grad(heads=z,
                              variables=interpolates,
                              head_grads=mx.nd.ones(shape=z.shape, ctx=ctx),
                              retain_graph=True,
                              create_graph=True)[0]
    gradients = gradients.reshape((gradients.shape[0], -1))
    gradients_penalty = (
        (gradients.norm(2, axis=1) - 1)**2).mean() * penalty_rate
    # No attach_grad() on the penalty: on MXNet 1.x that marks the array as a
    # fresh variable and drops its history, so backward() would no longer
    # reach the discriminator's parameters.
    if ctx:
        gradients_penalty = gradients_penalty.as_in_context(ctx)
    return gradients_penalty
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the dense op ``y = x . w^T + b`` with shift tracking.

    The forward product is replayed under autograd and the gradients are
    written to ``in_grad``, each with its own ``req[i]`` (previously
    ``req[0]`` was used for all three slots). Afterwards the weight, the bias
    and the shift bit of the quantized output are stored in
    ``in_data[3]``, ``in_data[4]`` and ``in_data[5]``.
    """
    dy = out_grad[0]
    x, w, b = in_data[0], in_data[1], in_data[2]
    y = out_data[0]

    for arr in (x, w, b):
        arr.attach_grad()
    with autograd.record():
        y[:] = mx.nd.add(mx.nd.dot(x, w.T), b)
    dx, dw, db = autograd.grad(y, [x, w, b], dy, retain_graph=True)

    # Only the shift bit of the quantized output is used below.
    y, y_shift_bit = self.int_quantize(y)

    self.assign(in_grad[0], req[0], dx)
    self.assign(in_grad[1], req[1], dw)
    self.assign(in_grad[2], req[2], db)
    # State written back into the op's inputs; still uses req[0] as before.
    self.assign(in_data[3], req[0], w)
    self.assign(in_data[4], req[0], b)
    self.assign(in_data[5], req[0], y_shift_bit)
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the BatchNorm op.

    ``nd.BatchNorm`` is replayed under autograd with the batch mean/variance
    of ``x`` passed as moving statistics, and the gradients w.r.t. input,
    gamma and beta are written to ``in_grad``, each with its own ``req[i]``
    (previously ``req[0]`` was used for all three slots).
    """
    x = in_data[0]
    gamma = in_data[1]
    beta = in_data[2]
    dy = out_grad[0]

    # Batch statistics over every axis except the channel axis (axis 1).
    mean = nd.mean(x, axis=(0, 2, 3))
    var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))

    for arr in (x, gamma, beta):
        arr.attach_grad()
    with autograd.record():
        y = nd.BatchNorm(x,
                         gamma=gamma,
                         beta=beta,
                         moving_mean=mean,
                         moving_var=var,
                         eps=self.eps,
                         momentum=self.momentum,
                         fix_gamma=self.fix_gamma,
                         name=self.name)
    dx, dgamma, dbeta = autograd.grad(y, [x, gamma, beta], dy,
                                      retain_graph=True)

    self.assign(in_grad[0], req[0], dx)
    self.assign(in_grad[1], req[1], dgamma)
    self.assign(in_grad[2], req[2], dbeta)
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the quantized convolution op that also stores state.

    The convolution is replayed under autograd; gradients are rescaled by
    shift bits and written to ``in_grad`` with the matching ``req[i]``
    (previously ``req[0]`` was used for every slot). The quantized weight,
    the quantized bias (bias branch only) and the output shift bit are
    written back into ``in_data[3]``, ``in_data[4]`` and ``in_data[5]``.

    NOTE(review): the bias branch used to reference undefined names
    (``quan_b``, ``y_shift_bit``) and wrote to ``in_data[2..5]``. It now
    follows the slot layout of the no-bias branch (3 = quantized weight,
    4 = quantized bias, 5 = shift bit) -- confirm this is the intended layout.
    """
    x = in_data[0]
    w = in_data[1]
    x_shift_bit = in_data[5]
    dy = out_grad[0]
    quan_w, w_shift_bit = self.int_quantize(w)
    y = out_data[0]

    # Arguments shared by both the bias and the no-bias convolution.
    conv_kwargs = dict(kernel=self.kernel,
                       num_filter=self.num_filter,
                       stride=self.stride,
                       pad=self.pad,
                       no_bias=self.no_bias,
                       workspace=self.workspace,
                       name=self.name)

    if self.no_bias:
        x.attach_grad()
        quan_w.attach_grad()
        with autograd.record():
            y[:] = nd.Convolution(data=x, weight=quan_w, **conv_kwargs)
        dx, dw = autograd.grad(y, [x, quan_w], dy, retain_graph=True)
        y, y_shift_bit = self.int_quantize(y)

        self.assign(in_grad[0], req[0], dx / (2**y_shift_bit))
        self.assign(in_grad[1], req[1], dw / (2**w_shift_bit))
        # State written back into the op's inputs; still uses req[0].
        self.assign(in_data[3], req[0], quan_w)
        self.assign(in_data[5], req[0], y_shift_bit)
    else:
        quan_b, b_shift_bit = self.int_quantize(in_data[2])
        x.attach_grad()
        w.attach_grad()
        quan_b.attach_grad()
        with autograd.record():
            # NOTE(review): as before, this branch convolves with the
            # unquantized `w` while scaling dw by w_shift_bit -- confirm.
            y[:] = nd.Convolution(data=x, weight=w, bias=quan_b,
                                  **conv_kwargs)
        dx, dw, db = autograd.grad(y, [x, w, quan_b], dy, retain_graph=True)
        y, y_shift_bit = self.int_quantize(y)

        self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
        self.assign(in_grad[1], req[1], dw / (2**w_shift_bit))
        self.assign(in_grad[2], req[2], db / (2**b_shift_bit))
        # State written back into the op's inputs; still uses req[0].
        self.assign(in_data[3], req[0], quan_w)
        self.assign(in_data[4], req[0], quan_b)
        self.assign(in_data[5], req[0], y_shift_bit)
def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None):
    """Assert n-th order autograd gradients of ``op`` against expected ones.

    Several orders can be verified in one call by passing a list of
    functions, each computing the gradient of one order, together with the
    matching list of orders.

    Note
    ----
    1. ``orders`` must be strictly increasing.
    2. ``grad_ops[i]`` must compute the gradient of order ``orders[i]``,
       e.g. ``grad_ops = [grad_op, grad_grad_grad_op]`` goes with
       ``orders = [1, 3]``.

    Parameters
    ----------
    x : mxnet.NDArray
        Input Array.
    op : Callable
        Operation to perform on Input Array.
    grad_ops : Callable or List of Callable
        Function(s) computing the expected gradient of the given order(s).
    orders : int or List of int
        Order/s to assert expected and computed gradients.
    rtol, atol : float, optional
        Tolerances forwarded to ``assert_almost_equal``.

    Returns
    -------
    None
    """
    if isinstance(orders, int):
        orders = [orders]
        grad_ops = [grad_ops]

    assert all(lo < hi for lo, hi in zip(orders, orders[1:])), \
        "orders should be monotonically increasing"
    assert len(set(orders)) == len(orders), \
        "orders should have unique elements"
    highest_order = max(orders)

    x = nd.array(x)
    x.attach_grad()

    expected_grads = [grad_op(x) for grad_op in grad_ops]
    computed_grads = []
    head_grads = []

    # Differentiate repeatedly, keeping the result of every requested order
    # and the random head gradient of every step.
    with autograd.record():
        y = op(x)
        for current_order in range(1, highest_order + 1):
            head_grad = nd.random.normal(shape=x.shape)
            y = autograd.grad(heads=y, variables=x, head_grads=head_grad,
                              create_graph=True, retain_graph=True)[0]
            if current_order in orders:
                computed_grads.append(y)
            head_grads.append(head_grad)

    # The expected gradient of order k is scaled by the first k head grads.
    for order, grad, computed_grad in zip(orders, expected_grads,
                                          computed_grads):
        expected_grad = grad.asnumpy()
        for head_grad in head_grads[:order]:
            expected_grad *= head_grad.asnumpy()

        assert_almost_equal(expected_grad, computed_grad.asnumpy(),
                            rtol=rtol, atol=atol)
def get_deriv_autograd(input, act):
    """Return the autograd derivative of ``act(input)`` w.r.t. ``input``.

    The gradient is taken with ``create_graph=True``.
    """
    input.attach_grad()
    with autograd.record():
        activated = act(input)
    derivative = autograd.grad(activated, [input], create_graph=True)
    return derivative[0]
def sampling_decoder(
    self,
    F,
    static_feat: Tensor,
    past_target: Tensor,
    time_feat: Tensor,
    scale: Tensor,
    begin_states: List,
) -> Tensor:
    """
    Draw sample paths by unrolling the LSTM one step at a time from the
    given input and state, while dumping gradients of the distribution
    parameters w.r.t. ``time_feat`` to ``gradients.npy``.

    The file is reopened in ``'wb'`` mode at every prediction step, so after
    the call it holds the arrays of the last step only (mu, nu, sigma order).

    Parameters
    ----------
    static_feat : Tensor
        static features. Shape: (batch_size, num_static_features).
    past_target : Tensor
        target history. Shape: (batch_size, history_length).
    time_feat : Tensor
        time features. Shape: (batch_size, prediction_length, num_time_features).
    scale : Tensor
        tensor containing the scale of each element in the batch. Shape: (batch_size, 1, 1).
    begin_states : List
        list of initial states for the LSTM layers.
        the shape of each tensor of the list should be (batch_size, num_cells)

    Returns
    --------
    Tensor
        A tensor containing sampled paths.
        Shape: (batch_size, num_sample_paths, prediction_length).
    """
    time_feat.attach_grad()
    past_target.attach_grad()

    with autograd.record():
        # Every tensor is repeated num_parallel_samples times along the batch
        # axis so that all sample paths are drawn in parallel.
        n_paths = self.num_parallel_samples
        repeated_past_target = past_target.repeat(repeats=n_paths, axis=0)
        repeated_time_feat = time_feat.repeat(repeats=n_paths, axis=0)
        repeated_static_feat = static_feat.repeat(
            repeats=n_paths, axis=0).expand_dims(axis=1)
        repeated_scale = scale.repeat(repeats=n_paths, axis=0)
        repeated_states = [
            state.repeat(repeats=n_paths, axis=0) for state in begin_states
        ]

        future_samples = []

        # One new sample per future time step; the state is carried forward.
        for k in range(self.prediction_length):
            # (batch_size * num_samples, 1, *target_shape, num_lags)
            lags = self.get_lagged_subsequences(
                F=F,
                sequence=repeated_past_target,
                sequence_length=self.history_length + k,
                indices=self.shifted_lags,
                subsequences_length=1,
            )

            # (batch_size * num_samples, 1, *target_shape, num_lags)
            lags_scaled = F.broadcast_div(
                lags, repeated_scale.expand_dims(axis=-1))

            # (batch_size * num_samples, 1, prod(target_shape) * num_lags)
            flat_lag_dim = prod(self.target_shape) * len(self.lags_seq)
            input_lags = F.reshape(
                data=lags_scaled,
                shape=(-1, 1, flat_lag_dim),
            )

            # (batch_size * num_samples, 1,
            #  prod(target_shape) * num_lags + num_time_features + num_static_features)
            decoder_input = F.concat(
                input_lags,
                repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
                repeated_static_feat,
                dim=-1,
            )

            # output shape: (batch_size * num_samples, 1, num_cells)
            # state shape: (batch_size * num_samples, num_cells)
            rnn_outputs, repeated_states = self.rnn.unroll(
                inputs=decoder_input,
                length=1,
                begin_state=repeated_states,
                layout="NTC",
                merge_outputs=True,
            )

            distr_args = self.proj_distr_args(rnn_outputs)

            # Predictive distribution of the target at this step.
            distr = self.distr_output.distribution(distr_args,
                                                   scale=repeated_scale)

            # The base distribution is read for mu, sigma and nu
            # (a Gaussian would only have mu and a standard deviation).
            gradient_mu_feat = autograd.grad(distr.base_distribution.mu,
                                             [time_feat],
                                             create_graph=True)
            gradient_sigma_feat = autograd.grad(
                distr.base_distribution.sigma, [time_feat],
                create_graph=True)
            gradient_nu_feat = autograd.grad(distr.base_distribution.nu,
                                             [time_feat],
                                             create_graph=True)

            # (batch_size * num_samples, 1, *target_shape)
            new_samples = distr.sample(dtype=self.dtype)

            with open('gradients.npy', 'wb') as f:
                np.save(f, gradient_mu_feat[0].asnumpy())
                np.save(f, gradient_nu_feat[0].asnumpy())
                np.save(f, gradient_sigma_feat[0].asnumpy())

            # (batch_size * num_samples, seq_len, *target_shape)
            repeated_past_target = F.concat(repeated_past_target,
                                            new_samples,
                                            dim=1)
            future_samples.append(new_samples)

        # (batch_size * num_samples, prediction_length, *target_shape)
        samples = F.concat(*future_samples, dim=1)

        # (batch_size, num_samples, prediction_length, *target_shape)
        out_shape = ((-1, self.num_parallel_samples) +
                     (self.prediction_length, ) + self.target_shape)
        return samples.reshape(shape=out_shape)