Example #1
    def layerwise_relevance_zclip(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0]
        z = self._out
        weight = self.weight.data(ctx=a.context)
        wplus = nd.maximum(0., weight)
        wminus = nd.minimum(0., weight)

        bplus = None
        bminus = None
        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bplus = nd.maximum(0., bias)
            bminus = nd.minimum(0., bias)

        alpha = z > 0.
        beta = z < 0.

        a.attach_grad()
        with autograd.record():
            zplus = self._forward(data=a, weight=wplus, bias=bplus)
        cplus, = autograd.grad(zplus,
                               a,
                               head_grads=alpha * R / (zplus + (zplus == 0.)))

        with autograd.record():
            zminus = self._forward(data=a, weight=wminus, bias=bminus)
        cminus, = autograd.grad(zminus,
                                a,
                                head_grads=beta * R / (zminus +
                                                       (zminus == 0.)))

        return a * (cplus - cminus)
Example #2
    def layerwise_relevance_zclip(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0]
        z = self._out
        weight = self.weight.data(ctx=a.context)
        wplus = nd.maximum(0., weight)
        wminus = nd.minimum(0., weight)

        bplus = None
        bminus = None
        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bplus = nd.maximum(0., bias)
            bminus = nd.minimum(0., bias)

        alpha = z > 0.
        beta = z < 0.

        a.attach_grad()
        with autograd.record():
            zplus = self._forward(data=a, weight=wplus, bias=bplus)
        cplus, = autograd.grad(zplus, a, head_grads=alpha*R/(zplus + (zplus == 0.)))

        with autograd.record():
            zminus = self._forward(data=a, weight=wminus, bias=bminus)
        cminus, = autograd.grad(zminus, a, head_grads=beta*R/(zminus + (zminus == 0.)))

        return a*(cplus - cminus)
Example #3
def test_dense_backward_no_flatten():
    print("2nd order gradient for Fully Connected, flatten=False")
    for x in NDArrayGenerator(5, 3):
        hidden = random.randrange(1, 4)
        net = gluon.nn.Sequential()
        with net.name_scope():
            net.add(gluon.nn.Dense(hidden, flatten=False))
        net.initialize(mxnet.initializer.Constant(.5))
        x.attach_grad()
        with autograd.record():
            y = net.forward(x)
            o_y = arange_shape_like(y)  # head gradient of y
            params = [p.data() for p in net.collect_params().values()]
            w = params[0]
            b = params[1]
            print("Checking y ({}) = x({}) * w^T({}) + b({})".format(
                y.shape, x.shape, w.shape, b.shape))
            x_grad = autograd.grad(heads=y,
                                   variables=x,
                                   head_grads=o_y,
                                   create_graph=True,
                                   retain_graph=True)[0]
            o_x_grad = arange_shape_like(x_grad)
            w_grad_grad = autograd.grad(heads=x_grad,
                                        variables=w,
                                        head_grads=o_x_grad,
                                        create_graph=False)[0]
            w_grad = autograd.grad(heads=y,
                                   variables=w,
                                   head_grads=o_y,
                                   create_graph=True,
                                   retain_graph=True)[0]
            o_w_grad = arange_shape_like(w_grad)
            x_grad_grad = autograd.grad(heads=w_grad,
                                        variables=x,
                                        head_grads=o_w_grad,
                                        create_graph=False)[0]
        # Expected results
        o_y = flatten2d_left(o_y)
        x = flatten2d_left(x)
        o_x_grad = flatten2d_left(o_x_grad)
        o_w_grad = flatten2d_left(o_w_grad)
        w_grad_e = nd.dot(o_y, x, transpose_a=True)
        w_grad_grad_e = nd.dot(o_y, o_x_grad, transpose_a=True)
        x_grad_e = nd.dot(o_y, w)
        x_grad_grad_e = nd.dot(o_y, o_w_grad)
        w_grad_check = same(flatten2d_left(w_grad), flatten2d_left(w_grad_e))
        w_grad_grad_check = same(flatten2d_left(w_grad_grad),
                                 flatten2d_left(w_grad_grad_e))
        x_grad_check = same(flatten2d_left(x_grad), flatten2d_left(x_grad_e))
        x_grad_grad_check = same(flatten2d_left(x_grad_grad),
                                 flatten2d_left(x_grad_grad_e))
        ok_(x_grad_check)
        ok_(w_grad_check)
        ok_(x_grad_grad_check)
        ok_(w_grad_grad_check)
Example #4
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dx = in_grad[0]
        dw = in_grad[1]
        db = in_grad[2]
        dy = out_grad[0]

        x = in_data[0]
        w = in_data[1]
        b = in_data[2]
        y = out_data[0]

        int_x, x_shift_bit = self.int_quantize(x)
        int_w, int_b, w_shift_bit = self.int_quantize_double(w, b)

        int_x.attach_grad(), int_w.attach_grad(), int_b.attach_grad()

        with autograd.record():
            y[:] = mx.nd.add(mx.nd.dot(int_x, int_w.T), int_b)
        dx, dw, db = autograd.grad(y, [int_x, int_w, int_b],
                                   dy,
                                   retain_graph=True)

        #print('dx_origin:', dx)
        #print('dw_origin:', dw)
        #print('db_origin:', db)
        #print(x_shift_bit, w_shift_bit)
        #print('dx:', dx/(2**x_shift_bit), )
        #print('dw:', dw/(2**w_shift_bit), )
        #print('db:', db/(2**w_shift_bit))
        self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
        self.assign(in_grad[1], req[0], dw / (2**w_shift_bit))
        self.assign(in_grad[2], req[0], db / (2**w_shift_bit))
Example #5
def get_gradient(crit, real, fake, epsilon):
    mixed_images = epsilon * real + (1 - epsilon) * fake
    mixed_images.attach_grad()
    with autograd.record():
        mixed_scores = crit(mixed_images)
    grad = autograd.grad(mixed_scores, [mixed_images], retain_graph=True)[0]
    return grad
Example #6
def check_second_order_unary(x, op, grad_grad_op, rtol=None, atol=None):
    x = nd.array(x)
    grad_grad_x = grad_grad_op(x)
    x.attach_grad()

    # Manual head_grads.
    y_grad = nd.random.normal(shape=x.shape)
    head_grad_grads = nd.random.normal(shape=x.shape)

    # Perform compute.
    with autograd.record():
        y = op(x)
        x_grad = autograd.grad(heads=y,
                               variables=x,
                               head_grads=y_grad,
                               create_graph=True,
                               retain_graph=True)[0]
    x_grad.backward(head_grad_grads)

    # Compute expected values.
    expected_grad_grad = grad_grad_x.asnumpy() * head_grad_grads.asnumpy() * \
        y_grad.asnumpy()

    # Validate the gradients.
    assert_almost_equal(expected_grad_grad,
                        x.grad.asnumpy(),
                        rtol=rtol,
                        atol=atol)
Example #7
def jacobian_autograd(x, y):
    jac = []
    for i in range(y.shape[1]):
        with autograd.record():
            yi = y[:, i]
        dyidx = autograd.grad(yi, [x], create_graph=True)[0]
        jac += [nd.expand_dims(dyidx, 1)]
    return nd.concatenate(jac, 1)
Example #8
def get_gradient(crit, real, fake, epsilon):
    mixed_images = epsilon * real + (1 - epsilon) * fake
    mixed_images.attach_grad()
    # with autograd.record():
    mixed_scores = crit(mixed_images)
    grad = autograd.grad(mixed_scores, [mixed_images], retain_graph=True, create_graph=True,
                         head_grads=nd.ones_like(mixed_scores))[0]
    return grad
Example #9
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dx = in_grad[0]
        dgamma = in_grad[1]
        dbeta = in_grad[2]

        x = in_data[0]
        gamma = in_data[1]
        beta = in_data[2]
        mean = in_data[3]
        var = in_data[4]
        new_gamma = in_data[5]
        new_beta = in_data[6]
        y_shift_bit = in_data[7]
        last_shift_bit = in_data[8]

        y = out_data[0]
        dy = out_grad[0]

        mean = nd.mean(x, axis=(0, 2, 3))
        var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))

        quan_gamma = gamma / (nd.sqrt(var + self.eps))
        quan_beta = beta - mean * gamma / nd.sqrt(var + self.eps)

        # quan_gamma = nd.clip(nd.floor(nd.log2(quan_gamma)), a_min=-3, a_max=0)
        # quan_gamma = 2**(quan_gamma)
        quan_gamma = quan_gamma * (2**last_shift_bit)
        # quan_beta, beta_shift_bit = self.int_quantize(quan_beta)
        quan_gamma, quan_beta, gamma_shift_bit = self.int_quantize_double(
            quan_gamma, quan_beta)
        x.attach_grad(), quan_gamma.attach_grad(), quan_beta.attach_grad()
        # print(quan_gamma)

        with autograd.record():
            y = nd.BatchNorm(x,
                             gamma=quan_gamma,
                             beta=quan_beta,
                             moving_mean=nd.zeros(shape=mean.shape),
                             moving_var=nd.ones(shape=var.shape),
                             eps=self.eps,
                             momentum=self.momentum,
                             fix_gamma=False,
                             name=self.name)
            y, y_shift_bit = self.int_quantize(y)
        # print(quan_gamma)

        dx, dgamma, dbeta = autograd.grad(y, [x, quan_gamma, quan_beta],
                                          dy,
                                          retain_graph=True)

        self.assign(in_grad[0], req[0], dx / 2**y_shift_bit)
        self.assign(in_grad[1], req[0],
                    dgamma / 2**(gamma_shift_bit + last_shift_bit))
        self.assign(in_grad[2], req[0], dbeta / 2**gamma_shift_bit)

        self.assign(in_data[5], req[0], quan_gamma)
        self.assign(in_data[6], req[0], quan_beta)
        self.assign(in_data[7], req[0], y_shift_bit)
Example #10
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dx = in_grad[0]
        dw = in_grad[1]
        x = in_data[0]
        w = in_data[1]

        dy = out_grad[0]

        x, x_shift_bit = self.int_quantize(x)
        w, w_shift_bit = self.int_quantize(w)
        y = out_data[0]
        if self.no_bias:
            x.attach_grad(), w.attach_grad()
            with autograd.record():
                y[:] = nd.Convolution(data=x,
                                      weight=w,
                                      kernel=self.kernel,
                                      num_filter=self.num_filter,
                                      stride=self.stride,
                                      pad=self.pad,
                                      no_bias=self.no_bias,
                                      workspace=self.workspace,
                                      name=self.name)
            dx, dw = autograd.grad(y, [x, w], dy, retain_graph=True)
            self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
            self.assign(in_grad[1], req[0], dw / (2**w_shift_bit))
        else:
            b = in_data[2]
            b, b_shift_bit = self.int_quantize(b)
            x.attach_grad(), w.attach_grad(), b.attach_grad()
            with autograd.record():
                y[:] = nd.Convolution(data=x,
                                      weight=w,
                                      bias=b,
                                      kernel=self.kernel,
                                      num_filter=self.num_filter,
                                      stride=self.stride,
                                      pad=self.pad,
                                      no_bias=self.no_bias,
                                      workspace=self.workspace,
                                      name=self.name)
            dx, dw, db = autograd.grad(y, [x, w, b], dy, retain_graph=True)
            self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
            self.assign(in_grad[1], req[0], dw / (2**w_shift_bit))
            self.assign(in_grad[2], req[0], db / (2**b_shift_bit))
Example #11
def check_second_order_unary(x, op, grad_grad_op):
    x = nd.array(x)
    expect_grad_grad = grad_grad_op(x)
    x.attach_grad()
    with autograd.record():
        y = op(x)
        y_grad = autograd.grad(y, x, create_graph=True, retain_graph=True)[0]
    y_grad.backward()
    assert_almost_equal(expect_grad_grad.asnumpy(), x.grad.asnumpy())
Example #12
  def compute_gradients(self,
                        elbo: nd.NDArray,
                        data_batch: mx.io.DataBatch = None,
                        log_q_sum: nd.NDArray = None,
                        mode: str = 'train') -> None:
    """Compute gradients and assign them to variational parameters.

    Args:
      elbo: evidence lower bound that we maximize.
      data_batch: minibatch of data with data indices as labels.
      log_q_sum: sum of log probs of samples from variational distributions q.
      mode: point-mass parameters only receive gradients when mode == 'train'.
    """
    cfg = self.gradient_config
    if cfg['estimator'] == 'pathwise':
      for block in self.sequential._children:
        for child_block in block._children:
          if hasattr(child_block, 'is_reparam'):
            assert child_block.is_reparam == True
    if len(self._point_mass_params) > 0 and mode == 'train':
      variables = [p.data() for p in self._point_mass_params]
      assert elbo.shape[-1] == cfg['batch_size']
      loss = nd.mean(-elbo, -1)
      point_mass_grads = autograd.grad(loss, variables, retain_graph=True)
      _assign_grads(self._point_mass_params, point_mass_grads)
    if cfg['estimator'] == 'pathwise':
      (-elbo).backward()
    elif cfg['estimator'] == 'score_function':
      variables = [param.repeated for param in self._score_params]
      score_functions = autograd.grad(log_q_sum, variables)
      mx.autograd.set_recording(False)
      score_grads = []
      for param, score_function in zip(self._score_params, score_functions):
        grad = _leave_one_out_gradient_estimator(score_function, -elbo)
        if 'emb' in param.name:
          # turns out the sparse implementation is not faster?!
          # data, label = data_batch
          # label = label.astype(np.int64)
          # grad = nd.sparse.row_sparse_array(
          #     grad, indices=label, shape=param.shape)
          # need to broadcast for embeddings
          one_hot = nd.one_hot(data_batch[1], depth=self.n_data)
          grad = nd.dot(one_hot, grad, transpose_a=True)
        score_grads.append(grad)
      _assign_grads(self._score_params, score_grads)
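
The docstring above separates two estimators: a pathwise (reparameterization) estimator, where (-elbo).backward() differentiates straight through the sampling path, and a score-function estimator built from autograd.grad of log_q_sum. A minimal standalone sketch of the pathwise case (not part of the class above; a toy squared error stands in for -elbo) looks like this:

from mxnet import nd, autograd

# Variational parameters of q = Normal(mu, exp(log_sigma)); both are leaf variables.
mu, log_sigma = nd.zeros((1,)), nd.zeros((1,))
mu.attach_grad(), log_sigma.attach_grad()

eps = nd.random.normal(shape=(64,))    # noise drawn outside the recorded graph
with autograd.record():
    z = mu + nd.exp(log_sigma) * eps   # reparameterized sample, differentiable in mu / log_sigma
    loss = nd.mean((z - 3.0) ** 2)     # stand-in for -elbo
loss.backward()                        # pathwise gradients end up in mu.grad and log_sigma.grad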
Example #13
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dy = out_grad[0] * 2**out_data[1]

        x = in_data[0] / 2**in_data[3]
        w = in_data[1]
        b = in_data[2]

        x.attach_grad(), w.attach_grad(), b.attach_grad()
        if self.no_bias:
            with autograd.record():
                y = nd.Convolution(
                    data=x,
                    weight=w,
                    # bias=b_int,
                    kernel=self.kernel,
                    num_filter=self.num_filter,
                    stride=self.stride,
                    pad=self.pad,
                    no_bias=self.no_bias,
                    workspace=self.workspace,
                    name=self.name)
            dx, dw = autograd.grad(y, [x, w], dy, retain_graph=True)
        else:
            with autograd.record():
                y = nd.Convolution(data=x,
                                   weight=w,
                                   bias=b,
                                   kernel=self.kernel,
                                   num_filter=self.num_filter,
                                   stride=self.stride,
                                   pad=self.pad,
                                   no_bias=self.no_bias,
                                   workspace=self.workspace,
                                   name=self.name)
            dx, dw, db = autograd.grad(y, [x, w, b], dy, retain_graph=True)

        self.assign(in_grad[0], req[0], dx)
        self.assign(in_grad[1], req[1], dw)
        if not self.no_bias:
            self.assign(in_grad[2], req[2], db)
Example #14
def test_autograd():
    a = nd.ones((1, 1)) * 0.05
    b = nd.ones((1, 1))
    q_a = (a * 256).floor()
    q_b = (b * 256).floor()
    a.attach_grad(), b.attach_grad()
    c = a * b
    c.attach_grad()
    with autograd.record():
        loss = (1 - c)**2 / 2
    q_c = q_a * q_b / 256 / 256
    q_loss = (1 - q_c)**2 / 2

    da, db, dc = autograd.grad(loss, [a, b, c], 1 - q_loss, retain_graph=True)
Example #15
    def layerwise_relevance_wsquare(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0].ones_like()
        weight = self.weight.data(ctx=a.context)
        wsquare = weight**2
        bsquare = None

        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bsquare = bias**2

        a.attach_grad()
        with autograd.record():
            z = self._forward(data=a, weight=wsquare, bias=bsquare)
        c, = autograd.grad(z, a, head_grads=R / (z + (z == 0.)))
        return c
Example #16
    def layerwise_relevance_zplus(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0]
        weight = self.weight.data(ctx=a.context)
        wplus = nd.maximum(0., weight)

        bplus = None
        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bplus = nd.maximum(0., bias)

        a.attach_grad()
        with autograd.record():
            z = self._forward(data=a, weight=wplus, bias=bplus)
        c, = autograd.grad(z, a, head_grads=R/(z + (z == 0.)))
        return a*c
Example #17
    def layerwise_relevance_zplus(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0]
        weight = self.weight.data(ctx=a.context)
        wplus = nd.maximum(0., weight)

        bplus = None
        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bplus = nd.maximum(0., bias)

        a.attach_grad()
        with autograd.record():
            z = self._forward(data=a, weight=wplus, bias=bplus)
        c, = autograd.grad(z, a, head_grads=R / (z + (z == 0.)))
        return a * c
Example #18
    def layerwise_relevance_wsquare(self, out, use_bias=False, **kwargs):
        if self._in is None:
            raise RuntimeError('Block has not yet executed forward_logged!')
        R = out
        a = self._in[0].ones_like()
        weight = self.weight.data(ctx=a.context)
        wsquare = weight**2
        bsquare = None

        if use_bias:
            bias = self.bias.data(ctx=a.context)
            bsquare = bias**2

        a.attach_grad()
        with autograd.record():
            z = self._forward(data=a, weight=wsquare, bias=bsquare)
        c, = autograd.grad(z, a, head_grads=R/(z + (z == 0.)))
        return c
Example #19
def get_crit_loss(gen, crit, real, batch_size, z_dim, ctx):
    z = nd.random.randn(batch_size, z_dim, 1, 1, ctx=ctx)
    fake = gen(z).detach()
    y_pred_fake = crit(fake).reshape(real.shape[0], -1)
    y_pred_real = crit(real).reshape(real.shape[0], -1)
    epsilon = np.random.rand(len(real), 1, 1, 1)
    epsilon = nd.array(epsilon, ctx=ctx)
    # grad = get_gradient(crit, X, Xhat.detach(), epsilon)

    mixed_images = epsilon * real + (1 - epsilon) * fake
    mixed_images.attach_grad()
    # with autograd.record():
    mixed_scores = crit(mixed_images)
    grad = autograd.grad(mixed_scores, [mixed_images], retain_graph=True, create_graph=True,
                         head_grads=nd.ones_like(mixed_scores))[0]
    gp = gradient_penalty(grad)
    crit_loss = crit_loss_fn(y_pred_fake, y_pred_real, gp, C_LAMBDA)
    return crit_loss
Example #20
def calc_gradient_penalty(netD, real_data, fake_data, LAMBDA, ctx):
    real_data = real_data.as_in_context(ctx)
    b_s = real_data.shape[0]
    alpha = nd.random.uniform(0, 1, shape=(b_s, 1, 1, 1), ctx=ctx)
    alpha = alpha.broadcast_to(real_data.shape)
    interpolates = alpha * real_data + ((1 - alpha) * fake_data)

    interpolates = nd.array(interpolates)
    interpolates.attach_grad()
    disc_interpolates = netD(interpolates)
    gradients = autograd.grad(heads=disc_interpolates,
                              variables=interpolates,
                              head_grads=nd.ones(shape=disc_interpolates.shape,
                                                 ctx=ctx),
                              create_graph=True,
                              retain_graph=True)[0]

    gradients = gradients.reshape((gradients.shape[0], -1))
    gradient_penalty = (
        (gradients.norm(2, axis=1, keepdims=True) - 1)**2).mean() * LAMBDA
    return gradient_penalty
Example #21
def wasser_penalty(dis_model, real, fake, penalty_rate, ctx=None):
    from mxnet import autograd
    with autograd.pause():
        alpha = mx.nd.random_uniform(shape=real.shape)
        if ctx:
            alpha = alpha.as_in_context(ctx)
        interpolates = alpha * real.detach() + ((1 - alpha) * fake.detach())

    interpolates = interpolates.detach()
    interpolates.attach_grad()
    z = dis_model(interpolates)
    gradients = autograd.grad(heads=z,
                              variables=interpolates,
                              head_grads=mx.nd.ones(shape=z.shape, ctx=ctx),
                              retain_graph=True,
                              create_graph=True)[0]
    gradients = gradients.reshape((gradients.shape[0], -1))
    gradients_penalty = (
        (gradients.norm(2, axis=1) - 1)**2).mean() * penalty_rate
    gradients_penalty.attach_grad()
    if ctx:
        gradients_penalty = gradients_penalty.as_in_context(ctx)
    return gradients_penalty
Example #22
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dy = out_grad[0]

        x = in_data[0]
        w = in_data[1]
        b = in_data[2]
        y = out_data[0]

        x.attach_grad(), w.attach_grad(), b.attach_grad()

        with autograd.record():
            y[:] = mx.nd.add(mx.nd.dot(x, w.T), b)

        dx, dw, db = autograd.grad(y, [x, w, b], dy, retain_graph=True)
        y, y_shift_bit = self.int_quantize(y)

        self.assign(in_grad[0], req[0], dx)
        self.assign(in_grad[1], req[0], dw)
        self.assign(in_grad[2], req[0], db)

        self.assign(in_data[3], req[0], w)
        self.assign(in_data[4], req[0], b)
        self.assign(in_data[5], req[0], y_shift_bit)
Example #23
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dx = in_grad[0]
        dgamma = in_grad[1]
        dbeta = in_grad[2]

        x = in_data[0]
        gamma = in_data[1]
        beta = in_data[2]

        y = out_data[0]
        dy = out_grad[0]

        mean = nd.mean(x, axis=(0, 2, 3))
        var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))

        quan_gamma = gamma
        quan_beta = beta

        x.attach_grad(), gamma.attach_grad(), beta.attach_grad()
        with autograd.record():
            y = nd.BatchNorm(x,
                             gamma=quan_gamma,
                             beta=quan_beta,
                             moving_mean=mean,
                             moving_var=var,
                             eps=self.eps,
                             momentum=self.momentum,
                             fix_gamma=self.fix_gamma,
                             name=self.name)

        dx, dgamma, dbeta = autograd.grad(y, [x, quan_gamma, quan_beta],
                                          dy,
                                          retain_graph=True)
        self.assign(in_grad[0], req[0], dx)
        self.assign(in_grad[1], req[0], dgamma)
        self.assign(in_grad[2], req[0], dbeta)
Example #24
    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        dx = in_grad[0]
        dw = in_grad[1]
        x = in_data[0]
        w = in_data[1]
        # b = in_data[2]
        # w = in_data[3]
        # quan_b = in_data[4]
        x_shift_bit = in_data[5]
        dy = out_grad[0]

        # x, x_shift_bit = self.int_quantize(x)
        quan_w, w_shift_bit = self.int_quantize(w)
        y = out_data[0]
        if self.no_bias:
            x.attach_grad(), quan_w.attach_grad()
            with autograd.record():
                y[:] = nd.Convolution(data=x,
                                      weight=quan_w,
                                      kernel=self.kernel,
                                      num_filter=self.num_filter,
                                      stride=self.stride,
                                      pad=self.pad,
                                      no_bias=self.no_bias,
                                      workspace=self.workspace,
                                      name=self.name)
            dx, dw = autograd.grad(y, [x, quan_w], dy, retain_graph=True)
            y, y_shift_bit = self.int_quantize(y)
            # print(y_shift_bit)
            # y_shift_bit = (x_shift_bit * 0.3 + y_shift_bit * 0.7).floor()
            self.assign(in_grad[0], req[0], dx / (2**y_shift_bit))
            self.assign(in_grad[1], req[0], dw / (2**w_shift_bit))

            self.assign(in_data[3], req[0], quan_w)
            # self.assign(in_data[3], req[0], quan_b)
            self.assign(in_data[5], req[0], y_shift_bit)

        else:
            b = in_data[2]
            quan_b, b_shift_bit = self.int_quantize(b)
            x.attach_grad(), w.attach_grad(), quan_b.attach_grad()
            with autograd.record():
                y[:] = nd.Convolution(data=x,
                                      weight=w,
                                      bias=quan_b,
                                      kernel=self.kernel,
                                      num_filter=self.num_filter,
                                      stride=self.stride,
                                      pad=self.pad,
                                      no_bias=self.no_bias,
                                      workspace=self.workspace,
                                      name=self.name)
            dx, dw, db = autograd.grad(y, [x, w, quan_b], dy, retain_graph=True)
            y, y_shift_bit = self.int_quantize(y)
            self.assign(in_grad[0], req[0], dx / (2**x_shift_bit))
            self.assign(in_grad[1], req[0], dw / (2**w_shift_bit))
            self.assign(in_grad[2], req[0], db / (2**b_shift_bit))

            self.assign(in_data[2], req[0], quan_w)
            self.assign(in_data[3], req[0], quan_b)
            self.assign(in_data[4], req[0], x_shift_bit)
            self.assign(in_data[5], req[0], y_shift_bit)
Example #25
def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None):
    """Assert n-th order autograd gradient against expected gradient.

    Multiple orders of gradients can be checked by passing a list of
    functions, each computing the gradient of a particular order, together
    with the corresponding list of orders.

    Note
    ----
    1. Orders should always be monotonically increasing.
    2. Elements of grad_ops should correspond to elements of orders,
       i.e. grad_ops = [grad_op, grad_grad_grad_op] should be passed with
       orders = [1, 3].

    Parameters
    ----------
    x : mxnet.NDArray
        Input Array.
    op : Callable
        Operation to perform on Input Array.
    grad_ops : Callable or List of Callable
        Function(s) computing the expected gradient of the given order.
    orders : int or List of int
        Order(s) of the gradients to check.

    Returns
    -------
    None

    """
    if isinstance(orders, int):
        orders = [orders]
        grad_ops = [grad_ops]

    assert all(i < j for i, j in zip(orders[0:-1], orders[1:])), \
        "orders should be monotonically increasing"
    assert len(set(orders)) == len(orders), \
        "orders should have unique elements"
    highest_order = max(orders)

    x = nd.array(x)
    x.attach_grad()

    expected_grads = [grad_op(x) for grad_op in grad_ops]
    computed_grads = []
    head_grads = []

    # Perform compute.
    with autograd.record():
        y = op(x)
        for current_order in range(1, highest_order + 1):
            head_grad = nd.random.normal(shape=x.shape)
            y = autograd.grad(heads=y,
                              variables=x,
                              head_grads=head_grad,
                              create_graph=True,
                              retain_graph=True)[0]
            if current_order in orders:
                computed_grads.append(y)
            head_grads.append(head_grad)

    # Validate all the gradients.
    for order, grad, computed_grad in \
            zip(orders, expected_grads, computed_grads):
        # Compute expected values.
        expected_grad = grad.asnumpy()
        for head_grad in head_grads[:order]:
            expected_grad *= head_grad.asnumpy()

        assert_almost_equal(expected_grad,
                            computed_grad.asnumpy(),
                            rtol=rtol,
                            atol=atol)
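
A usage sketch makes the grad_ops/orders pairing from the docstring concrete. Assuming numpy is imported as np and taking nd.sin as the operator (whose first and third derivatives are cos and -cos), a call might look like this:

import numpy as np
from mxnet import nd

x = np.random.rand(2, 3)
# First- and third-order derivatives of sin are cos and -cos, matching orders=[1, 3].
check_nth_order_unary(x, nd.sin, [nd.cos, lambda v: -nd.cos(v)], [1, 3])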
Example #26
def get_deriv_autograd(input, act):
    input.attach_grad()
    with autograd.record():
        output = act(input)
    return autograd.grad(output, [input], create_graph=True)[0]
Example #27
    def sampling_decoder(
        self,
        F,
        static_feat: Tensor,
        past_target: Tensor,
        time_feat: Tensor,
        scale: Tensor,
        begin_states: List,
    ) -> Tensor:
        """
        Computes sample paths by unrolling the LSTM starting with an initial
        input and state.

        Parameters
        ----------
        static_feat : Tensor
            static features. Shape: (batch_size, num_static_features).
        past_target : Tensor
            target history. Shape: (batch_size, history_length).
        time_feat : Tensor
            time features. Shape: (batch_size, prediction_length, num_time_features).
        scale : Tensor
            tensor containing the scale of each element in the batch. Shape: (batch_size, 1, 1).
        begin_states : List
            list of initial states for the LSTM layers.
            the shape of each tensor of the list should be (batch_size, num_cells)
        Returns
        --------
        Tensor
            A tensor containing sampled paths.
            Shape: (batch_size, num_sample_paths, prediction_length).
        """
        time_feat.attach_grad()
        past_target.attach_grad()
        with autograd.record():
            # blow up the batch dimension of each tensor to batch_size * self.num_parallel_samples for increased parallelism
            repeated_past_target = past_target.repeat(
                repeats=self.num_parallel_samples, axis=0)
            repeated_time_feat = time_feat.repeat(
                repeats=self.num_parallel_samples, axis=0)
            repeated_static_feat = static_feat.repeat(
                repeats=self.num_parallel_samples, axis=0).expand_dims(axis=1)
            repeated_scale = scale.repeat(repeats=self.num_parallel_samples,
                                          axis=0)
            repeated_states = [
                s.repeat(repeats=self.num_parallel_samples, axis=0)
                for s in begin_states
            ]

            future_samples = []

            # for each future time unit we draw new samples and update the state
            for k in range(self.prediction_length):
                # (batch_size * num_samples, 1, *target_shape, num_lags)
                lags = self.get_lagged_subsequences(
                    F=F,
                    sequence=repeated_past_target,
                    sequence_length=self.history_length + k,
                    indices=self.shifted_lags,
                    subsequences_length=1,
                )

                # (batch_size * num_samples, 1, *target_shape, num_lags)
                lags_scaled = F.broadcast_div(
                    lags, repeated_scale.expand_dims(axis=-1))

                # from (batch_size * num_samples, 1, *target_shape, num_lags)
                # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
                input_lags = F.reshape(
                    data=lags_scaled,
                    shape=(-1, 1,
                           prod(self.target_shape) * len(self.lags_seq)),
                )

                # (batch_size * num_samples, 1, prod(target_shape) * num_lags + num_time_features + num_static_features)
                decoder_input = F.concat(
                    input_lags,
                    repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
                    repeated_static_feat,
                    dim=-1,
                )

                # output shape: (batch_size * num_samples, 1, num_cells)
                # state shape: (batch_size * num_samples, num_cells)
                rnn_outputs, repeated_states = self.rnn.unroll(
                    inputs=decoder_input,
                    length=1,
                    begin_state=repeated_states,
                    layout="NTC",
                    merge_outputs=True,
                )
                distr_args = self.proj_distr_args(rnn_outputs)

                # compute likelihood of target given the predicted parameters
                distr = self.distr_output.distribution(distr_args,
                                                       scale=repeated_scale)
                # Gaussian has mu and stddev; Student's t has mu, sigma and nu
                gradient_mu_feat = autograd.grad(distr.base_distribution.mu,
                                                 [time_feat],
                                                 create_graph=True)
                gradient_sigma_feat = autograd.grad(
                    distr.base_distribution.sigma, [time_feat],
                    create_graph=True)
                gradient_nu_feat = autograd.grad(distr.base_distribution.nu,
                                                 [time_feat],
                                                 create_graph=True)
                # (batch_size * num_samples, 1, *target_shape)
                new_samples = distr.sample(dtype=self.dtype)
                with open('gradients.npy', 'wb') as f:
                    np.save(f, gradient_mu_feat[0].asnumpy())
                    np.save(f, gradient_nu_feat[0].asnumpy())
                    np.save(f, gradient_sigma_feat[0].asnumpy())

                # (batch_size * num_samples, seq_len, *target_shape)
                repeated_past_target = F.concat(repeated_past_target,
                                                new_samples,
                                                dim=1)
                future_samples.append(new_samples)

            # (batch_size * num_samples, prediction_length, *target_shape)
            samples = F.concat(*future_samples, dim=1)

        # (batch_size, num_samples, prediction_length, *target_shape)
        return samples.reshape(shape=((-1, self.num_parallel_samples) +
                                      (self.prediction_length, ) +
                                      self.target_shape))