Esempio n. 1
0
def test_non_mean_field_bern_normal_elbo_gradient(enumerate1, pi1, pi2, pi3, include_z=True):
    """Compare TraceEnum_ELBO surrogate gradients against analytic KL
    gradients for a non-mean-field Bernoulli/Normal model-guide pair."""
    pyro.clear_param_store()
    num_particles = 10000

    def model():
        with pyro.iarange("particles", num_particles):
            q3 = pyro.param("q3", torch.tensor(pi3, requires_grad=True))
            y = pyro.sample("y", dist.Bernoulli(q3).expand_by([num_particles]))
            if include_z:
                pyro.sample("z", dist.Normal(0.55 * y + q3, 1.0))

    def guide():
        q1 = pyro.param("q1", torch.tensor(pi1, requires_grad=True))
        q2 = pyro.param("q2", torch.tensor(pi2, requires_grad=True))
        with pyro.iarange("particles", num_particles):
            y = pyro.sample("y", dist.Bernoulli(q1).expand_by([num_particles]),
                            infer={"enumerate": enumerate1})
            if include_z:
                pyro.sample("z", dist.Normal(q2 * y + 0.10, 1.0))

    logger.info("Computing gradients using surrogate loss")
    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    elbo.loss_and_grads(model, guide)
    actual_grad_q1 = pyro.param('q1').grad / num_particles
    if include_z:
        actual_grad_q2 = pyro.param('q2').grad / num_particles
    actual_grad_q3 = pyro.param('q3').grad / num_particles

    logger.info("Computing analytic gradients")
    q1 = torch.tensor(pi1, requires_grad=True)
    q2 = torch.tensor(pi2, requires_grad=True)
    q3 = torch.tensor(pi3, requires_grad=True)
    # Analytic ELBO: KL over y plus the y-mixture of conditional KLs over z.
    elbo = kl_divergence(dist.Bernoulli(q1), dist.Bernoulli(q3))
    if include_z:
        elbo = elbo + q1 * kl_divergence(dist.Normal(q2 + 0.10, 1.0), dist.Normal(q3 + 0.55, 1.0))
        elbo = elbo + (1.0 - q1) * kl_divergence(dist.Normal(0.10, 1.0), dist.Normal(q3, 1.0))
        expected_grad_q1, expected_grad_q2, expected_grad_q3 = grad(elbo, [q1, q2, q3])
    else:
        expected_grad_q1, expected_grad_q3 = grad(elbo, [q1, q3])

    prec = 0.04 if enumerate1 is None else 0.02

    def _msg(template, expected, actual):
        # Render the comparison message with numpy values for readability.
        return template.format(expected.data.cpu().numpy(), actual.data.cpu().numpy())

    assert_equal(actual_grad_q1, expected_grad_q1, prec=prec,
                 msg=_msg("\nq1 expected = {}\nq1   actual = {}",
                          expected_grad_q1, actual_grad_q1))
    if include_z:
        assert_equal(actual_grad_q2, expected_grad_q2, prec=prec,
                     msg=_msg("\nq2 expected = {}\nq2   actual = {}",
                              expected_grad_q2, actual_grad_q2))
    assert_equal(actual_grad_q3, expected_grad_q3, prec=prec,
                 msg=_msg("\nq3 expected = {}\nq3   actual = {}",
                          expected_grad_q3, actual_grad_q3))
Esempio n. 2
0
    def guide():
        """Guide that takes one Newton step from x=0 toward the posterior mode."""
        p = pyro.param("p", torch.tensor(0.5), constraint=constraints.unit_interval)
        scale = pyro.param("scale", torch.tensor(1.0), constraint=constraints.positive)
        var = pyro.param("var", torch.tensor(1.0), constraint=constraints.positive)

        # Negative log-joint of a Normal(0, 10) prior and Normal likelihood,
        # evaluated at the starting point x = 0.
        x = torch.tensor(0., requires_grad=True)
        log_prior = dist.Normal(0., 10.).log_prob(x)
        log_lik = dist.Normal(x, scale).log_prob(data).sum()
        loss = -(log_prior + log_lik)

        # First and second derivatives (scalar case), then one Newton update.
        g = grad(loss, [x], create_graph=True)[0]
        H = grad(g, [x], create_graph=True)[0]
        loc = x.detach() - g / H  # newton step
        pyro.sample("loc", dist.Normal(loc, var))
        pyro.sample("b", dist.Bernoulli(p))
Esempio n. 3
0
def test_elbo_bern(quantity, enumerate1):
    """Check TraceEnum_ELBO loss or gradient against the analytic Bernoulli KL."""
    pyro.clear_param_store()
    num_particles = 1 if enumerate1 else 10000
    prec = 0.001 if enumerate1 else 0.1
    q = pyro.param("q", torch.tensor(0.5, requires_grad=True))
    kl = kl_divergence(dist.Bernoulli(q), dist.Bernoulli(0.25))

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(0.25).expand_by([num_particles]))

    @config_enumerate(default=enumerate1)
    def guide():
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(pyro.param("q")).expand_by([num_particles]))

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))

    if quantity == "loss":
        actual = elbo.loss(model, guide) / num_particles
        expected = kl.item()
        msg = ("\nexpected = {}".format(expected)
               + "\n  actual = {}".format(actual))
        assert_equal(actual, expected, prec=prec, msg=msg)
    else:
        elbo.loss_and_grads(model, guide)
        actual = q.grad / num_particles
        expected = grad(kl, [q])[0]
        msg = ("\nexpected = {}".format(expected.detach().cpu().numpy())
               + "\n  actual = {}".format(actual.detach().cpu().numpy()))
        assert_equal(actual, expected, prec=prec, msg=msg)
Esempio n. 4
0
    def forward(self, real_samples, fake_samples, **critic_kwargs):
        """WGAN-GP gradient penalty, scaled by ``self.weight``.

        Evaluates the critic at random per-sample interpolates between real
        and fake samples and penalizes the squared deviation of each
        sample's gradient norm from 1.

        Fix: the gradient is flattened per sample before taking the 2-norm.
        The previous ``gradients.norm(2, dim=1)`` computed norms over dim 1
        only, which is not the per-sample gradient norm for inputs with
        more than two dimensions (e.g. image batches).
        """
        from torch.autograd import grad

        # Match shapes so the element-wise interpolation below is valid.
        real_samples = real_samples.view(fake_samples.shape)

        subset_size = real_samples.shape[0]
        real_samples = real_samples[:subset_size]
        fake_samples = fake_samples[:subset_size]

        # One interpolation coefficient per sample, broadcast over the
        # remaining dimensions.
        alpha = torch.rand(subset_size)
        if self.use_cuda:
            alpha = alpha.cuda()
        alpha = alpha.view((-1,) + ((1,) * (real_samples.dim() - 1)))

        interpolates = alpha * real_samples + ((1 - alpha) * fake_samples)
        if self.use_cuda:
            interpolates = interpolates.cuda()
        interpolates = Variable(interpolates, requires_grad=True)

        d_output = self.critic(interpolates, **critic_kwargs)

        # Seed gradient of ones so grad() returns d(critic)/d(interpolates).
        output = torch.ones(d_output.size())
        if self.use_cuda:
            output = output.cuda()

        gradients = grad(
            outputs=d_output,
            inputs=interpolates,
            grad_outputs=output,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        # Flatten each sample's gradient before taking its 2-norm.
        gradients = gradients.view(gradients.size(0), -1)
        return ((gradients.norm(2, dim=1) - 1) ** 2).mean() * self.weight
Esempio n. 5
0
def compute_elbo_grad(model, guide, variables):
    """Gradient of a single-sample surrogate ELBO w.r.t. ``variables``.

    Combines the pathwise term with a score-function (REINFORCE) term from
    ``guide.score_parts`` so the estimator also covers non-reparameterized
    factors.  Returns the tuple of gradients with ``create_graph=True``.
    """
    sample = guide.rsample()
    model_lp = model.log_prob(sample)
    guide_lp, score_fn, entropy = guide.score_parts(sample)
    log_r = model_lp - guide_lp
    surrogate = model_lp + log_r.detach() * score_fn - entropy
    return grad(surrogate.sum(), variables, create_graph=True)
Esempio n. 6
0
def _grad(potential_fn, z):
    z_keys, z_nodes = zip(*z.items())
    for node in z_nodes:
        node.requires_grad = True
    potential_energy = potential_fn(z)
    grads = grad(potential_energy, z_nodes)
    for node in z_nodes:
        node.requires_grad = False
    return dict(zip(z_keys, grads)), potential_energy
Esempio n. 7
0
 def penalty(self, dis, real_data, fake_data):
     """Gradient penalty on the discriminator at a probe point.

     Differentiates ``sigmoid(logits)`` at a probe drawn from real/fake
     data and penalizes the squared deviation of each sample's gradient
     norm from ``self.target``.  Returns (weighted penalty, mean norm).
     """
     probe = self.get_probe(real_data.detach(), fake_data.detach())
     probe.requires_grad = True
     logit, _ = dis(probe)
     grads = autograd.grad(outputs=F.sigmoid(logit),
                           inputs=probe,
                           grad_outputs=torch.ones_like(logit))[0]
     norms = grads.view(grads.shape[0], -1).norm(2, dim=1)
     gp = ((norms - self.target) ** 2).mean()
     return self.weight * gp, norms.mean()
Esempio n. 8
0
    def __disc_train_func__(self, target, source, disc_optimizer, running_loss, epoch, batch_num):
        """One WGAN-GP critic (discriminator) training step.

        Backpropagates three terms separately (-D(real), D(fake), and the
        gradient penalty) and then steps the optimizer once.  `epoch` and
        `batch_num` are accepted but unused in this body.
        """

        # Re-enable critic gradients (presumably frozen while training the
        # generator — confirm against the generator step).
        for params in self.disc_model.parameters():
            params.requires_grad = True

        disc_optimizer.zero_grad()

        # `target` may arrive as (input, label, ...); keep only the input.
        if isinstance(target, list) or isinstance(target, tuple):
            x = target[0]
        else:
            x = target
        batch_size = x.size(0)

        if self.cuda:
            x = x.cuda()
            source = source.cuda()

        x = Variable(x)
        source = Variable(source)

        # Critic loss on real samples (negated: the critic maximizes D(x)).
        real_loss = -torch.mean(self.disc_model(x))
        real_loss.backward()

        # Detach so the generator receives no gradient from this step.
        generated = self.gen_model(source).detach()

        gen_loss = torch.mean(self.disc_model(generated))
        gen_loss.backward()

        # Per-element mixing coefficients in [0, 1).
        # NOTE(review): uniform_ overwrites the randn draw, so the randn is
        # redundant; values are uniform, not normal.
        eps = torch.randn(x.size()).uniform_(0,1)

        if self.cuda:
            eps = eps.cuda()

        # Random interpolates between real and generated data; must require
        # grad so the critic can be differentiated w.r.t. them.
        x__ = Variable(eps * x.data + (1.0 - eps) * generated.data,requires_grad=True)

        pred__ = self.disc_model(x__)

        grad_outputs = torch.ones(pred__.size())

        if self.cuda:
            grad_outputs = grad_outputs.cuda()

        # d(critic output)/d(interpolates), kept in the graph so the penalty
        # itself is differentiable.
        gradients = grad(outputs=pred__,inputs=x__,grad_outputs=grad_outputs,create_graph=True,retain_graph=True,only_inputs=True)[0]

        # Penalize deviation of each sample's flattened gradient norm from 1.
        gradient_penalty = self.lambda_ * ((gradients.view(gradients.size(0),-1).norm(2,1) - 1) ** 2).mean()

        gradient_penalty.backward()

        # Combined value of the three already-backpropagated terms.
        loss = real_loss + gen_loss + gradient_penalty

        disc_optimizer.step()

        # Accumulate the (batch-weighted) loss in-place on the caller's tensor.
        running_loss.add_(loss.cpu() * batch_size)
Esempio n. 9
0
 def grad_norm(self, d_out, x):
     """Mean squared L2 norm of d(d_out)/dx, one norm per sample in x."""
     seed = torch.ones(d_out.size())
     if self.use_cuda:
         seed = seed.cuda()
     grads = grad(outputs=d_out, inputs=x,
                  grad_outputs=seed,
                  create_graph=True,
                  retain_graph=True,
                  only_inputs=True)[0]
     flat = grads.view(grads.size()[0], -1)
     return (flat.norm(2, 1) ** 2).mean()
Esempio n. 10
0
def fgsm(classifier, x, loss_func, attack_params):
    """One-step FGSM adversarial perturbation of ``x`` under ``loss_func``.

    NOTE(review): assumes ``to_var`` returns a tensor with
    ``requires_grad=True`` so ``grad`` can differentiate w.r.t. it — confirm
    against its definition.
    """
    epsilon = attack_params['eps']
    x_adv = to_var(x.data)

    logits = classifier(x_adv)
    loss = loss_func(logits)
    # Ascend the sign of the input gradient by one epsilon-sized step.
    perturbed = x_adv + epsilon * torch.sign(grad(loss, x_adv, retain_graph=False)[0])
    return to_var(perturbed.data)
Esempio n. 11
0
def test_elbo_rsvi(enumerate1):
    """Gradient check: enumerable Bernoulli plus a shape-augmented Gamma guide."""
    pyro.clear_param_store()
    num_particles = 40000
    prec = 0.01 if enumerate1 else 0.02
    q = pyro.param("q", torch.tensor(0.5, requires_grad=True))
    a = pyro.param("a", torch.tensor(1.5, requires_grad=True))
    kl1 = kl_divergence(dist.Bernoulli(q), dist.Bernoulli(0.25))
    kl2 = kl_divergence(dist.Gamma(a, 1.0), dist.Gamma(0.5, 1.0))

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(0.25).expand_by([num_particles]))
            pyro.sample("y", dist.Gamma(0.50, 1.0).expand_by([num_particles]))

    @config_enumerate(default=enumerate1)
    def guide():
        with pyro.iarange("particles", num_particles):
            pyro.sample("z", dist.Bernoulli(pyro.param("q")).expand_by([num_particles]))
            pyro.sample("y", ShapeAugmentedGamma(pyro.param("a"), torch.tensor(1.0)).expand_by([num_particles]))

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    elbo.loss_and_grads(model, guide)

    actual_q = q.grad / num_particles
    expected_q = grad(kl1, [q])[0]
    msg_q = ("\nexpected q.grad = {}".format(expected_q.detach().cpu().numpy())
             + "\n  actual q.grad = {}".format(actual_q.detach().cpu().numpy()))
    assert_equal(actual_q, expected_q, prec=prec, msg=msg_q)

    actual_a = a.grad / num_particles
    expected_a = grad(kl2, [a])[0]
    msg_a = ("\nexpected a.grad= {}".format(expected_a.detach().cpu().numpy())
             + "\n  actual a.grad = {}".format(actual_a.detach().cpu().numpy()))
    assert_equal(actual_a, expected_a, prec=prec, msg=msg_a)
Esempio n. 12
0
def test_elbo_iarange_iarange(outer_dim, inner_dim, enumerate1, enumerate2, enumerate3, enumerate4):
    """ELBO loss/grad check with two nested iaranges plus a particle iarange."""
    pyro.clear_param_store()
    num_particles = 1 if all([enumerate1, enumerate2, enumerate3, enumerate4]) else 100000
    q = pyro.param("q", torch.tensor(0.75, requires_grad=True))
    p = 0.2693204236205713  # for which kl(Bernoulli(q), Bernoulli(p)) = 0.5

    def model():
        d = dist.Bernoulli(p)
        with pyro.iarange("particles", num_particles):
            outer_ctx = pyro.iarange("outer", outer_dim, dim=-2)
            inner_ctx = pyro.iarange("inner", inner_dim, dim=-3)
            pyro.sample("w", d.expand_by([num_particles]))
            with outer_ctx:
                pyro.sample("x", d.expand_by([outer_dim, num_particles]))
            with inner_ctx:
                pyro.sample("y", d.expand_by([inner_dim, 1, num_particles]))
            with outer_ctx, inner_ctx:
                pyro.sample("z", d.expand_by([inner_dim, outer_dim, num_particles]))

    def guide():
        d = dist.Bernoulli(pyro.param("q"))
        with pyro.iarange("particles", num_particles):
            outer_ctx = pyro.iarange("outer", outer_dim, dim=-2)
            inner_ctx = pyro.iarange("inner", inner_dim, dim=-3)
            pyro.sample("w", d.expand_by([num_particles]), infer={"enumerate": enumerate1})
            with outer_ctx:
                pyro.sample("x", d.expand_by([outer_dim, num_particles]), infer={"enumerate": enumerate2})
            with inner_ctx:
                pyro.sample("y", d.expand_by([inner_dim, 1, num_particles]), infer={"enumerate": enumerate3})
            with outer_ctx, inner_ctx:
                pyro.sample("z", d.expand_by([inner_dim, outer_dim, num_particles]), infer={"enumerate": enumerate4})

    # One KL per site: w (1) + x (outer) + y (inner) + z (outer*inner).
    kl = (1 + outer_dim + inner_dim + outer_dim * inner_dim) * kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p))
    expected_loss = kl.item()
    expected_grad = grad(kl, [q])[0]

    elbo = TraceEnum_ELBO(max_iarange_nesting=3,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide) / num_particles
    actual_grad = pyro.param('q').grad / num_particles

    loss_msg = ("\nexpected loss = {}".format(expected_loss)
                + "\n  actual loss = {}".format(actual_loss))
    assert_equal(actual_loss, expected_loss, prec=0.1, msg=loss_msg)
    grad_msg = ("\nexpected grad = {}".format(expected_grad.detach().cpu().numpy())
                + "\n  actual grad = {}".format(actual_grad.detach().cpu().numpy()))
    assert_equal(actual_grad, expected_grad, prec=0.1, msg=grad_msg)
Esempio n. 13
0
def compute_gradient_penalty(D, real_samples, fake_samples):
    """Calculates the gradient penalty loss for WGAN GP.

    Fix: the per-sample gradient is flattened before taking its 2-norm.
    The previous ``gradients.norm(2, dim=1)`` on a 4-D gradient tensor
    (the alpha shape below assumes NCHW batches) computed channel-wise
    norms rather than the per-sample gradient norm of the WGAN-GP
    objective.
    """
    # Random weight term for interpolation between real and fake samples
    alpha = Tensor(np.random.random((real_samples.size(0), 1, 1, 1)))
    # Get random interpolation between real and fake samples
    interpolates = (alpha * real_samples + ((1 - alpha) * fake_samples)).requires_grad_(True)
    d_interpolates = D(interpolates)
    fake = Variable(Tensor(real_samples.shape[0], 1).fill_(1.0), requires_grad=False)
    # Get gradient w.r.t. interpolates
    gradients = autograd.grad(outputs=d_interpolates, inputs=interpolates,
                              grad_outputs=fake, create_graph=True, retain_graph=True,
                              only_inputs=True)[0]
    # Flatten each sample's gradient, then penalize norm deviations from 1.
    gradients = gradients.view(gradients.size(0), -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty
Esempio n. 14
0
def test_rsample(dist):
    """Check that rsample agrees in shape and gradient shape on CPU and CUDA."""
    if not dist.pyro_dist.has_rsample:
        return
    for idx in range(len(dist.dist_params)):

        def _require_grads(params, keys):
            # Replace each float tensor param with a grad-requiring leaf clone.
            for key in keys:
                leaf = params[key].clone()
                leaf.requires_grad = True
                params[key] = leaf

        # Compute CPU value.
        with tensors_default_to("cpu"):
            params = dist.get_dist_params(idx)
            grad_params = [key for key, val in params.items()
                           if torch.is_tensor(val) and val.dtype in (torch.float32, torch.float64)]
            _require_grads(params, grad_params)
        try:
            with xfail_if_not_implemented():
                cpu_value = dist.pyro_dist(**params).rsample()
                cpu_grads = grad(cpu_value.sum(), [params[key] for key in grad_params])
        except ValueError as e:
            pytest.xfail('CPU version fails: {}'.format(e))
        assert not cpu_value.is_cuda

        # Compute GPU value.
        with tensors_default_to("cuda"):
            params = dist.get_dist_params(idx)
            _require_grads(params, grad_params)
        cuda_value = dist.pyro_dist(**params).rsample()
        assert cuda_value.is_cuda
        assert_equal(cpu_value.size(), cuda_value.size())

        cuda_grads = grad(cuda_value.sum(), [params[key] for key in grad_params])
        for cpu_grad, cuda_grad in zip(cpu_grads, cuda_grads):
            assert_equal(cpu_grad.size(), cuda_grad.size())
	def calc_gradient_penalty(self,real_data,fake_data,original_size):
		"""WGAN-GP gradient penalty at random real/fake interpolates.

		Inputs are flat per-sample tensors; the discriminator is fed the
		interpolates reshaped to ``original_size``.  Scaled by self.LAMBDA.
		"""
		n = real_data.size(0)
		mix = torch.rand(n,1)
		mix = mix.expand(real_data.size())
		mix = mix.to(real_data.device)

		blended = mix * real_data + ((1 - mix) * fake_data)
		blended = autograd.Variable(blended,requires_grad=True)
		blended = blended.to(real_data.device)

		critic_scores = self.discriminator(blended.view(original_size)).view(-1)

		grads = autograd.grad(outputs=critic_scores,inputs=blended,
			grad_outputs=torch.ones(n).to(real_data.device),
			create_graph=True,retain_graph=True,only_inputs=True)[0]
		return ((grads.norm(p=2,dim=1)-1)**2).mean()*self.LAMBDA
Esempio n. 16
0
def test_elbo_irange_irange(outer_dim, inner_dim, enumerate1, enumerate2, enumerate3):
    """ELBO loss/grad check with nested irange loops.

    Fix: the model previously created ``pyro.irange("inner", outer_dim)``
    and iterated ``pyro.irange("outer", inner_dim)`` — the dims were
    swapped relative to the guide, so model and guide sampled mismatched
    site sets whenever ``outer_dim != inner_dim``.  The model now mirrors
    the guide, matching the analytic site count
    ``1 + outer_dim * (1 + inner_dim)`` used for the expected KL.
    """
    pyro.clear_param_store()
    num_particles = 1 if all([enumerate1, enumerate2, enumerate3]) else 50000
    q = pyro.param("q", torch.tensor(0.75, requires_grad=True))
    p = 0.2693204236205713  # for which kl(Bernoulli(q), Bernoulli(p)) = 0.5

    def model():
        with pyro.iarange("particles", num_particles):
            pyro.sample("x", dist.Bernoulli(p).expand_by([num_particles]))
            inner_irange = pyro.irange("inner", inner_dim)
            for i in pyro.irange("outer", outer_dim):
                pyro.sample("y_{}".format(i), dist.Bernoulli(p).expand_by([num_particles]))
                for j in inner_irange:
                    pyro.sample("z_{}_{}".format(i, j), dist.Bernoulli(p).expand_by([num_particles]))

    def guide():
        q = pyro.param("q")
        with pyro.iarange("particles", num_particles):
            pyro.sample("x", dist.Bernoulli(q).expand_by([num_particles]),
                        infer={"enumerate": enumerate1})
            inner_irange = pyro.irange("inner", inner_dim)
            for i in pyro.irange("outer", outer_dim):
                pyro.sample("y_{}".format(i), dist.Bernoulli(q).expand_by([num_particles]),
                            infer={"enumerate": enumerate2})
                for j in inner_irange:
                    pyro.sample("z_{}_{}".format(i, j), dist.Bernoulli(q).expand_by([num_particles]),
                                infer={"enumerate": enumerate3})

    # 1 (x) + outer_dim (y_i) + outer_dim*inner_dim (z_ij) KL terms.
    kl = (1 + outer_dim * (1 + inner_dim)) * kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p))
    expected_loss = kl.item()
    expected_grad = grad(kl, [q])[0]

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide) / num_particles
    actual_grad = pyro.param('q').grad / num_particles

    assert_equal(actual_loss, expected_loss, prec=0.1, msg="".join([
        "\nexpected loss = {}".format(expected_loss),
        "\n  actual loss = {}".format(actual_loss),
    ]))
    assert_equal(actual_grad, expected_grad, prec=0.1, msg="".join([
        "\nexpected grad = {}".format(expected_grad.detach().cpu().numpy()),
        "\n  actual grad = {}".format(actual_grad.detach().cpu().numpy()),
    ]))
Esempio n. 17
0
	def calc_gradient_penalty(self, netD, real_data, fake_data):
		"""WGAN-GP penalty on netD's gradients at real/fake interpolates.

		NOTE(review): a single alpha ~ U(0, 1) is shared by the whole batch
		(``torch.rand(1, 1)`` expanded) — WGAN-GP normally draws one alpha
		per sample; confirm this is intentional.  CUDA is assumed.
		"""
		mix = torch.rand(1, 1)
		mix = mix.expand(real_data.size())
		mix = mix.cuda()

		blended = mix * real_data + ((1 - mix) * fake_data)

		blended = blended.cuda()
		blended = Variable(blended, requires_grad=True)

		critic_out = netD.forward(blended)

		grads = autograd.grad(outputs=critic_out, inputs=blended,
							  grad_outputs=torch.ones(critic_out.size()).cuda(),
							  create_graph=True, retain_graph=True, only_inputs=True)[0]

		gradient_penalty = ((grads.norm(2, dim=1) - 1) ** 2).mean() * self.LAMBDA
		return gradient_penalty
Esempio n. 18
0
def compute_gradient_penalty(D, X):
    """Calculates the gradient penalty loss for DRAGAN.

    Interpolates between X and a noise-perturbed copy of X, then penalizes
    deviation of the critic-gradient norm from 1, scaled by the
    module-level ``lambda_gp``.
    NOTE(review): ``norm(2, dim=1)`` norms over dim 1 only; for 4-D image
    batches the canonical form flattens per sample first — confirm intent.
    """
    # Random per-element interpolation weights.
    alpha = Tensor(np.random.random(size=X.shape))

    noisy = X + 0.5 * X.std() * torch.rand(X.size())
    interpolates = alpha * X + ((1 - alpha) * noisy)
    interpolates = Variable(interpolates, requires_grad=True)

    d_interpolates = D(interpolates)

    ones = Variable(Tensor(X.shape[0], 1).fill_(1.0), requires_grad=False)

    # Gradient of the critic output w.r.t. the interpolates.
    gradients = autograd.grad(outputs=d_interpolates, inputs=interpolates,
                              grad_outputs=ones, create_graph=True, retain_graph=True,
                              only_inputs=True)[0]

    return lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()
Esempio n. 19
0
def test_non_mean_field_bern_bern_elbo_gradient(enumerate1, pi1, pi2):
    """Gradient check for a non-mean-field Bernoulli/Bernoulli model-guide pair."""
    pyro.clear_param_store()
    num_particles = 1 if enumerate1 else 20000

    def model():
        with pyro.iarange("particles", num_particles):
            y = pyro.sample("y", dist.Bernoulli(0.33).expand_by([num_particles]))
            pyro.sample("z", dist.Bernoulli(0.55 * y + 0.10))

    def guide():
        q1 = pyro.param("q1", torch.tensor(pi1, requires_grad=True))
        q2 = pyro.param("q2", torch.tensor(pi2, requires_grad=True))
        with pyro.iarange("particles", num_particles):
            y = pyro.sample("y", dist.Bernoulli(q1).expand_by([num_particles]))
            pyro.sample("z", dist.Bernoulli(q2 * y + 0.10))

    logger.info("Computing gradients using surrogate loss")
    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1]))
    elbo.loss_and_grads(model, config_enumerate(guide, default=enumerate1))
    actual_grad_q1 = pyro.param('q1').grad / num_particles
    actual_grad_q2 = pyro.param('q2').grad / num_particles

    logger.info("Computing analytic gradients")
    q1 = torch.tensor(pi1, requires_grad=True)
    q2 = torch.tensor(pi2, requires_grad=True)
    # Analytic ELBO: KL over y plus the y-mixture of conditional KLs over z.
    elbo = kl_divergence(dist.Bernoulli(q1), dist.Bernoulli(0.33))
    elbo = elbo + q1 * kl_divergence(dist.Bernoulli(q2 + 0.10), dist.Bernoulli(0.65))
    elbo = elbo + (1.0 - q1) * kl_divergence(dist.Bernoulli(0.10), dist.Bernoulli(0.10))
    expected_grad_q1, expected_grad_q2 = grad(elbo, [q1, q2])

    prec = 0.03 if enumerate1 is None else 0.001

    msg1 = ("\nq1 expected = {}".format(expected_grad_q1.data.cpu().numpy())
            + "\nq1  actual = {}".format(actual_grad_q1.data.cpu().numpy()))
    assert_equal(actual_grad_q1, expected_grad_q1, prec=prec, msg=msg1)
    msg2 = ("\nq2 expected = {}".format(expected_grad_q2.data.cpu().numpy())
            + "\nq2   actual = {}".format(actual_grad_q2.data.cpu().numpy()))
    assert_equal(actual_grad_q2, expected_grad_q2, prec=prec, msg=msg2)
Esempio n. 20
0
def calc_gradient_penalty(netD, real_data, fake_data):
    """WGAN-GP gradient penalty scaled by module-level LAMBDA.

    Relies on module globals BATCH_SIZE, use_cuda, gpu and LAMBDA.
    """
    alpha = torch.rand(BATCH_SIZE, 1, 1)
    alpha = alpha.expand(real_data.size())
    if use_cuda:
        alpha = alpha.cuda(gpu)

    interpolates = alpha * real_data + ((1 - alpha) * fake_data)

    if use_cuda:
        interpolates = interpolates.cuda(gpu)
    interpolates = autograd.Variable(interpolates, requires_grad=True)

    disc_interpolates = netD(interpolates)

    ones = torch.ones(disc_interpolates.size())
    if use_cuda:
        ones = ones.cuda(gpu)
    # TODO: Make ConvBackward diffentiable
    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=ones,
                              create_graph=True, retain_graph=True, only_inputs=True)[0]

    return ((gradients.norm(2, dim=1) - 1) ** 2).mean() * LAMBDA
Esempio n. 21
0
def calc_gradient_penalty(D, real_data, fake_data, iwass_lambda, iwass_target):
    # Improved-Wasserstein gradient penalty with a tunable target norm.
    # Caches `mixing_factors` / `grad_outputs` in module globals so CUDA
    # buffers are only reallocated when the batch size changes.
    global mixing_factors, grad_outputs
    if mixing_factors is None or real_data.size(0) != mixing_factors.size(0):
        mixing_factors = torch.cuda.FloatTensor(real_data.size(0), 1)
    mixing_factors.uniform_()  # fresh per-sample mixing weights in [0, 1)

    # Row-wise blend of real and fake data (mul_rowwise defined elsewhere);
    # requires_grad so D can be differentiated w.r.t. the blend.
    mixed_data = Variable(mul_rowwise(real_data, 1 - mixing_factors) + mul_rowwise(fake_data, mixing_factors), requires_grad=True)
    mixed_scores = D(mixed_data)
    if grad_outputs is None or mixed_scores.size(0) != grad_outputs.size(0):
        grad_outputs = torch.cuda.FloatTensor(mixed_scores.size())
        grad_outputs.fill_(1.)

    # d(scores)/d(mixed_data), kept in the graph so the penalty is trainable.
    gradients = grad(outputs=mixed_scores, inputs=mixed_data,
                     grad_outputs=grad_outputs,
                     create_graph=True, retain_graph=True,
                     only_inputs=True)[0]
    gradients = gradients.view(gradients.size(0), -1)

    # ((|g| - target)^2) * lambda / target^2, one value per sample.
    # NOTE(review): no .mean() here — the caller appears responsible for
    # reducing the per-sample penalties; confirm before reuse.
    gradient_penalty = ((gradients.norm(2, dim=1) - iwass_target) ** 2) * iwass_lambda / (iwass_target ** 2)

    return gradient_penalty
Esempio n. 22
0
    def _gradient_penalty(self, real_samples, fake_samples, kwargs):
        """
        Compute the norm of the gradients for each sample in a batch, and
        penalize anything on either side of unit norm
        """
        import torch
        from torch.autograd import Variable, grad

        real_samples = real_samples.view(fake_samples.shape)

        subset_size = real_samples.shape[0]

        real_samples = real_samples[:subset_size]
        fake_samples = fake_samples[:subset_size]

        alpha = torch.rand(subset_size)
        if self.use_cuda:
            alpha = alpha.cuda()
        alpha = alpha.view((-1,) + ((1,) * (real_samples.dim() - 1)))

        interpolates = alpha * real_samples + ((1 - alpha) * fake_samples)
        interpolates = Variable(interpolates, requires_grad=True)
        if self.use_cuda:
            interpolates = interpolates.cuda()

        d_output = self.critic(interpolates, **kwargs)

        grad_ouputs = torch.ones(d_output.size())
        if self.use_cuda:
            grad_ouputs = grad_ouputs.cuda()

        gradients = grad(
            outputs=d_output,
            inputs=interpolates,
            grad_outputs=grad_ouputs,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]
        return ((gradients.norm(2, dim=1) - 1) ** 2).mean() * 10
Esempio n. 23
0
def test_elbo_categoricals(enumerate1, enumerate2, enumerate3, max_iarange_nesting):
    """ELBO loss/grad check for three independent enumerable Categoricals."""
    pyro.clear_param_store()
    p1 = torch.tensor([0.6, 0.4])
    p2 = torch.tensor([0.3, 0.3, 0.4])
    p3 = torch.tensor([0.1, 0.2, 0.3, 0.4])
    q1 = pyro.param("q1", torch.tensor([0.4, 0.6], requires_grad=True))
    q2 = pyro.param("q2", torch.tensor([0.4, 0.3, 0.3], requires_grad=True))
    q3 = pyro.param("q3", torch.tensor([0.4, 0.3, 0.2, 0.1], requires_grad=True))

    def model():
        for name, probs in [("x1", p1), ("x2", p2), ("x3", p3)]:
            pyro.sample(name, dist.Categorical(probs))

    def guide():
        for name, param_name, enum in [("x1", "q1", enumerate1),
                                       ("x2", "q2", enumerate2),
                                       ("x3", "q3", enumerate3)]:
            pyro.sample(name, dist.Categorical(pyro.param(param_name)),
                        infer={"enumerate": enum})

    kl = (kl_divergence(dist.Categorical(q1), dist.Categorical(p1)) +
          kl_divergence(dist.Categorical(q2), dist.Categorical(p2)) +
          kl_divergence(dist.Categorical(q3), dist.Categorical(p3)))
    expected_loss = kl.item()
    expected_grads = grad(kl, [q1, q2, q3])

    elbo = TraceEnum_ELBO(max_iarange_nesting=max_iarange_nesting,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide)
    actual_grads = [q1.grad, q2.grad, q3.grad]

    loss_msg = ("\nexpected loss = {}".format(expected_loss)
                + "\n  actual loss = {}".format(actual_loss))
    assert_equal(actual_loss, expected_loss, prec=0.001, msg=loss_msg)
    for actual_grad, expected_grad in zip(actual_grads, expected_grads):
        grad_msg = ("\nexpected grad = {}".format(expected_grad.detach().cpu().numpy())
                    + "\n  actual grad = {}".format(actual_grad.detach().cpu().numpy()))
        assert_equal(actual_grad, expected_grad, prec=0.001, msg=grad_msg)
Esempio n. 24
0
def test_svi_enum(Elbo, irange_dim, enumerate1, enumerate2):
    """ELBO check splitting num_particles across outer and inner loops."""
    pyro.clear_param_store()
    num_particles = 10
    q = pyro.param("q", torch.tensor(0.75), constraint=constraints.unit_interval)
    p = 0.2693204236205713  # for which kl(Bernoulli(q), Bernoulli(p)) = 0.5

    def model():
        pyro.sample("x", dist.Bernoulli(p))
        for i in pyro.irange("irange", irange_dim):
            pyro.sample("y_{}".format(i), dist.Bernoulli(p))

    def guide():
        q = pyro.param("q")
        pyro.sample("x", dist.Bernoulli(q), infer={"enumerate": enumerate1})
        for i in pyro.irange("irange", irange_dim):
            pyro.sample("y_{}".format(i), dist.Bernoulli(q), infer={"enumerate": enumerate2})

    # One KL per site: x plus irange_dim y-sites.
    kl = (1 + irange_dim) * kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p))
    expected_loss = kl.item()
    expected_grad = grad(kl, [q.unconstrained()])[0]

    inner_particles = 2
    outer_particles = num_particles // inner_particles
    elbo = TraceEnum_ELBO(max_iarange_nesting=0,
                          strict_enumeration_warning=any([enumerate1, enumerate2]),
                          num_particles=inner_particles)
    total = 0.0
    for _ in range(outer_particles):
        total += elbo.loss_and_grads(model, guide)
    actual_loss = total / outer_particles
    actual_grad = q.unconstrained().grad / outer_particles

    loss_msg = ("\nexpected loss = {}".format(expected_loss)
                + "\n  actual loss = {}".format(actual_loss))
    assert_equal(actual_loss, expected_loss, prec=0.3, msg=loss_msg)
    grad_msg = ("\nexpected grad = {}".format(expected_grad.detach().cpu().numpy())
                + "\n  actual grad = {}".format(actual_grad.detach().cpu().numpy()))
    assert_equal(actual_grad, expected_grad, prec=0.5, msg=grad_msg)
Esempio n. 25
0
def test_elbo_berns(enumerate1, enumerate2, enumerate3):
    """ELBO loss/grad check for three independent Bernoullis sharing one q."""
    pyro.clear_param_store()
    fully_enumerated = all([enumerate1, enumerate2, enumerate3])
    num_particles = 1 if fully_enumerated else 10000
    prec = 0.001 if fully_enumerated else 0.1
    q = pyro.param("q", torch.tensor(0.75, requires_grad=True))

    def model():
        with pyro.iarange("particles", num_particles):
            for name, p in [("x1", 0.1), ("x2", 0.2), ("x3", 0.3)]:
                pyro.sample(name, dist.Bernoulli(p).expand_by([num_particles]))

    def guide():
        q = pyro.param("q")
        with pyro.iarange("particles", num_particles):
            for name, enum in [("x1", enumerate1), ("x2", enumerate2), ("x3", enumerate3)]:
                pyro.sample(name, dist.Bernoulli(q).expand_by([num_particles]),
                            infer={"enumerate": enum})

    kl = sum(kl_divergence(dist.Bernoulli(q), dist.Bernoulli(p)) for p in [0.1, 0.2, 0.3])
    expected_loss = kl.item()
    expected_grad = grad(kl, [q])[0]

    elbo = TraceEnum_ELBO(max_iarange_nesting=1,
                          strict_enumeration_warning=any([enumerate1, enumerate2, enumerate3]))
    actual_loss = elbo.loss_and_grads(model, guide) / num_particles
    actual_grad = q.grad / num_particles

    loss_msg = ("\nexpected loss = {}".format(expected_loss)
                + "\n  actual loss = {}".format(actual_loss))
    assert_equal(actual_loss, expected_loss, prec=prec, msg=loss_msg)
    grad_msg = ("\nexpected grads = {}".format(expected_grad.detach().cpu().numpy())
                + "\n  actual grads = {}".format(actual_grad.detach().cpu().numpy()))
    assert_equal(actual_grad, expected_grad, prec=prec, msg=grad_msg)
Esempio n. 26
0
def test_model(cuda,
               data_loader,
               mymodel,
               mymodel_clone,
               val_iter,
               task_lr,
               meta_optimizer,
               zero_shot=False):
    """Evaluate a meta-learned model over ``val_iter`` episodes.

    Per episode: one inner adaptation step on ``mymodel`` whose updated
    fc/mlp fast weights are copied into ``mymodel_clone``, then nine more
    inner steps on the clone, and finally loss/accuracy on the query set.

    Returns ``(accs / val_iter, meta_loss_final / val_iter)``.

    NOTE(review): this function reads the global ``args`` rather than a
    parameter, and the ``task_lr`` and ``zero_shot`` parameters are never
    used (``args.task_lr`` is used instead) — confirm against callers.
    """

    meta_loss_final = 0.0
    accs = 0.0
    mymodel.eval()
    for it in range(val_iter):
        meta_loss = 0.0
        # mymodel.eval()
        class_name, support, support_label, query, query_label = next(
            data_loader)
        if cuda:
            support_label, query_label = support_label.cuda(
            ), query_label.cuda()
        '''First Step'''
        loss_s, right_s, query1, class_name1 = train_one_batch(
            args, class_name, support, support_label, query, query_label,
            mymodel, args.task_lr, it)

        # Manual SGD on the fc/mlp heads only: compute grads, then build
        # "fast weight" dicts holding val - lr * grad for each parameter.
        zero_grad(mymodel.parameters())
        grads_fc = autograd.grad(loss_s,
                                 mymodel.fc.parameters(),
                                 retain_graph=True)
        grads_mlp = autograd.grad(loss_s, mymodel.mlp.parameters())
        fast_weights_fc, orderd_params = mymodel.cloned_fc_dict(), OrderedDict(
        )
        fast_weights_mlp = mymodel.cloned_mlp_dict()
        for (key, val), grad in zip(mymodel.fc.named_parameters(), grads_fc):
            fast_weights_fc[key] = orderd_params[
                'fc.' + key] = val - args.task_lr * grad
        for (key, val), grad in zip(mymodel.mlp.named_parameters(), grads_mlp):
            fast_weights_mlp[key] = orderd_params[
                'mlp.' + key] = val - args.task_lr * grad

        name_list = []
        for name in mymodel_clone.state_dict():
            name_list.append(name)

        # Write the adapted weights into the clone by in-place copy on its
        # state_dict tensors.
        for name in orderd_params:
            if name in name_list:
                mymodel_clone.state_dict()[name].copy_(orderd_params[name])
        '''second-10th step'''
        for _ in range(10 - 1):
            loss_s, right_s, query1, class_name1 = train_one_batch(
                args, class_name, support, support_label, query, query_label,
                mymodel_clone, args.task_lr, it)

            zero_grad(mymodel_clone.parameters())
            grads_fc = autograd.grad(loss_s,
                                     mymodel_clone.fc.parameters(),
                                     retain_graph=True)
            grads_mlp = autograd.grad(loss_s, mymodel_clone.mlp.parameters())
            fast_weights_fc, orderd_params = mymodel_clone.cloned_fc_dict(
            ), OrderedDict()
            fast_weights_mlp = mymodel_clone.cloned_mlp_dict()
            for (key, val), grad in zip(mymodel_clone.fc.named_parameters(),
                                        grads_fc):
                fast_weights_fc[key] = orderd_params[
                    'fc.' + key] = val - args.task_lr * grad
            for (key, val), grad in zip(mymodel_clone.mlp.named_parameters(),
                                        grads_mlp):
                fast_weights_mlp[key] = orderd_params[
                    'mlp.' + key] = val - args.task_lr * grad

            name_list = []
            for name in mymodel_clone.state_dict():
                name_list.append(name)

            for name in orderd_params:
                if name in name_list:
                    mymodel_clone.state_dict()[name].copy_(orderd_params[name])

        # ----- compute loss and acc on the query set -----
        loss_q, right_q = train_q(args, class_name, query, query_label,
                                  mymodel_clone)
        meta_loss = meta_loss + loss_q
        meta_loss_final += loss_q
        accs += right_q

        # NOTE(review): gradients are computed but meta_optimizer.step() is
        # never called here, so no parameters change — presumably intentional
        # for evaluation; confirm the backward() call is actually needed.
        meta_optimizer.zero_grad()
        meta_loss.backward()

        if (it + 1) % 200 == 0:

            print('step: {0:4} | val_loss:{1:3.6f}, val_accuracy: {2:3.2f}%'.
                  format(it + 1, meta_loss_final / (it + 1),
                         100 * accs / (it + 1)))

        # torch.cuda.empty_cache()

    return accs / val_iter, meta_loss_final / val_iter
Esempio n. 27
0
    torch.eye(W.shape[0], device=device),
    torch.eye(W.shape[1], device=device)
] for W in Ws]
step_size = 0.1
grad_norm_clip_thr = 1e8
TrainLoss, TestLoss = [], []
for epoch in range(10):
    t0 = time.time()
    for batch_idx, (data, target) in enumerate(train_loader):
        loss = train_loss(data.to(device), target.to(device))
        TrainLoss.append(loss.item())
        if batch_idx % 100 == 0:
            print('Epoch: {}; batch: {}; train loss: {}'.format(
                epoch, batch_idx, TrainLoss[-1]))

        grads = grad(loss, Ws, create_graph=True)
        if batch_idx % update_preconditioner_every == 0:
            for num_Qs_update in range(update_preconditioner_times):
                v = [torch.randn(W.shape, device=device) for W in Ws]
                Hv = grad(grads, Ws, grad_outputs=v, retain_graph=True)
                with torch.no_grad():
                    Qs = [
                        psgd.update_precond_kron(q[0], q[1], dw, dg)
                        for (q, dw, dg) in zip(Qs, v, Hv)
                    ]

        with torch.no_grad():
            pre_grads = [
                psgd.precond_grad_kron(q[0], q[1], g)
                for (q, g) in zip(Qs, grads)
            ]
 def backward(self):
     """Compute d(self.Y)/d(self.X, self.alpha) with grad_outputs = self.Y.

     NOTE(review): the return value of autograd.grad() is discarded and,
     unlike Tensor.backward(), autograd.grad does not accumulate into the
     inputs' .grad fields — as written this computes gradients and drops
     them; confirm the intent (benchmark/warm-up?) with the author.
     """
     grad(outputs=(self.Y,),
          inputs=(self.X, self.alpha),
          grad_outputs=(self.Y))
Esempio n. 29
0
    def step(self, ob_tot, lp1, lp2):
        """One competitive-gradient-style update of the two players.

        ``lp1``/``lp2`` are the per-player objectives differentiated
        w.r.t. ``self.max_params`` / ``self.min_params``; ``ob_tot.mean()``
        supplies the cross (mixed second-derivative) terms.  A linear
        system is solved with conjugate gradient for one player, the other
        player's update is recovered with a single extra Hessian-vector
        product, and ``self.solve_x`` flips each call so the sides
        alternate.  Parameters are updated in place (ascent on
        ``max_params``, descent on ``min_params``).
        """
        # Per-player gradients, kept in the graph for the HVPs below.
        grad_x = autograd.grad(lp1,
                               self.max_params,
                               create_graph=True,
                               retain_graph=True)  # can remove create graph
        grad_x_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_x])
        grad_y = autograd.grad(lp2,
                               self.min_params,
                               create_graph=True,
                               retain_graph=True)
        grad_y_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_y])
        # Gradient of the shared objective w.r.t. the min player, kept
        # differentiable so mixed second derivatives can be taken.
        tot_grad_y = autograd.grad(ob_tot.mean(),
                                   self.min_params,
                                   create_graph=True,
                                   retain_graph=True)
        tot_grad_y = torch.cat(
            [g.contiguous().view(-1, 1) for g in tot_grad_y])

        # Hessian-vector product of the mixed block with grad_y.
        tot_grad_xy = autograd.grad(tot_grad_y,
                                    self.max_params,
                                    grad_outputs=grad_y_vec,
                                    retain_graph=True)
        hvp_x_vec = torch.cat(
            [g.contiguous().view(-1, 1) for g in tot_grad_xy])  #tot_xy

        tot_grad_x = autograd.grad(ob_tot.mean(),
                                   self.max_params,
                                   create_graph=True,
                                   retain_graph=True)
        tot_grad_x = torch.cat(
            [g.contiguous().view(-1, 1) for g in tot_grad_x])

        tot_grad_yx = autograd.grad(tot_grad_x,
                                    self.min_params,
                                    grad_outputs=grad_x_vec,
                                    retain_graph=True)
        hvp_y_vec = torch.cat(
            [g.contiguous().view(-1, 1) for g in tot_grad_yx])

        # Right-hand sides: p_x = g_x - lr*H_xy g_y, p_y = g_y + lr*H_yx g_x.
        p_x = torch.add(grad_x_vec, -self.lr * hvp_x_vec)
        p_y = torch.add(grad_y_vec, self.lr * hvp_y_vec)

        if self.collect_info:
            self.norm_px = torch.norm(p_x, p=2)
            self.norm_py = torch.norm(p_y, p=2)
            self.timer = time.time()
        if self.solve_x:
            # Solve for the min player; warm-start CG from the previous
            # solution stored in self.old_y.
            cg_y, self.iter_num = conjugate_gradient(
                grad_x=grad_y_vec,
                grad_y=grad_x_vec,
                tot_grad_x=tot_grad_y,
                tot_grad_y=tot_grad_x,
                x_params=self.min_params,
                y_params=self.max_params,
                b=p_y,
                x=self.old_y,
                nsteps=p_y.shape[0],  # // 10000,
                lr=self.lr,
                device=self.device)

            # Recover the max player's update from one extra HVP.
            hcg = autograd.grad(tot_grad_y,
                                self.max_params,
                                grad_outputs=cg_y,
                                retain_graph=False)  # yx
            hcg = torch.cat([g.contiguous().view(-1, 1) for g in hcg])
            cg_x = torch.add(grad_x_vec, -self.lr * hcg)
            self.old_x = cg_x
        else:
            cg_x, self.iter_num = conjugate_gradient(
                grad_x=grad_x_vec,
                grad_y=grad_y_vec,
                tot_grad_x=tot_grad_x,
                tot_grad_y=tot_grad_y,
                x_params=self.max_params,
                y_params=self.min_params,
                b=p_x,
                x=self.old_x,
                nsteps=p_x.shape[0],  # // 10000,
                lr=self.lr,
                device=self.device)
            hcg = autograd.grad(tot_grad_x,
                                self.min_params,
                                grad_outputs=cg_x,
                                retain_graph=False)  # yx
            hcg = torch.cat([g.contiguous().view(-1, 1) for g in hcg])
            cg_y = torch.add(grad_y_vec, self.lr * hcg)
            self.old_y = cg_y

        if self.collect_info:
            self.timer = time.time() - self.timer

        # In-place ascent on max_params (optionally with weight decay),
        # unpacking the flat CG solution back into parameter shapes.
        index = 0
        for p in self.max_params:
            if self.weight_decay != 0:
                p.data.add_(-self.weight_decay * p)
            p.data.add_(self.lr *
                        cg_x[index:index + p.numel()].reshape(p.shape))
            index += p.numel()
        if index != cg_x.numel():
            raise ValueError('CG size mismatch')
        # In-place descent on min_params.
        index = 0
        for p in self.min_params:
            if self.weight_decay != 0:
                p.data.add_(-self.weight_decay * p)
            p.data.add_(-self.lr *
                        cg_y[index:index + p.numel()].reshape(p.shape))
            index += p.numel()
        if index != cg_y.numel():
            raise ValueError('CG size mismatch')

        if self.collect_info:
            self.norm_gx = torch.norm(grad_x_vec, p=2)
            self.norm_gy = torch.norm(grad_y_vec, p=2)
            self.norm_cgx = torch.norm(cg_x, p=2)
            self.norm_cgy = torch.norm(cg_y, p=2)
        self.solve_x = False if self.solve_x else True
Esempio n. 30
0
def D_logistic_r1(real_image, Discriminator, gamma=10.0):
    """R1 gradient penalty on real images: (gamma / 2) * ||d D(x)/dx||^2.

    :param real_image: batch of real images, shape (N, C, H, W).
    :param Discriminator: callable mapping images to per-sample logits.
    :param float gamma: penalty strength (default 10.0).
    :return: per-sample penalty tensor of shape (N,).
    """
    # Make a leaf copy of the reals so we can differentiate D w.r.t. them
    # (replaces the deprecated ``Variable(...)`` wrapper; device/dtype are
    # preserved by detach()).
    reals = real_image.detach().requires_grad_(True)
    real_logit = Discriminator(reals)
    # create_graph=True so the penalty itself is differentiable w.r.t. the
    # discriminator parameters — without it, backpropagating through the
    # returned penalty is impossible and the regularizer has no effect.
    real_grads = grad(torch.sum(real_logit), reals, create_graph=True)[0]
    gradient_pen = torch.sum(torch.mul(real_grads, real_grads), dim=[1, 2, 3])
    return gradient_pen * (gamma * 0.5)
Esempio n. 31
0
    def train(self):
        """Train the GAN, logging D/G losses and the norm of d D(x)/dx on
        real inputs each iteration to ``self.train_hist`` and a CSV file.

        NOTE(review): the discriminator loss is BCE of D evaluated on the
        random interpolate ``alpha*x + (1-alpha)*G(z)`` against the soft
        target ``alpha * y_real`` — a non-standard objective; confirm
        against the paper/method this implements.
        """
        self.train_hist = {}
        self.train_hist['D_loss'] = []
        self.train_hist['G_loss'] = []
        self.train_hist['per_epoch_time'] = []
        self.train_hist['total_time'] = []
        self.train_hist['D_norm'] = []

        f = open("%s/results.txt" % self.log_dir, "w")
        f.write("d_loss,g_loss,d_norm\n")

        if self.gpu_mode:
            self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1).cuda()), Variable(torch.zeros(self.batch_size, 1).cuda())
        else:
            self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1)), Variable(torch.zeros(self.batch_size, 1))

        #for iter, ((x1_,_), (x2_,_)) in enumerate(zip(self.data_loader, self.data_loader)):
        #    import pdb
        #    pdb.set_trace()

        self.D.train()
        print('training start!!')
        start_time = time.time()
        for epoch in range(self.epoch):
            self.G.train()
            epoch_start_time = time.time()
            for iter, (x_, _) in enumerate(self.data_loader):

                # Stop at the last full batch of the epoch.
                if iter == self.data_loader.dataset.__len__() // self.batch_size:
                    break

                z_ = torch.rand((self.batch_size, self.z_dim))

                # x_ needs requires_grad=True so we can measure ||dD/dx||.
                if self.gpu_mode:
                    x_, z_ = Variable(x_.cuda(), requires_grad=True), \
                            Variable(z_.cuda())
                else:
                    x_, z_ = Variable(x_, requires_grad=True), \
                            Variable(z_)

                # update D network

                D_real = self.D(x_)
                # compute gradient penalty
                grad_wrt_x = grad(outputs=D_real, inputs=x_,
                                 grad_outputs=torch.ones(D_real.size()).cuda(),
                                 create_graph=True, retain_graph=True, only_inputs=True)[0]
                # Diagnostic only: mean squared deviation of the gradient
                # norm from 1 (WGAN-GP-style quantity); it is recorded but
                # NOT added to the loss below.
                g_norm  = ((grad_wrt_x.view(grad_wrt_x.size()[0], -1).norm(2, 1) - 1) ** 2).mean()
                self.train_hist['D_norm'].append(g_norm.data.item())

                self.D_optimizer.zero_grad()

                # D is trained on a random real/fake interpolate with a
                # soft label alpha (generator output detached).
                G_ = self.G(z_).detach()
                alpha = float(np.random.random())
                Xz = Variable(alpha*x_.data + (1.-alpha)*G_.data)
                D_Xz = self.D(Xz)
                D_loss = self.BCE_loss(D_Xz, alpha*self.y_real_)

                self.train_hist['D_loss'].append(D_loss.data.item())

                D_loss.backward()
                self.D_optimizer.step()

                # update G network
                self.G_optimizer.zero_grad()

                G_ = self.G(z_)
                D_fake = self.D(G_)
                G_loss = self.BCE_loss(D_fake, self.y_real_)
                self.train_hist['G_loss'].append(G_loss.data.item())

                G_loss.backward()
                self.G_optimizer.step()

                if ((iter + 1) % 100) == 0:
                    print("Epoch: [%2d] [%4d/%4d] D_loss: %.8f, G_loss: %.8f, D_norm: %.8f" %
                          ((epoch + 1),
                           (iter + 1),
                           self.data_loader.dataset.__len__() // self.batch_size,
                           D_loss.data.item(),
                           G_loss.data.item(),
                           g_norm.data.item()))
                    f.write("%.8f,%.8f,%.8f\n" % (D_loss.data.item(), G_loss.data.item(), g_norm.data.item()))
                    f.flush()

            self.train_hist['per_epoch_time'].append(time.time() - epoch_start_time)
            self.visualize_results((epoch+1))

        self.train_hist['total_time'].append(time.time() - start_time)
        print("Avg one epoch time: %.2f, total %d epochs time: %.2f" % (np.mean(self.train_hist['per_epoch_time']),
              self.epoch, self.train_hist['total_time'][0]))
        print("Training finish!... save training results")

        f.close()

        self.save()
        utils.generate_animation(self.result_dir + '/' + self.dataset + '/' + self.model_name + '/' + self.model_name,
                                 self.epoch)
        utils.loss_plot(self.train_hist, os.path.join(self.save_dir, self.dataset, self.model_name), self.model_name)
Esempio n. 32
0
File: gan.py Progetto: rdevon/cortex
 def _get_gradient(inp, output):
     gradient = autograd.grad(outputs=output, inputs=inp,
                              grad_outputs=torch.ones_like(output),
                              create_graph=True, retain_graph=True,
                              only_inputs=True, allow_unused=True)[0]
     return gradient
Esempio n. 33
0
            Tensor(np.random.normal(0, 1, (imgs.shape[0], opt.latent_dim))))

        # Generate a batch of images
        fake_imgs = generator(z)

        # Real images
        real_validity = discriminator(real_imgs)
        # Fake images
        fake_validity = discriminator(fake_imgs)

        # Compute W-div gradient penalty
        real_grad_out = Variable(Tensor(real_imgs.size(0), 1).fill_(1.0),
                                 requires_grad=False)
        real_grad = autograd.grad(real_validity,
                                  real_imgs,
                                  real_grad_out,
                                  create_graph=True,
                                  retain_graph=True,
                                  only_inputs=True)[0]
        real_grad_norm = real_grad.view(real_grad.size(0),
                                        -1).pow(2).sum(1)**(p / 2)

        fake_grad_out = Variable(Tensor(fake_imgs.size(0), 1).fill_(1.0),
                                 requires_grad=False)
        fake_grad = autograd.grad(fake_validity,
                                  fake_imgs,
                                  fake_grad_out,
                                  create_graph=True,
                                  retain_graph=True,
                                  only_inputs=True)[0]
        fake_grad_norm = fake_grad.view(fake_grad.size(0),
                                        -1).pow(2).sum(1)**(p / 2)
Esempio n. 34
0
def HamiltonianSys(p, q, K):
    """Hamilton's equations of motion: return (dp/dt, dq/dt) computed as
    (-dH/dq, dH/dp), with the graph retained for higher-order use."""
    energy = Hamiltonian(p, q, K)
    dHdp, dHdq = grad(energy, (p, q), create_graph=True)
    return -dHdq, dHdp
Esempio n. 35
0
from likelihood import likelihood
from hmc import hmc_sampler
import sys
import numpy
import torch
from torch.autograd import Variable,grad
dim = 3
q = Variable(torch.rand(dim),requires_grad=True)
SigInv = Variable(torch.eye(dim),requires_grad=False)
potentialE =  q.dot(SigInv.mv(q*q))
#print(q.data)
g = grad(potentialE,q,create_graph=True)[0]
#print(g)
gsplit = torch.split(g,1,dim=0)
#print(gsplit)
H = Variable(torch.rand(dim,dim))
for i in range(dim):
    H[i,:] = grad(gsplit[i],q,create_graph=True)[0]
#print(H)
dH = Variable(torch.rand(dim,dim,dim))
print(H)
#exit()
#x = Variable(torch.rand(1),requires_grad=True)
#y = 0.5 *x
#o = grad(y,x,create_graph=True)
#print(o)
#oo = grad(Variable(torch.rand(1),requires_grad=True),x)
#o = grad(H[0,0],q)
#print(o)
for i in range(dim):
    for j in range(dim):
# .. math::
#
#    \langle \text{d} c . \delta y , e \rangle  =  \langle g , \delta y \rangle  =  \langle \delta y , \partial c . e \rangle
#
# Backpropagation is all about computing the tensor :math:`g=\partial c . e` efficiently, for arbitrary values of :math:`e`:

# Declare a new tensor of shape (M,3) used as the input of the gradient operator.
# It can be understood as a "gradient with respect to the output c"
# and is thus called "grad_output" in the documentation of PyTorch.
e = torch.rand_like(c)

# Call the gradient op:
start = time.time()

# PyTorch remark : grad(c, y, e) alone outputs a length 1 tuple, hence the need for [0].
g = grad(c, y, e)[0]  # g = [∂_y c].e

print('Time to compute gradient of convolution operation with KeOps: ',
      round(time.time() - start, 5), 's')

####################################################################
# The equivalent code with a "vanilla" pytorch implementation

g_torch = ((p - a.transpose(0, 1))[:, None] **2 * torch.exp(x.transpose(0, 1)[:, :, None] \
        + y.transpose(0, 1)[:, None, :]) * e.transpose(0, 1)[:, :, None] ).sum(dim=1).transpose(0, 1)

# Plot the results next to each other:
for i in range(3):
    plt.subplot(3, 1, i + 1)
    plt.plot(g.detach().cpu().numpy()[:40, i], '-', label='KeOps')
    plt.plot(g_torch.detach().cpu().numpy()[:40, i], '--', label='PyTorch')
Esempio n. 37
0
# encoding=GBK


"""
Compute first-order derivatives of y = a**2 * x + b * x + c with
torch.autograd.grad.  Unlike Tensor.backward(), grad() returns the
derivatives directly instead of accumulating them into .grad.
"""


import torch
from torch import autograd

###########################################
# Automatic differentiation demo.
# a, b, c start at 1, 2, 3; x is fixed at 1.
###########################################
x = torch.tensor(1)
a, b, c = (torch.tensor(v, requires_grad=True) for v in (1.0, 2.0, 3.0))

y = a**2 * x + b * x + c

# .grad stays None before and after: grad() does not write into it.
print('before: ', a.grad, b.grad, c.grad)
grades = autograd.grad(y, [a, b, c])
print('after: ', *grades)
Esempio n. 38
0
def train_model(mymodel,
                mymodel_clone,
                args,
                sample_class_weights,
                val_step=200):
    """Meta-train ``mymodel`` with a MAML-style inner/outer loop.

    Per outer iteration and per task in the meta-batch: run one inner
    adaptation step on ``mymodel`` (fc/mlp heads only), copy the adapted
    fast weights into ``mymodel_clone``, run four more inner steps on the
    clone, then accumulate query loss/accuracy.  The averaged query loss
    drives the outer (meta) optimizer.  Validates every ``val_step``
    iterations, checkpointing on new best validation accuracy, and stops
    early after 20 validations without improvement.

    NOTE(review): inner-loop code also reads helpers defined elsewhere
    (``train_one_batch``, ``train_q``, ``zero_grad``, ``get_dataloader``,
    ``test_model``) — behavior depends on their contracts.
    """

    n_way_k_shot = str(args.N) + '-way-' + str(args.K) + '-shot'
    print('Start training ' + n_way_k_shot)

    cuda = torch.cuda.is_available()
    if cuda:
        mymodel = mymodel.cuda()
        mymodel_clone = mymodel_clone.cuda()

    # Episode samplers for each split.
    data_loader = {}
    data_loader['train'] = get_dataloader(
        args,
        args.train,
        args.class_name_file,
        args.N,
        args.K,
        args.L,
        args.noise_rate,
        sample_class_weights=sample_class_weights)
    data_loader['val'] = get_dataloader(
        args,
        args.val,
        args.class_name_file,
        args.N,
        args.K,
        args.L,
        args.noise_rate,
        sample_class_weights=sample_class_weights,
        train=False)
    data_loader['test'] = get_dataloader(
        args,
        args.test,
        args.class_name_file,
        args.N,
        args.K,
        args.L,
        args.noise_rate,
        sample_class_weights=sample_class_weights,
        train=False)

    # Per-group learning rates: small for the encoder, meta_lr for the heads.
    optim_params = [{'params': mymodel.coder.parameters(), 'lr': 5e-5}]
    optim_params.append({
        'params': mymodel.fc.parameters(),
        'lr': args.meta_lr
    })
    optim_params.append({
        'params': mymodel.mlp.parameters(),
        'lr': args.meta_lr
    })
    meta_optimizer = AdamW(optim_params, lr=1)

    # mymodel1_meta_opt = AdamW(mymodel.parameters(), lr=args.meta_lr)
    # mymodel2_task_opt = AdamW(mymodel.parameters(), lr=args.task_lr)

    best_acc, best_step, best_test_acc, best_test_step, best_val_loss, best_changed = 0.0, 0, 0.0, 0, 100.0, False
    iter_loss, iter_right, iter_sample = 0.0, 0.0, 0.0
    count = 0
    count_test = 0

    for it in range(args.Train_iter):
        mymodel.train()
        meta_loss, meta_right = 0.0, 0.0
        # meta_loss = []
        # meta_right = 0.0
        # torch.save(mymodel2.state_dict(), 'model_checkpoint/checkpoint.{}th.tar'.format(it))
        for batch in range(args.B):
            class_name, support, support_label, query, query_label = next(
                data_loader['train'])
            # [N, length], tokens:{[N*K,length]}, [1,N*K], tokens:{[N*L,length]}, [1,N*L]
            if cuda:
                support_label, query_label = support_label.cuda(
                ), query_label.cuda()
            '''First Step'''
            loss_s, right_s, query1, class_name1 = train_one_batch(
                args, class_name, support, support_label, query, query_label,
                mymodel, args.task_lr, it)

            # Manual SGD step on the fc/mlp heads: build fast-weight dicts
            # val - task_lr * grad, keyed with 'fc.'/'mlp.' prefixes so they
            # line up with state_dict names on the clone.
            zero_grad(mymodel.parameters())
            grads_fc = autograd.grad(loss_s,
                                     mymodel.fc.parameters(),
                                     retain_graph=True)
            grads_mlp = autograd.grad(loss_s, mymodel.mlp.parameters())
            fast_weights_fc, orderd_params = mymodel.cloned_fc_dict(
            ), OrderedDict()
            fast_weights_mlp = mymodel.cloned_mlp_dict()
            for (key, val), grad in zip(mymodel.fc.named_parameters(),
                                        grads_fc):
                fast_weights_fc[key] = orderd_params[
                    'fc.' + key] = val - args.task_lr * grad
            for (key, val), grad in zip(mymodel.mlp.named_parameters(),
                                        grads_mlp):
                fast_weights_mlp[key] = orderd_params[
                    'mlp.' + key] = val - args.task_lr * grad

            name_list = []
            for name in mymodel_clone.state_dict():
                name_list.append(name)

            # Copy the adapted weights into the clone in place.
            for name in orderd_params:
                if name in name_list:
                    mymodel_clone.state_dict()[name].copy_(orderd_params[name])

            for _ in range(5 - 1):
                '''2-5th Step'''
                loss_s, right_s, query1, class_name1 = train_one_batch(
                    args, class_name, support, support_label, query,
                    query_label, mymodel_clone, args.task_lr, it)

                zero_grad(mymodel_clone.parameters())
                grads_fc = autograd.grad(loss_s,
                                         mymodel_clone.fc.parameters(),
                                         retain_graph=True)
                grads_mlp = autograd.grad(loss_s,
                                          mymodel_clone.mlp.parameters())
                fast_weights_fc, orderd_params = mymodel_clone.cloned_fc_dict(
                ), OrderedDict()
                fast_weights_mlp = mymodel_clone.cloned_mlp_dict()
                for (key,
                     val), grad in zip(mymodel_clone.fc.named_parameters(),
                                       grads_fc):
                    fast_weights_fc[key] = orderd_params[
                        'fc.' + key] = val - args.task_lr * grad
                for (key,
                     val), grad in zip(mymodel_clone.mlp.named_parameters(),
                                       grads_mlp):
                    fast_weights_mlp[key] = orderd_params[
                        'mlp.' + key] = val - args.task_lr * grad

                name_list = []
                for name in mymodel_clone.state_dict():
                    name_list.append(name)

                for name in orderd_params:
                    if name in name_list:
                        mymodel_clone.state_dict()[name].copy_(
                            orderd_params[name])

            # ----- compute loss and acc on the query set -----
            loss_q, right_q = train_q(args, class_name, query, query_label,
                                      mymodel_clone)
            meta_loss = meta_loss + loss_q
            meta_right = meta_right + right_q

        meta_loss_avg = meta_loss / args.B
        meta_right_avg = meta_right / args.B

        # mymodel2.load_state_dict(torch.load('model_checkpoint/checkpoint.{}th.tar'.format(it)))
        # deep_copy(mymodel1, mymodel2)

        # Outer (meta) update on the averaged query loss.
        meta_optimizer.zero_grad()
        meta_loss_avg.backward()
        meta_optimizer.step()

        iter_loss += meta_loss_avg
        iter_right += meta_right_avg

        if (it + 1) % val_step == 0:
            print('step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%'.format(
                it + 1, iter_loss / val_step, 100 * iter_right / val_step))
            iter_loss, iter_right, iter_sample = 0.0, 0.0, 0.0

            count += 1
            val_acc, val_loss = test_model(cuda, data_loader['val'], mymodel,
                                           mymodel_clone, args.Val_iter,
                                           args.task_lr, meta_optimizer)
            # print('[EVAL] | loss: {0:2.6f}, accuracy: {1:2.2f}%'.format(val_loss, val_acc * 100))
            print('[EVAL] | accuracy: {0:2.2f}%'.format(val_acc * 100))
            if val_acc >= best_acc:
                print('Best checkpoint!')
                # Reset the early-stopping counter on improvement.
                count = 0
                count_test += 1
                torch.save(
                    mymodel.state_dict(),
                    'model_checkpoint/checkpoint.{0}th_best_model{1}_way_{2}_shot_Lis25_isNPM_isSW.tar'
                    .format(it + 1, args.N, args.K))
                best_acc, best_step, best_val_loss, best_changed = val_acc, (
                    it + 1), val_loss, True
                if count_test % 5 == 0:
                    test_acc, test_loss = test_model(cuda, data_loader['test'],
                                                     mymodel, mymodel_clone,
                                                     args.Val_iter,
                                                     args.task_lr,
                                                     meta_optimizer)
                    print(
                        '[TEST] | loss: {0:2.6f}, accuracy: {1:2.2f}%'.format(
                            test_loss, test_acc * 100))

        # torch.cuda.empty_cache()
        # Early stop after 20 validations without a new best.
        if count > 20:
            break

    print("\n####################\n")
    print('Finish training model! Best val acc: ' + str(best_acc) +
          ' at step ' + str(best_step))
Esempio n. 39
0
def newton_step_2d(loss, x, trust_radius=None):
    """Take one (optionally trust-region-regularized) Newton step to
    minimize ``loss`` over a batch of 2-dimensional variables.

    ``loss`` must be twice differentiable in ``x``; if it is ``2+d``-times
    differentiable the result is ``d``-times differentiable.  When ``loss``
    is a negative log density, the returned pair can seed a Laplace
    approximation ``MultivariateNormal(mode, cov)``.

    .. warning:: Detach the result between iterations of an optimization
        loop — otherwise backprop flows through the whole iteration history
        and two extra derivatives are computed per step.

    Example use inside a loop::

        x = torch.zeros(1000, 2)  # arbitrary initial value
        for step in range(100):
            x = x.detach()          # block gradients through previous steps
            x.requires_grad = True  # ensure loss is differentiable wrt x
            loss = my_loss_function(x)
            x = newton_step_2d(loss, x, trust_radius=1.0)
        # the final x is still differentiable

    :param torch.Tensor loss: scalar function of ``x`` to minimize.
    :param torch.Tensor x: dependent variable with rightmost size 2.
    :param float trust_radius: optional trust-region radius; the returned
        ``mode`` stays within this distance of ``x``.
    :return: pair ``(mode, cov)`` — an updated tensor of ``x``'s shape and
        a covariance estimate of shape ``x.shape[:-1] + (2, 2)``.
    :rtype: tuple
    """
    if loss.shape != ():
        raise ValueError('Expected loss to be a scalar, actual shape {}'.format(loss.shape))
    if x.dim() < 1 or x.shape[-1] != 2:
        raise ValueError('Expected x to have rightmost size 2, actual shape {}'.format(x.shape))

    # First derivative and batched 2x2 Hessian, both kept differentiable.
    grad_x = grad(loss, [x], create_graph=True)[0]
    hess = torch.stack([grad(grad_x[..., 0].sum(), [x], create_graph=True)[0],
                        grad(grad_x[..., 1].sum(), [x], create_graph=True)[0]], -1)
    assert grad_x.shape[-1:] == (2,)
    assert hess.shape[-2:] == (2, 2)
    _warn_if_nan(grad_x, 'g')
    _warn_if_nan(hess, 'H')

    if trust_radius is not None:
        # Damp the Hessian so the resulting step cannot leave the trust
        # ball: raise the smallest eigenvalue until |H^-1 g| <= radius.
        det = hess[..., 0, 0] * hess[..., 1, 1] - hess[..., 0, 1] * hess[..., 1, 0]
        half_trace = (hess[..., 0, 0] + hess[..., 1, 1]) / 2
        min_eig = half_trace - (half_trace ** 2 - det).sqrt()
        damping = (grad_x.pow(2).sum(-1).sqrt() / trust_radius - min_eig).clamp_(min=1e-8)
        _warn_if_nan(damping, 'regularizer')
        hess = hess + damping.unsqueeze(-1).unsqueeze(-1) * hess.new([[1.0, 0.0], [0.0, 1.0]])

    # Invert each 2x2 Hessian via the adjugate / determinant formula.
    det = hess[..., 0, 0] * hess[..., 1, 1] - hess[..., 0, 1] * hess[..., 1, 0]
    hess_inv = hess.new(hess.shape)
    hess_inv[..., 0, 0] = hess[..., 1, 1]
    hess_inv[..., 0, 1] = -hess[..., 0, 1]
    hess_inv[..., 1, 0] = -hess[..., 1, 0]
    hess_inv[..., 1, 1] = hess[..., 0, 0]
    hess_inv = hess_inv / det.unsqueeze(-1).unsqueeze(-1)
    _warn_if_nan(hess_inv, 'Hinv')

    # Newton update: mode = x - H^{-1} g.
    mode = x.detach() - (hess_inv * grad_x.unsqueeze(-2)).sum(-1)
    assert mode.shape == x.shape
    return mode, hess_inv
    def train(self):
        """Train a WGAN-GP: ``n_critic`` discriminator updates per
        generator update, with a gradient penalty on random real/fake
        interpolates.  Losses and timings are recorded in
        ``self.train_hist``; results are visualized and saved at the end.

        Fix: scalar losses are read with ``.item()`` instead of the
        pre-0.4 ``.data[0]``, which raises IndexError on 0-dim tensors in
        any modern PyTorch.
        """
        self.train_hist = {}
        self.train_hist['D_loss'] = []
        self.train_hist['G_loss'] = []
        self.train_hist['per_epoch_time'] = []
        self.train_hist['total_time'] = []

        if self.gpu_mode:
            self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1).cuda()), Variable(torch.zeros(self.batch_size, 1).cuda())
        else:
            self.y_real_, self.y_fake_ = Variable(torch.ones(self.batch_size, 1)), Variable(torch.zeros(self.batch_size, 1))

        self.D.train()
        print('training start!!')
        start_time = time.time()
        for epoch in range(self.epoch):
            self.G.train()
            epoch_start_time = time.time()
            for iter, (x_, _) in enumerate(self.data_loader):
                # Stop at the last full batch of the epoch.
                if iter == self.data_loader.dataset.__len__() // self.batch_size:
                    break

                z_ = torch.rand((self.batch_size, self.z_dim))

                if self.gpu_mode:
                    x_, z_ = Variable(x_.cuda()), Variable(z_.cuda())
                else:
                    x_, z_ = Variable(x_), Variable(z_)

                # update D network
                self.D_optimizer.zero_grad()

                # Wasserstein critic loss: maximize D(real) - D(fake).
                D_real = self.D(x_)
                D_real_loss = -torch.mean(D_real)

                G_ = self.G(z_)
                D_fake = self.D(G_)
                D_fake_loss = torch.mean(D_fake)

                # gradient penalty on random interpolates between real and fake
                if self.gpu_mode:
                    alpha = torch.rand(x_.size()).cuda()
                else:
                    alpha = torch.rand(x_.size())

                x_hat = Variable(alpha * x_.data + (1 - alpha) * G_.data, requires_grad=True)

                pred_hat = self.D(x_hat)
                # create_graph=True so the penalty is differentiable w.r.t. D.
                if self.gpu_mode:
                    gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()).cuda(),
                                 create_graph=True, retain_graph=True, only_inputs=True)[0]
                else:
                    gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()),
                                     create_graph=True, retain_graph=True, only_inputs=True)[0]

                gradient_penalty = self.lambda_ * ((gradients.view(gradients.size()[0], -1).norm(2, 1) - 1) ** 2).mean()

                D_loss = D_real_loss + D_fake_loss + gradient_penalty

                D_loss.backward()
                self.D_optimizer.step()

                if ((iter+1) % self.n_critic) == 0:
                    # update G network every n_critic critic steps
                    self.G_optimizer.zero_grad()

                    G_ = self.G(z_)
                    D_fake = self.D(G_)
                    G_loss = -torch.mean(D_fake)
                    self.train_hist['G_loss'].append(G_loss.item())

                    G_loss.backward()
                    self.G_optimizer.step()

                    self.train_hist['D_loss'].append(D_loss.item())

                # NOTE(review): if n_critic > 100, G_loss may be unbound at
                # the first logging point below — pre-existing behavior.
                if ((iter + 1) % 100) == 0:
                    print("Epoch: [%2d] [%4d/%4d] D_loss: %.8f, G_loss: %.8f" %
                          ((epoch + 1), (iter + 1), self.data_loader.dataset.__len__() // self.batch_size, D_loss.item(), G_loss.item()))

            self.train_hist['per_epoch_time'].append(time.time() - epoch_start_time)
            self.visualize_results((epoch+1))

        self.train_hist['total_time'].append(time.time() - start_time)
        print("Avg one epoch time: %.2f, total %d epochs time: %.2f" % (np.mean(self.train_hist['per_epoch_time']),
              self.epoch, self.train_hist['total_time'][0]))
        print("Training finish!... save training results")

        self.save()
        utils.generate_animation(self.result_dir + '/' + self.dataset + '/' + self.model_name + '/' + self.model_name,
                                 self.epoch)
        utils.loss_plot(self.train_hist, os.path.join(self.save_dir, self.dataset, self.model_name), self.model_name)
Esempio n. 41
0
 def backwardPass(self, func, create_graph):
     """Return d(func)/d(parameters) as a tuple of gradients, optionally
     keeping the graph for higher-order derivatives."""
     return autograd.grad(func, self.parameters(), create_graph=create_graph)
Esempio n. 42
0
def getdV(q, V):
    """Return dV/dq, the gradient of the potential ``V`` at configuration ``q``.

    ``create_graph=True`` keeps the backward graph alive so the result can be
    differentiated again (e.g. for Hessian-vector products).
    """
    return grad(V(q), q, create_graph=True)[0]
Esempio n. 43
0
    def step(self, ob, ob_tot, lp1, lp2):
        """Perform one competitive (ACGD-style) optimization step.

        Args:
            ob: appears unused in this method — presumably kept for API
                symmetry with related optimizers; confirm with callers.
            ob_tot: joint objective tensor; its mean is differentiated to
                build the mixed second-order (Hessian-vector) terms.
            lp1: loss differentiated w.r.t. ``self.max_params``.
            lp2: loss differentiated w.r.t. ``self.min_params``.

        Side effects: updates ``self.max_params`` and ``self.min_params``
        in place, maintains Adam-style second-moment state
        (``square_avgx``/``square_avgy``), optionally records diagnostics
        when ``self.collect_info`` is set, and toggles ``self.solve_x`` so
        the conjugate-gradient solve alternates between players each call.
        """
        self.count += 1
        # First-order gradients for each player, flattened to column vectors.
        grad_x = autograd.grad(lp1, self.max_params, retain_graph=True)
        grad_x_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_x])
        grad_y = autograd.grad(lp2, self.min_params, retain_graph=True)
        grad_y_vec = torch.cat([g.contiguous().view(-1, 1) for g in grad_y])

        # Lazily allocate the second-moment accumulators on the first step.
        if self.square_avgx is None and self.square_avgy is None:
            self.square_avgx = torch.zeros(grad_x_vec.size(),
                                           requires_grad=False,
                                           device=self.device)
            self.square_avgy = torch.zeros(grad_y_vec.size(),
                                           requires_grad=False,
                                           device=self.device)
        # EMA of squared gradients (Adam-like second moment).
        # NOTE(review): the positional-``value`` form of ``addcmul_`` is
        # deprecated in newer PyTorch — confirm the targeted torch version.
        self.square_avgx.mul_(self.beta2).addcmul_(1 - self.beta2,
                                                   grad_x_vec.data,
                                                   grad_x_vec.data)
        self.square_avgy.mul_(self.beta2).addcmul_(1 - self.beta2,
                                                   grad_y_vec.data,
                                                   grad_y_vec.data)

        # Initialization bias correction
        bias_correction2 = 1 - self.beta2**self.count
        self.v_x = self.square_avgx / bias_correction2
        self.v_y = self.square_avgy / bias_correction2

        # Per-coordinate adaptive step sizes (bias-corrected RMS denominators).
        lr_x = math.sqrt(
            bias_correction2) * self.lr / self.square_avgx.sqrt().add(self.eps)
        lr_y = math.sqrt(
            bias_correction2) * self.lr / self.square_avgy.sqrt().add(self.eps)

        scaled_grad_x = torch.mul(lr_x, grad_x_vec).detach()  # lr_x * grad_x
        scaled_grad_y = torch.mul(lr_y, grad_y_vec).detach()  # lr_y * grad_y

        # Mixed second-order terms: Hessian-vector products of the joint
        # objective, obtained by differentiating the first-order gradients.
        tot_grad_y = autograd.grad(ob_tot.mean(),
                                   self.min_params,
                                   create_graph=True,
                                   retain_graph=True)
        tot_grad_y = torch.cat(
            [g.contiguous().view(-1, 1) for g in tot_grad_y])

        tot_grad_xy = autograd.grad(tot_grad_y,
                                    self.max_params,
                                    grad_outputs=scaled_grad_y,
                                    retain_graph=True)
        hvp_x_vec = torch.cat([
            g.contiguous().view(-1, 1) for g in tot_grad_xy
        ])  # D_xy * lr_y * grad_y

        tot_grad_x = autograd.grad(ob_tot.mean(),
                                   self.max_params,
                                   create_graph=True,
                                   retain_graph=True)
        tot_grad_x = torch.cat(
            [g.contiguous().view(-1, 1) for g in tot_grad_x])

        tot_grad_yx = autograd.grad(tot_grad_x,
                                    self.min_params,
                                    grad_outputs=scaled_grad_x,
                                    retain_graph=True)
        hvp_y_vec = torch.cat([
            g.contiguous().view(-1, 1) for g in tot_grad_yx
        ])  # D_yx * lr_x * grad_x)

        # Right-hand sides of the two linear systems solved below.
        p_x = torch.add(grad_x_vec,
                        -hvp_x_vec).detach_()  # grad_x - D_xy * lr_y * grad_y
        p_y = torch.add(grad_y_vec,
                        hvp_y_vec).detach_()  # grad_y + D_yx * lr_x * grad_x

        if self.collect_info:
            self.norm_px = torch.norm(p_x, p=2)
            self.norm_py = torch.norm(p_y, p=2)
            self.timer = time.time()

        # Alternate which player's update is obtained via conjugate gradient;
        # the other player's update is then recovered from it in closed form.
        if self.solve_x:
            p_y.mul_(lr_y.sqrt())
            cg_y, self.iter_num = general_conjugate_gradient(
                grad_x=grad_y_vec,
                grad_y=grad_x_vec,
                tot_grad_x=tot_grad_y,
                tot_grad_y=tot_grad_x,
                x_params=self.min_params,
                y_params=self.max_params,
                b=p_y,
                x=self.old_y,
                nsteps=p_y.shape[0],  # // 10000,
                lr_x=lr_y,
                lr_y=lr_x,
                device=self.device)
            #hcg = Hvp_vec(grad_y_vec, self.max_params, cg_y)
            cg_y.detach_().mul_(-lr_y.sqrt())
            hcg = autograd.grad(tot_grad_y,
                                self.max_params,
                                grad_outputs=cg_y,
                                retain_graph=False)  # yx
            hcg = torch.cat([g.contiguous().view(-1, 1)
                             for g in hcg]).add_(grad_x_vec).detach_()
            # grad_x + D_xy * delta y
            cg_x = hcg.mul(lr_x)  # this is basically deltax
            # torch.add(grad_x_vec, - self.lr * hcg)
            self.old_x = hcg.mul(lr_x.sqrt())
        else:
            p_x.mul_(lr_x.sqrt())
            cg_x, self.iter_num = general_conjugate_gradient(
                grad_x=grad_x_vec,
                grad_y=grad_y_vec,
                tot_grad_x=tot_grad_x,
                tot_grad_y=tot_grad_y,
                x_params=self.max_params,
                y_params=self.min_params,
                b=p_x,
                x=self.old_x,
                nsteps=p_x.shape[0],  # // 10000,
                lr_x=lr_x,
                lr_y=lr_y,
                device=self.device)
            # cg_x.detach_().mul_(p_x_norm)
            cg_x.detach_().mul_(lr_x.sqrt())  # delta x = lr_x.sqrt() * cg_x
            hcg = autograd.grad(tot_grad_x,
                                self.min_params,
                                grad_outputs=cg_x,
                                retain_graph=False)  # yx
            hcg = torch.cat([g.contiguous().view(-1, 1)
                             for g in hcg]).add_(grad_y_vec).detach_()
            # grad_y + D_yx * delta x
            cg_y = hcg.mul(-lr_y)
            # cg_y = torch.add(grad_y_vec, self.lr * hcg)
            self.old_y = hcg.mul(lr_y.sqrt())

        if self.collect_info:
            self.timer = time.time() - self.timer

        # Scatter the flat update vectors back into the parameter tensors.
        index = 0
        for p in self.max_params:
            if self.weight_decay != 0:
                p.data.add_(-self.weight_decay * p)
            p.data.add_(cg_x[index:index + p.numel()].reshape(p.shape))
            index += p.numel()
        if index != cg_x.numel():
            raise ValueError('CG size mismatch')
        index = 0
        for p in self.min_params:
            if self.weight_decay != 0:
                p.data.add_(-self.weight_decay * p)
            p.data.add_(cg_y[index:index + p.numel()].reshape(p.shape))
            index += p.numel()
        if index != cg_y.numel():
            raise ValueError('CG size mismatch')

        # Optional diagnostics for logging/inspection.
        if self.collect_info:
            self.norm_gx = torch.norm(grad_x_vec, p=2)
            self.norm_gy = torch.norm(grad_y_vec, p=2)
            self.norm_cgx = torch.norm(cg_x, p=2)
            self.norm_cgy = torch.norm(cg_y, p=2)
            self.norm_cgx_cal = torch.norm(self.square_avgx, p=2)
            self.norm_cgy_cal = torch.norm(self.square_avgy, p=2)
            self.norm_vx = torch.norm(self.v_x, p=2)
            self.norm_vy = torch.norm(self.v_y, p=2)
            self.norm_mx = lr_x.max()
            self.norm_my = lr_y.max()
        self.solve_x = False if self.solve_x else True
Esempio n. 44
0
 def lamfun(qdot_):
     """Batch-summed Jacobian of the kinetic energy w.r.t. ``qdot_``.

     Uses ``q`` and ``self`` from the enclosing scope; keeps the graph so
     the result remains differentiable.
     """
     kinetic = self.kinetic_energy(q, qdot_)  # (*, 1)
     jac = grad(kinetic.sum(), [qdot_], create_graph=True)[0]  # (*, qdim)
     return jac.sum(0)
Esempio n. 45
0
# Training loop: supervise position (xy), velocity (vxy) and acceleration
# (axy), where velocity/acceleration are obtained by differentiating the
# network output w.r.t. the time input t (physics-informed supervision).
for index, batch in enumerate(dataloader):
    total_step += 1

    batch['img'] = batch['img'].to(device)
    batch['t'] = batch['t'].to(device)
    batch['v_0'] = batch['v_0'].to(device)
    batch['xy'] = batch['xy'].to(device)
    batch['vxy'] = batch['vxy'].to(device)
    batch['axy'] = batch['axy'].to(device)
    # Inputs must require grad so d(output)/dt can be taken below.
    batch['img'].requires_grad = True
    batch['t'].requires_grad = True

    output = model(batch['img'], batch['t'], batch['v_0'])

    # d(output)/dt rescaled by max_dist/max_t — presumably converts the
    # normalized prediction into physical velocity units; confirm with the
    # dataset normalization.
    vx = grad(output[:, 0].sum(), batch['t'],
              create_graph=True)[0] * (opt.max_dist / opt.max_t)
    vy = grad(output[:, 1].sum(), batch['t'],
              create_graph=True)[0] * (opt.max_dist / opt.max_t)
    output_vxy = torch.cat([vx, vy], dim=1)
    real_v = torch.norm(batch['vxy'], dim=1)

    # Second derivative w.r.t. t gives acceleration.
    ax = grad(vx.sum(), batch['t'], create_graph=True)[0]
    ay = grad(vy.sum(), batch['t'], create_graph=True)[0]
    output_axy = (1. / opt.max_t) * torch.cat([ax, ay], dim=1)

    optimizer.zero_grad()
    loss_xy = criterion(output, batch['xy'])
    loss_vxy = criterion(output_vxy, batch['vxy'])
    loss_axy = criterion(output_axy, batch['axy'])

    # Weighted multi-task loss; gamma/gamma2 trade off the derivative terms.
    loss = loss_xy + opt.gamma * loss_vxy + opt.gamma2 * loss_axy
Esempio n. 46
0
    torch.eye(W.shape[1], device=device)
] for W in Ws]
step_size = 0.01
num_epochs = 64
grad_norm_clip_thr = 0.1 * sum(W.shape[0] * W.shape[1] for W in Ws)**0.5
TrainLoss, TestLossApprox, TestLossExact = [], [], []
for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        #new_size = random.randint(28, 36)#random height rescaling
        #data = data[:,:,(torch.arange(new_size)*(32-1)/(new_size-1)).long()]
        #new_size = random.randint(28, 36)#random width rescaling
        #data = data[:,:,:,(torch.arange(new_size)*(32-1)/(new_size-1)).long()]

        loss = train_loss(data.to(device), target.to(device))

        grads = grad(loss, Ws, create_graph=True)
        TrainLoss.append(loss.item())
        if batch_idx % 100 == 0:
            print('Epoch: {}; batch: {}; train loss: {}'.format(
                epoch, batch_idx, TrainLoss[-1]))

        v = [torch.randn(W.shape, device=device) for W in Ws]
        Hv = grad(grads, Ws, v)  #just let Hv=grads if using whitened gradients
        with torch.no_grad():
            Qs = [
                psgd.update_precond_kron(q[0], q[1], dw, dg)
                for (q, dw, dg) in zip(Qs, v, Hv)
            ]
            pre_grads = [
                psgd.precond_grad_kron(q[0], q[1], g)
                for (q, g) in zip(Qs, grads)
 def _matvec_grad(self, img, vec):
     """Compute J @ vec via double backprop, where J^T is the operator
     implemented by ``_matvec_T_grad``.

     A zero probe vector ``w`` is fed through the transposed operator; the
     gradient of ``<J^T w, vec>`` w.r.t. ``w`` equals ``J @ vec``.
     """
     # Create the probe as a leaf tensor directly on the target device:
     # ``zeros(..., requires_grad=True).to(device)`` would instead yield a
     # non-leaf copy on GPU, adding a needless transfer node to the graph.
     w = torch.zeros(self.hidden_dim, device=self.device, requires_grad=True)
     matvec_transposed = self._matvec_T_grad(img, w)
     dotproduct = torch.matmul(matvec_transposed.flatten(), vec.flatten())
     return autograd.grad(dotproduct, w)[0]
#
# such that for all variation :math:`\delta y` of :math:`y` we have:
#
# .. math::
#
#    \langle \text{d} c . \delta y , e \rangle  =  \langle g , \delta y \rangle  =  \langle \delta y , \partial c . e \rangle
#
# Backpropagation is all about computing the tensor :math:`g=\partial c . e` efficiently, for arbitrary values of :math:`e`:

# Declare a new tensor of shape (M,1) used as the input of the gradient operator.
# It can be understood as a "gradient with respect to the output c"
# and is thus called "grad_output" in the documentation of PyTorch.
e = torch.rand_like(c)
# Call the gradient op:
start = time.time()
g = grad(c, y, e)[0]
# PyTorch remark : grad(c, y, e) alone outputs a length 1 tuple, hence the need for [0] at the end.

print('Time to compute gradient of convolution operation on the cpu: ',
      round(time.time() - start, 5),
      's',
      end=' ')

####################################################################
# We compare with gradient of Log of Sum of Exp:

# g2 should agree with g up to floating-point round-off; the relative
# error below quantifies the discrepancy between the two backends.
g2 = grad(c2, y, e)[0]
print('(relative error: ', ((g2 - g).norm() / g.norm()).item(), ')')

# Plot the results next to each other:
plt.plot(g.detach().cpu().numpy()[:40], '-', label='KeOps - Stable')
Esempio n. 49
0
    def refine_mesh(self,
                    mesh,
                    occ_hat,
                    z,
                    c=None,
                    world_mat=None,
                    camera_mat=None):
        ''' Refines the predicted mesh.

        Vertex positions are optimized with RMSprop so that (a) the decoded
        occupancy at a random point on each face matches the decision
        threshold and (b) face normals agree with the (negated, normalized)
        gradient of the occupancy field.

        Args:
            mesh (trimesh object): predicted mesh
            occ_hat (tensor): predicted occupancy grid
            z (tensor): latent code z
            c (tensor): latent conditioned code c
            world_mat (tensor): optional world matrix forwarded to the
                geometric projection module
            camera_mat (tensor): optional camera matrix forwarded to the
                geometric projection module

        Returns:
            The input mesh with refined vertex positions.
        '''

        self.model.eval()

        # Some shorthands
        n_x, n_y, n_z = occ_hat.shape
        assert (n_x == n_y == n_z)
        # threshold = np.log(self.threshold) - np.log(1. - self.threshold)
        threshold = self.threshold

        # Vertex parameter (the quantity being optimized)
        v0 = torch.FloatTensor(mesh.vertices).to(self.device)
        v = torch.nn.Parameter(v0.clone())

        # Faces of mesh
        faces = torch.LongTensor(mesh.faces).to(self.device)

        # Start optimization
        optimizer = optim.RMSprop([v], lr=1e-4)

        for it_r in trange(self.refinement_step):
            optimizer.zero_grad()

            # Sample one random point per face via Dirichlet-distributed
            # barycentric coordinates.
            face_vertex = v[faces]
            eps = np.random.dirichlet((0.5, 0.5, 0.5), size=faces.shape[0])
            eps = torch.FloatTensor(eps).to(self.device)
            face_point = (face_vertex * eps[:, :, None]).sum(dim=1)

            face_v1 = face_vertex[:, 1, :] - face_vertex[:, 0, :]
            face_v2 = face_vertex[:, 2, :] - face_vertex[:, 1, :]
            # Cross over the coordinate axis explicitly: without dim=,
            # torch.cross picks the *first* dimension of size 3, which would
            # silently be the face dimension for a mesh with exactly 3 faces.
            face_normal = torch.cross(face_v1, face_v2, dim=1)
            face_normal = face_normal / \
                (face_normal.norm(dim=1, keepdim=True) + 1e-10)
            vc = self.model.gproj(face_point.unsqueeze(0), c, world_mat,
                                  camera_mat)
            face_value = torch.sigmoid(
                self.model.decode(face_point.unsqueeze(0), z, vc).logits)
            # Target normal: negative gradient of occupancy w.r.t. position.
            normal_target = -autograd.grad([face_value.sum()], [face_point],
                                           create_graph=True)[0]

            normal_target = \
                normal_target / \
                (normal_target.norm(dim=1, keepdim=True) + 1e-10)
            loss_target = (face_value - threshold).pow(2).mean()
            loss_normal = \
                (face_normal - normal_target).pow(2).sum(dim=1).mean()

            loss = loss_target + 0.01 * loss_normal

            # Update
            loss.backward()
            optimizer.step()

        mesh.vertices = v.data.cpu().numpy()

        return mesh
Esempio n. 50
0
def eval_metric(step):
    """Evaluate the trajectory model on one held-out sample and log metrics.

    Computes position/velocity errors (ex, ey, evx, evy), ADE/FDE, and a
    jerk-based smoothness score by differentiating the network output up to
    third order w.r.t. the time input, then writes everything to the
    tensorboard-style ``logger``. Relies on module-level globals
    (``model``, ``eval_samples``, ``opt``, ``device``, ``logger``).
    """
    model.eval()
    batch = next(eval_samples)
    # Fixed subset of timestamps — presumably chosen to match the ground
    # truth sampling grid; confirm against the dataset definition.
    mask = [2, 5, 8, 10, 13, 17, 20, 23, 26, 29]
    batch['ts_list'] = batch['ts_list'][:, mask]
    batch['x_list'] = batch['x_list'][:, mask]
    batch['y_list'] = batch['y_list'][:, mask]
    batch['vx_list'] = batch['vx_list'][:, mask]
    batch['vy_list'] = batch['vy_list'][:, mask]

    # Time input must require grad so v/a/jerk can be taken below.
    t = batch['ts_list'].flatten().unsqueeze(1).to(device)
    t.requires_grad = True

    batch['img'] = batch['img'].expand(len(t), 10, 1, opt.height, opt.width)
    batch['img'] = batch['img'].to(device)
    batch['v_0'] = batch['v_0'].expand(len(t), 1)
    batch['v_0'] = batch['v_0'].to(device)
    batch['xy'] = batch['xy'].to(device)
    batch['vxy'] = batch['vxy'].to(device)
    batch['img'].requires_grad = True

    output = model(batch['img'], t, batch['v_0'])
    # First derivative: velocity (rescaled to physical units).
    vx = grad(output[:, 0].sum(), t,
              create_graph=True)[0][:, 0] * (opt.max_dist / opt.max_t)
    vy = grad(output[:, 1].sum(), t,
              create_graph=True)[0][:, 0] * (opt.max_dist / opt.max_t)

    x = output[:, 0] * opt.max_dist
    y = output[:, 1] * opt.max_dist

    # Second derivative: acceleration.
    ax = grad(vx.sum(), t, create_graph=True)[0] * (1. / opt.max_t)
    ay = grad(vy.sum(), t, create_graph=True)[0] * (1. / opt.max_t)

    # Third derivative: jerk (used for the smoothness metric).
    jx = grad(ax.sum(), t, create_graph=True)[0] * (1. / opt.max_t)
    jy = grad(ay.sum(), t, create_graph=True)[0] * (1. / opt.max_t)

    vx = vx.data.cpu().numpy()
    vy = vy.data.cpu().numpy()
    x = x.data.cpu().numpy()
    y = y.data.cpu().numpy()

    real_x = batch['x_list'].data.cpu().numpy()[0]
    real_y = batch['y_list'].data.cpu().numpy()[0]
    real_vx = batch['vx_list'].data.cpu().numpy()[0]
    real_vy = batch['vy_list'].data.cpu().numpy()[0]
    ts_list = batch['ts_list'].data.cpu().numpy()[0]

    # Mean absolute errors, final/average displacement error, velocity error.
    ex = np.mean(np.abs(x - real_x))
    ey = np.mean(np.abs(y - real_y))
    evx = np.mean(np.abs(vx - real_vx))
    evy = np.mean(np.abs(vy - real_vy))
    fde = np.hypot(x - real_x, y - real_y)[-1]
    ade = np.mean(np.hypot(x - real_x, y - real_y))
    ev = np.mean(np.hypot(vx - real_vx, vy - real_vy))

    jx = jx.data.cpu().numpy()
    jy = jy.data.cpu().numpy()

    # Lower jerk magnitude = smoother predicted trajectory.
    smoothness = np.mean(np.hypot(jx, jy))

    logger.add_scalar('metric/ex', ex, step)
    logger.add_scalar('metric/ey', ey, step)
    logger.add_scalar('metric/evx', evx, step)
    logger.add_scalar('metric/evy', evy, step)
    logger.add_scalar('metric/fde', fde, step)
    logger.add_scalar('metric/ade', ade, step)
    logger.add_scalar('metric/ev', ev, step)
    logger.add_scalar('metric/smoothness', smoothness, step)
    model.train()
Esempio n. 51
0
def train(epoch, learning_rate, result_path):
    """Run one epoch of adversarial training for the video-reconstruction RNN.

    Alternates discriminator updates (real loss plus an R1-style gradient
    penalty on real samples) with generator updates (reconstruction losses
    on both RNN stages plus a small adversarial term), then evaluates on
    ``test_path1``. Relies on module-level globals: ``first_frame_net``,
    ``rnn1``, ``rnn2``, ``D``, ``train_data_loader``, ``mask``, ``mask_s``,
    ``block_size``, ``compress_rate``, ``loss``, ``compute_loss``,
    ``toggle_grad``, ``test``, ``test_path1``.

    Args:
        epoch: current epoch number (for logging).
        learning_rate: Adam learning rate for both optimizers.
        result_path: forwarded to ``test`` for saving evaluation output.
    """
    epoch_loss = 0
    epoch_loss1 = 0
    epoch_loss2 = 0
    Dloss = 0
    regloss = 0
    begin = time.time()

    # Fresh optimizers every epoch — Adam moment state is intentionally (?)
    # reset each call; confirm this is the desired schedule.
    optimizer_g = optim.Adam([{
        'params': first_frame_net.parameters()
    }, {
        'params': rnn1.parameters()
    }, {
        'params': rnn2.parameters()
    }],
                             lr=learning_rate)
    optimizer_d = optim.Adam(D.parameters(), lr=learning_rate)
    # NOTE(review): a __main__ guard *inside* a function is unusual — the
    # whole training loop is skipped when this module is imported. Verify
    # this is intentional.
    if __name__ == '__main__':
        for iteration, batch in enumerate(train_data_loader):
            gt, meas = Variable(batch[0]), Variable(batch[1])
            gt = gt.cuda()  # [batch,8,256,256]
            gt = gt.float()
            meas = meas.cuda()  # [batch,256 256]
            meas = meas.float()

            mini_batch = gt.size()[0]
            y_real_ = torch.ones(mini_batch).cuda()
            y_fake_ = torch.zeros(mini_batch).cuda()

            # Normalize the measurement by the mask sum before feeding it in.
            meas_re = torch.div(meas, mask_s)
            meas_re = torch.unsqueeze(meas_re, 1)

            optimizer_d.zero_grad()

            batch_size1 = gt.shape[0]

            h0 = torch.zeros(batch_size1, 20, 256, 256).cuda()

            # Two-stage RNN reconstruction from the compressed measurement.
            xt1 = first_frame_net(mask, meas_re, block_size, compress_rate)
            model_out1, h1 = rnn1(xt1, meas, mask, h0, meas_re, block_size,
                                  compress_rate)
            model_out = rnn2(model_out1, meas, mask, h1, meas_re, block_size,
                             compress_rate)

            # discriminator training
            toggle_grad(first_frame_net, False)
            toggle_grad(rnn1, False)
            toggle_grad(rnn2, False)
            toggle_grad(D, True)
            # Real input must require grad for the gradient penalty below.
            gt.requires_grad_()

            D_result = D(gt, y_real_)
            # assert (D_result > 0.0 & D_result < 1.0).all()
            D_real_loss = compute_loss(D_result, 1)
            Dloss += D_result.data.mean()
            D_real_loss.backward(retain_graph=True)

            # model_out.requires_grad_()
            # d_fake = D(model_out, y_real_)
            # dloss_fake = compute_loss(d_fake, 0)

            # R1 gradient penalty: squared norm of dD/d(real input).
            batch_size = gt.size(0)
            grad_dout = autograd.grad(outputs=D_result.sum(),
                                      inputs=gt,
                                      create_graph=True,
                                      retain_graph=True,
                                      only_inputs=True)[0]
            grad_dout2 = grad_dout.pow(2)
            assert (grad_dout2.size() == gt.size())
            reg1 = grad_dout2.view(batch_size, -1).sum(1)

            reg = 10 * reg1.mean()

            regloss += reg.data.mean()

            reg.backward(retain_graph=True)

            optimizer_d.step()

            # generator training
            toggle_grad(first_frame_net, True)
            toggle_grad(rnn1, True)
            toggle_grad(rnn2, True)
            toggle_grad(D, False)
            optimizer_g.zero_grad()

            D_result = D(model_out, y_real_)
            G_train_loss = compute_loss(D_result, 1)
            Loss1 = loss(model_out1, gt)
            Loss2 = loss(model_out, gt)
            # Reconstruction losses dominate; adversarial term is lightly
            # weighted (0.001).
            Loss = 0.5 * Loss1 + 0.5 * Loss2 + 0.001 * G_train_loss

            epoch_loss += Loss.data
            epoch_loss1 += Loss1.data
            epoch_loss2 += Loss2.data

            Loss.backward()
            optimizer_g.step()

        test(test_path1, epoch, result_path)

    end = time.time()
    print(
        "===> Epoch {} Complete: Avg. Loss: {:.7f}".format(
            epoch, epoch_loss / len(train_data_loader)),
        "loss1 {:.7f} loss2: {:.7f}".format(
            epoch_loss1 / len(train_data_loader),
            epoch_loss2 / len(train_data_loader)),
        "d loss: {:.7f},reg loss: {:.7f}".format(
            Dloss / len(train_data_loader), regloss / len(train_data_loader)),
        "  time: {:.2f}".format(end - begin))
 def _matvec_T_grad(self, img, vec):
     """Compute J^T @ vec by backprop through the feature extractor.

     Side effect: switches ``img.requires_grad`` on so the input joins the
     autograd graph. The graph is kept (``create_graph=True``) so the
     result can be differentiated again.
     """
     img.requires_grad = True
     features = self.mfe.extract_layer_output(img)
     scalar = features.flatten() @ vec.flatten()
     return autograd.grad(scalar, img, create_graph=True)[0]
Esempio n. 53
0
                          opt.batch_size
            else:
                entropy = 0.0
            costs = costs.gather(2, Variable(real.unsqueeze(2))).squeeze(2)
            E_real = costs.sum() / opt.batch_size
            if train_disc:
                loss = (opt.real_multiplier * E_real) - (opt.disc_entropy_reg *
                                                         entropy)
                loss.backward()

            if train_disc and opt.gradient_penalty > 0:
                disc.gradient_penalize = True
                costs, inputs = disc((real, generated))
                costs = costs * inputs[:, 1:]
                loss = ((opt.real_multiplier + 1) / 2) * costs.sum()
                inputs_grad, = autograd.grad([loss], [inputs],
                                             create_graph=True)
                inputs_grad = inputs_grad.view(opt.batch_size, -1)
                norm_sq = (inputs_grad**2).sum(1)
                norm_errors = norm_sq - 2 * torch.sqrt(norm_sq) + 1
                loss = opt.gradient_penalty * norm_errors.sum(
                ) / opt.batch_size
                loss.backward()
                disc.gradient_penalize = False

            disc_gnorms.append(util.gradient_norm(disc.parameters()))
            if train_disc:
                if opt.max_grad_norm > 0:
                    nn.utils.clip_grad_norm(disc.parameters(),
                                            opt.max_grad_norm)
                disc_optimizer.step()
            Wdist = (E_generated - E_real).data[0]
Esempio n. 54
0
def compute_dual_gap(objective, player):
    """Return ``(loss, dual_gap)`` for the player's current strategy.

    The gap is the Frank-Wolfe style quantity <g, s> - min_i g_i, where g
    is the gradient of the objective at strategy s.
    """
    strategy = Parameter(player())
    loss = objective(strategy)
    g = autograd.grad(loss, (strategy,))[0]
    gap = (g * strategy).sum() - g.min()
    return loss.item(), gap.item()
Esempio n. 55
0
def d_r1_loss(real_pred, real_img):
    """R1 regularizer: batch mean of ||d(real_pred)/d(real_img)||^2.

    The graph is kept (``create_graph=True``) so the penalty itself can be
    backpropagated during discriminator training.
    """
    (grad_real,) = autograd.grad(outputs=real_pred.sum(),
                                 inputs=real_img,
                                 create_graph=True)
    return grad_real.pow(2).reshape(grad_real.shape[0], -1).sum(1).mean()
Esempio n. 56
0
  def train(self):
    """Main adversarial training loop for the tag-conditioned avatar GAN.

    Alternates discriminator updates (real loss, fake loss, DRAGAN-style
    gradient penalty) with generator updates, logs progress every
    ``verbose_T`` iterations, saves intermediate images, and dumps a
    checkpoint after each epoch. Relies on module-level configuration
    globals (``batch_size``, ``device``, ``max_epoch``, ``lambda_adv``,
    ``lambda_gp``, ``noise_size``, ``verbose``, ``verbose_T``,
    ``tmp_path``, ``model_dump_path``).
    """
    iteration = -1
    # Tensor size arguments must be ints: the original ``1.0`` here is an
    # invalid floating-point size and raises a TypeError on modern PyTorch.
    label = Variable(torch.FloatTensor(batch_size, 1)).to(device)
    logging.info('Current epoch: {}. Max epoch: {}.'.format(self.epoch, max_epoch))
    while self.epoch <= max_epoch:
      msg = {}
      adjust_learning_rate(self.optimizer_G, iteration)
      adjust_learning_rate(self.optimizer_D, iteration)
      for i, (avatar_tag, avatar_img) in enumerate(self.data_loader):
        iteration += 1
        if avatar_img.shape[0] != batch_size:
          # logging.warn is deprecated; logging.warning is the supported API.
          logging.warning('Batch size not satisfied. Ignoring.')
          continue
        if verbose:
          if iteration % verbose_T == 0:
            msg['epoch'] = int(self.epoch)
            msg['step'] = int(i)
            msg['iteration'] = iteration
        avatar_img = Variable(avatar_img).to(device)
        avatar_tag = Variable(torch.FloatTensor(avatar_tag)).to(device)
        # D : G = 2 : 1
        # 1. Training D
        # 1.1. use really image for discriminating
        self.D.zero_grad()
        label_p, tag_p = self.D(avatar_img)
        label.data.fill_(1.0)

        # 1.2. real image's loss
        real_label_loss = self.label_criterion(label_p, label)
        real_tag_loss = self.tag_criterion(tag_p, avatar_tag)
        real_loss_sum = real_label_loss * lambda_adv / 2.0 + real_tag_loss * lambda_adv / 2.0
        real_loss_sum.backward()
        if verbose:
          if iteration % verbose_T == 0:
            msg['discriminator real loss'] = float(real_loss_sum)

        # 1.3. use fake image for discriminating
        g_noise, fake_tag = utils.fake_generator(batch_size, noise_size, device)
        fake_feat = torch.cat([g_noise, fake_tag], dim=1)
        # detach() blocks generator gradients during the D update.
        fake_img = self.G(fake_feat).detach()
        fake_label_p, fake_tag_p = self.D(fake_img)
        label.data.fill_(.0)

        # 1.4. fake image's loss
        fake_label_loss = self.label_criterion(fake_label_p, label)
        fake_tag_loss = self.tag_criterion(fake_tag_p, fake_tag)
        fake_loss_sum = fake_label_loss * lambda_adv / 2.0 + fake_tag_loss * lambda_adv / 2.0
        fake_loss_sum.backward()
        if verbose:
          if iteration % verbose_T == 0:
            msg['discriminator fake loss'] = float(fake_loss_sum)

        # 1.5. gradient penalty (DRAGAN: perturb real data with noise)
        # https://github.com/jfsantos/dragan-pytorch/blob/master/dragan.py
        alpha_size = [1] * avatar_img.dim()
        alpha_size[0] = avatar_img.size(0)
        alpha = torch.rand(alpha_size).to(device)
        x_hat = Variable(alpha * avatar_img.data + (1 - alpha) * \
                         (avatar_img.data + 0.5 * avatar_img.data.std() * Variable(torch.rand(avatar_img.size())).to(device)),
                         requires_grad=True).to(device)
        pred_hat, pred_tag = self.D(x_hat)
        gradients = grad(outputs=pred_hat, inputs=x_hat, grad_outputs=torch.ones(pred_hat.size()).to(device),
                         create_graph=True, retain_graph=True, only_inputs=True)[0].view(x_hat.size(0), -1)
        gradient_penalty = lambda_gp * ((gradients.norm(2, dim=1) - 1) ** 2).mean()
        gradient_penalty.backward()
        if verbose:
          if iteration % verbose_T == 0:
            msg['discriminator gradient penalty'] = float(gradient_penalty)

        # 1.6. update optimizer
        self.optimizer_D.step()

        # 2. Training G
        # 2.1. generate fake image
        self.G.zero_grad()
        g_noise, fake_tag = utils.fake_generator(batch_size, noise_size, device)
        fake_feat = torch.cat([g_noise, fake_tag], dim=1)
        fake_img = self.G(fake_feat)
        fake_label_p, fake_tag_p = self.D(fake_img)
        label.data.fill_(1.0)

        # 2.2. calc loss
        label_loss_g = self.label_criterion(fake_label_p, label)
        tag_loss_g = self.tag_criterion(fake_tag_p, fake_tag)
        loss_g = label_loss_g  * lambda_adv / 2.0 + tag_loss_g * lambda_adv / 2.0
        loss_g.backward()
        if verbose:
          if iteration % verbose_T == 0:
            msg['generator loss'] = float(loss_g)

        # 2.2. update optimizer
        self.optimizer_G.step()

        if verbose:
          if iteration % verbose_T == 0:
            logger.info('------------------------------------------')
            for key in msg.keys():
              logger.info('{} : {}'.format(key, msg[key]))
        # save intermediate file
        if iteration % verbose_T == 0:
          vutils.save_image(avatar_img.data.view(batch_size, 3, avatar_img.size(2), avatar_img.size(3)),
                            os.path.join(tmp_path, 'real_image_{}.png'.format(str(iteration).zfill(8))))
          g_noise, fake_tag = utils.fake_generator(batch_size, noise_size, device)
          fake_feat = torch.cat([g_noise, fake_tag], dim=1)
          fake_img = self.G(fake_feat)
          vutils.save_image(fake_img.data.view(batch_size, 3, avatar_img.size(2), avatar_img.size(3)),
                            os.path.join(tmp_path, 'fake_image_{}.png'.format(str(iteration).zfill(8))))
          logger.info('Saved intermediate file in {}'.format(os.path.join(tmp_path, 'fake_image_{}.png'.format(str(iteration).zfill(8)))))
      # dump checkpoint
      torch.save({
        'epoch': self.epoch,
        'D': self.D.state_dict(),
        'G': self.G.state_dict(),
        'optimizer_D': self.optimizer_D.state_dict(),
        'optimizer_G': self.optimizer_G.state_dict(),
      }, '{}/checkpoint_{}.tar'.format(model_dump_path, str(self.epoch).zfill(4)))
      logger.info('Checkpoint saved in: {}'.format('{}/checkpoint_{}.tar'.format(model_dump_path, str(self.epoch).zfill(4))))
      self.epoch += 1
Esempio n. 57
0
interp = (batches[0] + batches[1])/2.0

h1 = C.compute_h2(batches[0])
h2 = C.compute_h2(batches[1])

print h1.size()

h1_interp = 0.5*h1 + 0.5*h2

starting_x = Variable(batches[0].data, requires_grad=True)

for iteration in range(0,200):
    curr_h = C.compute_h2(starting_x)
    loss = torch.sum((h1_interp - curr_h)**2) / h1_interp.size(0)
    g = grad(loss, starting_x)[0]
    new_x = starting_x - 1.0 * g / g.norm(2)
    starting_x = Variable(new_x.data, requires_grad=True)
    print "loss", loss

print C(starting_x)[0:10]

save_image(denorm(batches[0].data), 'interpolation_images/batch1.png')
save_image(denorm(batches[1].data), 'interpolation_images/batch2.png')
save_image(denorm(interp.data), 'interpolation_images/visible_interp.png')
save_image(denorm(starting_x.data), 'interpolation_images/hidden_interp.png')


#print C(train).max(1)
#print target
Esempio n. 58
0
            y = LeNet5(data)
            _, pred = torch.max(y, dim=1)
            num_errs += torch.sum(pred != target)
    return num_errs.item() / len(test_loader.dataset)


Qs = [[torch.eye(W.shape[0]), torch.eye(W.shape[1])] for W in Ws]
step_size = 0.1
grad_norm_clip_thr = 0.1 * sum(W.shape[0] * W.shape[1] for W in Ws)**0.5
TrainLoss, TestLoss = [], []
for epoch in range(10):
    trainloss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        loss = train_loss(data, target)

        grads = grad(loss, Ws, create_graph=True)
        trainloss += loss.item()

        v = [torch.randn(W.shape) for W in Ws]
        Hv = grad(
            grads, Ws, v
        )  #error? check torch bug: https://github.com/pytorch/pytorch/issues/15532
        with torch.no_grad():
            Qs = [
                psgd.update_precond_kron(q[0], q[1], dw, dg)
                for (q, dw, dg) in zip(Qs, v, Hv)
            ]
            pre_grads = [
                psgd.precond_grad_kron(q[0], q[1], g)
                for (q, g) in zip(Qs, grads)
            ]
Esempio n. 59
0
def mutate_sm(mutation, params,
              model=None,
              env=None,
              verbose=False,
              states=None,
              mag=0.1,
              **kwargs):
    """Apply a safe-mutation operator to a flat parameter vector.

    The variant is selected from substrings of ``mutation``:
      * "SM-R"  -- rescale the perturbation via a line search over output divergence
      * "SO"    -- second-order (Hessian-vector) sensitivity scaling (SM-G-SO)
      * "ABS"   -- per-sample absolute-gradient scaling (SM-G-ABS)
      * default -- summed-gradient scaling (SM-G-SUM)

    Args:
        mutation: string naming the operator variant (see above).
        params:   flat numpy float32 array of current model parameters.
        model:    policy network exposing inject_parameters(), parameters(),
                  count_parameters(), extract_grad(), zero_grad() and __call__.
        env:      unused here; kept for interface compatibility with callers.
        verbose:  print diagnostics when True.
        states:   numpy array of input observations used as "experience".
        mag:      base perturbation magnitude (std of the raw delta).

    Returns:
        (new_params, final_delta): the mutated parameter vector (a copy)
        and the clipped delta that was applied.
    """
    model.inject_parameters(params.copy())

    # TODO: why is the experience replicated four times?
    _states = np.concatenate((states, states, states, states))

    # number of samples used by the per-sample (SM-G-ABS) jacobian loop
    sz = min(100, len(_states))

    # experience in this domain = the classification *input* patterns
    experience_states = Variable(torch.from_numpy(_states), requires_grad=False)

    # old_policy = the outputs this model generated before perturbation
    old_policy = model(experience_states)
    num_classes = old_policy.size()[1]

    # variant flags, resolved from the mutation string below
    abs_gradient = False    # SM-G-ABS
    second_order = False    # SM-G-SO
    sm_r = False            # SM-R
    linesearch = False      # SM-R tunes magnitude with a line search

    if mutation.count("SM-R") > 0:
        sm_r = True
    elif mutation.count("SO") > 0:
        second_order = True
    elif mutation.count("ABS") > 0:
        abs_gradient = True

    # initial (unscaled) gaussian perturbation
    delta = np.random.randn(*params.shape).astype(np.float32) * mag

    if sm_r:
        # SM-R: no gradient scaling; magnitude is chosen by the line search
        scaling = torch.ones(params.shape)
        linesearch = True
    elif second_order:
        # SM-G-SO
        np_copy = np.array(old_policy.data.numpy(), dtype=np.float32)
        _old_policy_cached = Variable(torch.from_numpy(np_copy), requires_grad=False)

        # loss = squared divergence from the (detached) old policy
        loss = ((old_policy - _old_policy_cached) ** 2).sum(1).mean(0)

        # first derivative of the divergence w.r.t. the parameters;
        # keep the graph so we can differentiate once more below
        loss_gradient = grad(loss, model.parameters(), create_graph=True)
        flat_gradient = torch.cat([g.view(-1) for g in loss_gradient])

        # unit-length perturbation direction
        direction = delta / np.sqrt((delta ** 2).sum())
        direction_t = Variable(torch.from_numpy(direction), requires_grad=False)

        # Hessian-vector product: second derivative along the direction
        grad_v_prod = (flat_gradient * direction_t).sum()
        second_deriv = torch.autograd.grad(grad_v_prod, model.parameters())

        # contiguous flat view of the second derivative
        sensitivity = torch.cat([g.contiguous().view(-1) for g in second_deriv])

        # rescale by second-order sensitivity
        scaling = torch.sqrt(torch.abs(sensitivity).data)

    elif not abs_gradient:
        # SM-G-SUM
        tot_size = model.count_parameters()

        # jacobian of each output's sensitivity to every parameter
        jacobian = torch.zeros(num_classes, tot_size)
        grad_output = torch.zeros(*old_policy.size())

        # one backward pass per output dimension
        for i in range(num_classes):
            model.zero_grad()
            grad_output.zero_()
            grad_output[:, i] = 1.0

            # NOTE(review): retain_variables is the pre-0.4 torch spelling of
            # retain_graph; kept as-is for the old API this file targets.
            old_policy.backward(grad_output, retain_variables=True)
            jacobian[i] = torch.from_numpy(model.extract_grad())

        # summed-gradient sensitivity across outputs
        scaling = torch.sqrt((jacobian ** 2).sum(0))
    else:
        # SM-G-ABS: per-sample jacobians, averaged in absolute value
        tot_size = model.count_parameters()
        jacobian = torch.zeros(num_classes, tot_size, sz)
        grad_output = torch.zeros(*old_policy.size())

        for i in range(num_classes):
            for j in range(sz):
                old_policy_new = model(experience_states[j:j + 1])
                model.zero_grad()
                grad_output.zero_()

                grad_output[:, i] = 1.0 / sz

                old_policy_new.backward(grad_output, retain_variables=True)
                jacobian[i, :, j] = torch.from_numpy(model.extract_grad())

        mean_abs_jacobian = torch.abs(jacobian).mean(2)
        scaling = torch.sqrt((mean_abs_jacobian ** 2).sum(0))

    scaling = scaling.numpy()

    if verbose:
        print('scaling sum', scaling.sum())

    # guard against division blow-ups from (near-)zero sensitivities
    scaling[scaling == 0] = 1.0
    scaling[scaling < 0.01] = 0.01

    delta /= scaling
    new_params = params + delta
    model.inject_parameters(new_params)

    threshold = mag
    weight_clip = 10.0  # note: generally probably should be smaller
    search_rounds = 15
    old_policy = old_policy.data.numpy()

    def search_error(x, raw=False):
        """Policy divergence after applying delta*x (one forward pass).

        Returns the raw divergence when raw=True, otherwise the squared
        distance of the divergence from the target ``threshold``.
        """
        final_delta = np.clip(delta * x, -weight_clip, weight_clip)
        model.inject_parameters(params + final_delta)

        output = model(experience_states).data.numpy()

        change = np.sqrt(((output - old_policy) ** 2).sum(1)).mean()

        if raw:
            return change

        # BUG FIX: was np.sqrt(change - threshold)**2, which is NaN whenever
        # change < threshold and poisons the line search; the intended
        # objective is the squared error.
        return (change - threshold) ** 2

    # SM-R: line search to tune the mutation magnitude
    if linesearch:
        # NOTE(review): a 3-tuple passed as `bounds` looks like it was meant
        # to be a `bracket` for scipy's minimize_scalar -- confirm.
        mult = minimize_scalar(search_error, bounds=(0, 0.1, 3),
                               tol=(threshold / 4),
                               options={'maxiter': search_rounds, 'disp': True})
        chg_amt = mult.x
    else:
        chg_amt = 1.0

    # limit extreme weight changes for stability
    final_delta = np.clip(delta * chg_amt, -weight_clip, weight_clip)

    # generate the new parameter vector
    new_params = params + final_delta

    if verbose:
        print('delta max:', final_delta.max())
        print("divergence:", check_policy_change(params, new_params, model, states))
        print(new_params.shape, params.shape)
    diff = np.sqrt(((new_params - params) ** 2).sum())
    if verbose:
        print("diff: ", diff)

    return new_params.copy(), final_delta