Example #1
    def forward(self, x):
        # Promote 1-D inputs to a batch of size 1.
        if x.ndimension() == 1:
            x = x.unsqueeze(0)
        assert x.ndimension() == 2
        nbatch = x.size(0)

        # Initial guess for y; the 'gt' scheme starts from the ground-truth
        # x * sin(x) target instead of zeros.
        y_init = torch.zeros(nbatch,
                             self.Enet.n_out,
                             device=x.device,
                             requires_grad=True)
        if self.init_scheme == 'gt':
            y_init = (x * torch.sin(x)).clone()

        # Re-leaf the initial guess so the inner optimizer treats it as a free
        # variable (torch.autograd.Variable is deprecated).
        y = y_init.detach().clone().requires_grad_(True)
        inner_opt = higher.get_diff_optim(torch.optim.SGD([y],
                                                          lr=self.inner_lr),
                                          [y],
                                          device=x.device)

        # A few differentiable gradient steps on the squared energy; gradients
        # can later flow back through these updates into Enet's parameters.
        for _ in range(self.n_inner_iter):
            E = self.Enet(x, y)
            E = torch.square(E)
            y, = inner_opt.step(E.sum(), params=[y])

        return y
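The outer training loop that consumes this forward pass is not shown here. A minimal sketch of how it could look, assuming a toy 1-D regression on the x * sin(x) target suggested by the 'gt' init scheme, a wrapper module `model` that owns `Enet` and the forward() above, and an Adam outer optimizer (all caller-side names are assumptions, not part of the example):

import torch
import torch.nn.functional as F

# Hypothetical outer loop: `model` is assumed to wrap the forward() above.
outer_opt = torch.optim.Adam(model.parameters(), lr=1e-3)
for step in range(1000):
    x = torch.linspace(-3.0, 3.0, 64).unsqueeze(1)   # toy batch of inputs
    y_true = x * torch.sin(x)                        # target hinted at by the 'gt' init
    y_pred = model(x)                                # runs the differentiable inner loop
    loss = F.mse_loss(y_pred, y_true)
    outer_opt.zero_grad()
    loss.backward()                                  # backprop through inner_opt.step into Enet
    outer_opt.step()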
Example #2
def ddpg_update(config, fnet_actor, diff_act_opt, models, optimizers, memory_cache, update_type='meta'):
    summed_policy_loss = torch.zeros(1)
    summed_value_loss = torch.zeros(1)
    diff_crit_opt = higher.get_diff_optim(optimizers['critic_opt'], models['critic'].parameters(),
                                          track_higher_grads=False)
    # Note: diff_act_opt_internal is built here but not used below; the actor is
    # updated with the diff_act_opt passed in by the caller.
    diff_act_opt_internal = higher.create_diff_optim(torch.optim.SGD, fmodel=fnet_actor, track_higher_grads=True,
                                                     opt_kwargs={'lr': config.actor_rl_learning_rate, 'momentum': 0.01})
    for it in range(config.offpol_num_iterations_update):
        states, next_states, actions_init, rewards, dones, _ = get_shaped_memory_sample(config, memory_cache)
        inverted_dones = 1 - dones
        rewards = rewards.view(-1, 1)
        inverted_dones = inverted_dones.view(-1, 1)
        # Bootstrapped DDPG target: r + gamma * (1 - done) * Q_target(s', pi_target(s')).
        target_Q = models['critic_target'](next_states, models['actor_target'](next_states))
        target_Q = rewards + (inverted_dones * config.discount_factor * target_Q).detach()

        current_Q = models['critic'](states, actions_init)

        # Anomaly detection left enabled for debugging; it adds noticeable overhead.
        torch.autograd.set_detect_anomaly(True)
        critic_loss = F.mse_loss(current_Q, target_Q)

        diff_crit_opt.step(critic_loss, models['critic'].parameters())
        summed_value_loss += critic_loss

        # Deterministic policy gradient: minimize -Q(s, pi(s)).
        actor_loss = -models['critic'](states, fnet_actor(states)).mean()
        # Optimize the actor functionally with the differentiable optimizer.

        diff_act_opt.step(actor_loss)
        summed_policy_loss += actor_loss

        # Update the frozen target models
        # Critic is as before.
        for param, target_param in zip(models['critic'].parameters(), models['critic_target'].parameters()):
            target_param.data.copy_(config.offpol_target_update_rate * param.data + (1 - config.offpol_target_update_rate) * target_param.data)

        if fnet_actor.dim != models['actor_target'].dim:
            # print('Changing target actor to resemble fnet_actor')
            models['actor_target'] = SequentialActor(config, fnet_actor.state_scaler, fnet_actor.state_normalizer,
                                                     dim=fnet_actor.dim,
                                                     num_layers=fnet_actor.num_layers).reset_weights()
        # For the actor, a slightly more unwieldy approach: copy the functional
        # actor's fast weights into the target network.
        for param, target_param in zip(fnet_actor.parameters(), models['actor_target'].parameters()):
            target_param.data.copy_(config.offpol_target_update_rate * param.data +
                                    (1 - config.offpol_target_update_rate) * target_param.data)

    return summed_policy_loss, summed_value_loss.item()
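ddpg_update expects the caller to supply an already-functionalized actor and a differentiable actor optimizer, so whether higher-order gradients are tracked is decided outside. A minimal sketch of one way to drive it for a meta update, assuming `models['actor']` and `optimizers['actor_opt']` exist alongside the other arguments shown in the example (this caller-side setup is an assumption, not taken from this page):

# Hypothetical caller: backpropagate the summed policy loss through the
# differentiable actor updates performed inside ddpg_update.
with higher.innerloop_ctx(models['actor'], optimizers['actor_opt'],
                          copy_initial_weights=False) as (fnet_actor, diff_act_opt):
    policy_loss, value_loss = ddpg_update(config, fnet_actor, diff_act_opt,
                                          models, optimizers, memory_cache,
                                          update_type='meta')
    optimizers['actor_opt'].zero_grad()
    policy_loss.backward()      # flows back through every diff_act_opt.step(...)
    optimizers['actor_opt'].step()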
Example #3
    def forward(self, x):
        # Promote 1-D inputs to a batch of size 1.
        if x.ndimension() == 1:
            x = x.unsqueeze(0)
        assert x.ndimension() == 2
        nbatch = x.size(0)

        y = torch.zeros(nbatch,
                        self.Enet.n_out,
                        device=x.device,
                        requires_grad=True)

        inner_opt = higher.get_diff_optim(torch.optim.SGD([y],
                                                          lr=self.inner_lr),
                                          [y],
                                          device=x.device)

        for _ in range(self.n_inner_iter):
            E = self.Enet(x, y)
            y, = inner_opt.step(E.sum(), params=[y])

        return y
Example #4
    def solve(self, xinit):
        assert xinit.ndimension() == 2

        nbatch = xinit.size(0)
        z = torch.zeros(nbatch,
                        self.latent_size,
                        device=xinit.device,
                        requires_grad=True)

        inner_opt = higher.get_diff_optim(torch.optim.SGD([z],
                                                          lr=self.inner_optim_opts.lr),
                                          [z],
                                          device=xinit.device)

        # Build the cost function conditioned on the initial state, then take a
        # few differentiable gradient steps on the latent plan z.
        f_emb = self.get_cost_f(xinit)
        for _ in range(self.inner_optim_opts.n_iter):
            cost = f_emb(z)
            z, = inner_opt.step(cost.sum(), params=[z])

        # Decode the latent plan into controls, roll them out under the nominal
        # dynamics, and return the (still differentiable) negative reward as cost.
        us = self.decode(z)
        rews, xs = rew_nominal(self.dx, xinit, us)
        cost = -rews
        return z, cost
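Since the latent optimization runs through higher's differentiable SGD, the returned cost stays connected to the modules behind get_cost_f and decode. A minimal sketch of an end-to-end training step through solve(), assuming a `planner` object that exposes the method above and an initial-state batch `xinit` (both names are assumptions):

# Hypothetical end-to-end step: gradients reach get_cost_f / decode / dx
# through the unrolled inner_opt.step calls inside solve().
outer_opt = torch.optim.Adam(planner.parameters(), lr=1e-3)
z, cost = planner.solve(xinit)
outer_opt.zero_grad()
cost.sum().backward()
outer_opt.step()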
Example #5
    def forward(self, x):
        assert x.ndimension() == 4
        nbatch = x.size(0)

        # Make an initial guess of the labels.
        # For more sophisticated tasks this could also be learned.
        y = torch.zeros(nbatch,
                        self.n_cls,
                        device=x.device,
                        requires_grad=True)

        # Define a differentiable optimizer to update the label with.
        inner_opt = higher.get_diff_optim(torch.optim.SGD([y], lr=1e-1), [y],
                                          device=x.device)

        # Take a few gradient steps to find the labels that
        # optimize the energy function.
        for _ in range(self.n_inner_iter):
            E = self.Enet(x, y)
            y, = inner_opt.step(E.sum(), params=[y])

        return y
Example #6
def td3_update(config, fnet_actor, actor_rl_opt, models, optimizers, memory_cache, update_type='meta'):
    summed_policy_loss = torch.zeros(1)
    summed_value_loss = torch.zeros(1)
    optimizers['critic_opt'].zero_grad()
    diff_crit_opt = higher.get_diff_optim(optimizers['critic_opt'], models['critic'].parameters(), track_higher_grads=False)
    diff_crit_opt_2 = higher.get_diff_optim(optimizers['critic_opt_2'], models['critic_2'].parameters(),
                                            track_higher_grads=False)
    diff_act_opt_internal = higher.create_diff_optim(torch.optim.SGD, fmodel=fnet_actor, track_higher_grads=True,
                                                     opt_kwargs={'lr': config.actor_rl_learning_rate, 'momentum': 0.01})

    # An alternate structure with functional critics was attempted initially,
    # but abandoned as unnecessary:
    # with higher.innerloop_ctx(models['critic'], optimizers['critic_opt'], copy_initial_weights=False) as (fnet_critic, diff_crit_opt):
    #     with higher.innerloop_ctx(models['critic_2'], optimizers['critic_opt_2'], copy_initial_weights=False) as (fnet_critic_2, diff_crit_opt_2):

    for it in range(config.offpol_num_iterations_update):
        states, next_states, actions_init, rewards, dones, _ = get_shaped_memory_sample(config, memory_cache)
        inverted_dones = 1 - dones

        rewards = rewards.view(-1, 1)
        inverted_dones = inverted_dones.view(-1, 1)

        # TD3 target-policy smoothing: add clipped Gaussian noise to the target action.
        noise = torch.FloatTensor(actions_init).data.normal_(0, 0.2)
        noise = noise.clamp(-0.5, 0.5)
        next_action = (models['actor_target'](next_states) + noise).clamp(-config.action_space_high[0], config.action_space_high[0])

        # Clipped double-Q target: take the minimum of the two target critics.
        target_Q1 = models['critic_target'](next_states, next_action)
        target_Q2 = models['critic_target_2'](next_states, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = rewards + (inverted_dones * config.discount_factor * target_Q).detach()
        current_Q1 = models['critic'](states, actions_init)
        current_Q2 = models['critic_2'](states, actions_init)

        torch.autograd.set_detect_anomaly(True)

        critic_loss_1 = F.mse_loss(current_Q1, target_Q)
        critic_loss_2 = F.mse_loss(current_Q2, target_Q)
        diff_crit_opt.step(critic_loss_1, models['critic'].parameters())
        diff_crit_opt_2.step(critic_loss_2, models['critic_2'].parameters())

        summed_value_loss += critic_loss_1 + critic_loss_2

        # Delayed policy and target updates: only on every second critic update.
        if it % 2 == 0:
            actor_loss = -models['critic'](states, fnet_actor(states)).mean()

            # Optimize the actor functionally with the differentiable optimizer.
            diff_act_opt_internal.step(actor_loss)
            summed_policy_loss += actor_loss

            # Update the frozen target models
            # Critic is as before.
            for param, target_param in zip(models['critic'].parameters(), models['critic_target'].parameters()):
                target_param.data.copy_(config.offpol_target_update_rate * param.data +
                                        (1 - config.offpol_target_update_rate) * target_param.data)
            for param, target_param in zip(models['critic_2'].parameters(), models['critic_target_2'].parameters()):
                target_param.data.copy_(config.offpol_target_update_rate * param.data +
                                        (1 - config.offpol_target_update_rate) * target_param.data)

            if fnet_actor.dim != models['actor_target'].dim:
                # print('Changing target actor to resemble fnet_actor')
                models['actor_target'] = SequentialActor(config, fnet_actor.state_scaler,
                                                         fnet_actor.state_normalizer,
                                                         dim=fnet_actor.dim,
                                                         num_layers=fnet_actor.num_layers).reset_weights()
            # For the actor, a slightly more unwieldy approach: copy the functional
            # actor's fast weights into the target network.
            for param, target_param in zip(fnet_actor.parameters(), models['actor_target'].parameters()):
                target_param.data.copy_(config.offpol_target_update_rate * param.data +
                                        (1 - config.offpol_target_update_rate) * target_param.data)

    return summed_policy_loss, summed_value_loss.item()
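Examples #2 and #6 construct their differentiable optimizers in two different ways: higher.get_diff_optim wraps an existing optimizer instance that is already bound to the relevant parameters, while higher.create_diff_optim builds one from an optimizer class plus keyword arguments and attaches it to a functional model. A minimal side-by-side sketch of the two call patterns, with a stand-in critic/optimizer pair `critic`/`crit_opt` and a functional actor `fnet` (these names are assumptions):

# Wrap an existing optimizer instance (as done for the critics above);
# track_higher_grads=False skips building the higher-order graph.
diff_crit_opt = higher.get_diff_optim(crit_opt, critic.parameters(),
                                      track_higher_grads=False)

# Build a fresh differentiable SGD bound to a functional (patched) model,
# keeping the update graph so an outer loss can backpropagate through it.
diff_act_opt = higher.create_diff_optim(torch.optim.SGD, fmodel=fnet,
                                        track_higher_grads=True,
                                        opt_kwargs={'lr': 1e-3, 'momentum': 0.01})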