Ejemplo n.º 1
0
class Agent():
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                    (1-tau)*target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
Ejemplo n.º 2
0
class Agent_sm():
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=8,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = 0.99
        self.tau = tau
        self.memeory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  env.action_space.high,
                                  n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = torch.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memeory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_dict = dict(target_value_params)
        value_dict = dict(value_params)

        for name in target_value_dict:
            target_value_dict[name] = tau*value_dict[name].clone() + \
                (1-tau)*target_value_dict[name].clone()

        self.target_value.load_state_dict(target_value_dict)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memeory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memeory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states,
                                  dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards,
                               dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        states_value = self.value(states).view(-1)
        new_states_value = self.target_value(new_states).view(-1)
        new_states_value[dones] = 0.0

        action, log_probs = self.actor.sample_normal(states,
                                                     reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(states_value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        action, log_probs = self.actor.sample_normal(states,
                                                     reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q_hat = self.scale * rewards + self.gamma * new_states_value
        q1_old_policy = self.critic_1(states, actions).view(-1)
        q2_old_policy = self.critic_2(states, actions).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
        #         value_loss = value_loss.cpu().detach().numpy()[0]
        #         actor_loss = actor_loss.cpu().detach().numpy()[0]
        #         critic_loss = critic_loss.cpu().detach().numpy()[0]

        return 0, value_loss, actor_loss, critic_loss

    def learn_sm(self, sm_reg=1):
        if self.memeory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memeory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states,
                                  dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards,
                               dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        states_value = self.value(states).view(-1)
        new_states_value = self.target_value(new_states).view(-1)
        new_states_value[dones] = 0.0

        #         action, log_probs = self.actor.sample_normal(states, reparameterize=False)
        #         log_probs = log_probs.view(-1)
        #         q1_new_policy = self.critic_1(states, action)
        #         q2_new_policy = self.critic_2(states, action)
        #         critic_value = torch.min(q1_new_policy, q2_new_policy)
        #         critic_value = critic_value.view(-1)

        #         self.value.optimizer.zero_grad()
        #         value_target = critic_value - log_probs
        #         value_loss = 0.5 * F.mse_loss(states_value, value_target)
        #         value_loss.backward(retain_graph=True)
        #         self.value.optimizer.step()

        #         action, log_probs = self.actor.sample_normal(states, reparameterize=True)
        action, _ = self.actor.sample_normal(states, reparameterize=True)
        #         log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        # sample actions for next batch states
        action_next, _ = self.actor.sample_normal(new_states,
                                                  reparameterize=True)
        q1_new_policy = self.critic_1(new_states, action_next)
        q2_new_policy = self.critic_2(new_states, action_next)
        critic_value_next = torch.min(q1_new_policy, q2_new_policy)
        critic_value_next = critic_value.view(-1)

        #         actor_loss = log_probs - critic_value
        actor_loss = -(critic_value + critic_value_next) + sm_reg * F.mse_loss(
            action, action_next)
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        #         self.critic_1.optimizer.zero_grad()
        #         self.critic_2.optimizer.zero_grad()

        #         q_hat = self.scale*rewards + self.gamma*new_states_value
        #         q1_old_policy = self.critic_1(states, actions).view(-1)
        #         q2_old_policy = self.critic_2(states, actions).view(-1)
        #         critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        #         critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        #         critic_loss = critic1_loss + critic2_loss
        #         critic_loss.backward()
        #         self.critic_1.optimizer.step()
        #         self.critic_2.optimizer.step()

        #         self.update_network_parameters()

        return 0, 0, actor_loss, 0
Ejemplo n.º 3
0
class Agent_2():
    def __init__(self,
                 alpha=0.00005,
                 beta=0.00005,
                 input_dims=5,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = 0.99
        self.tau = tau
        self.memeory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        latent_dims = 10

        self.actor = ActorNetwork_2(alpha,
                                    latent_dims,
                                    env.action_space.high,
                                    n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta,
                                      latent_dims,
                                      n_actions,
                                      name='critic_det_1')
        self.critic_2 = CriticNetwork(beta,
                                      latent_dims,
                                      n_actions,
                                      name='critic__det_2')
        self.value = ValueNetwork(beta, latent_dims, name='value_det')
        self.target_value = ValueNetwork(beta,
                                         latent_dims,
                                         name='target_value_det')
        self.VAE = LinearVAE()

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = torch.Tensor([observation]).to(self.actor.device)
        state_latent = self.VAE.sample_normal(state)
        actions = self.actor(state_latent)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memeory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_dict = dict(target_value_params)
        value_dict = dict(value_params)

        for name in target_value_dict:
            target_value_dict[name] = tau*value_dict[name].clone() + \
                (1-tau)*target_value_dict[name].clone()

        self.target_value.load_state_dict(target_value_dict)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memeory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memeory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states,
                                  dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards,
                               dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        # Train VAE with KL divergence + reconstruction_loss + log_probs
        reconstruction, mu, logvar, log_probs = self.VAE(states)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        reconstruction_loss = F.mse_loss(reconstruction, states)
        final_loss = KLD + reconstruction_loss
        self.VAE.optimizer.zero_grad()
        final_loss.backward(retain_graph=True)
        self.VAE.optimizer.step()

        latent_states = self.VAE.sample_normal(states)
        states_value = self.value(latent_states).view(-1)
        new_latent_states = self.VAE.sample_normal(new_states)
        new_states_value = self.target_value(new_latent_states).view(-1)
        new_states_value[dones] = 0.0

        action = self.actor(latent_states)
        q1_new_policy = self.critic_1(latent_states, action)
        q2_new_policy = self.critic_2(latent_states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value
        value_loss = 0.5 * F.mse_loss(states_value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actor_loss = -critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q_hat = self.scale * rewards + self.gamma * new_states_value
        q1_old_policy = self.critic_1(latent_states, actions).view(-1)
        q2_old_policy = self.critic_2(latent_states, actions).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()
        self.update_network_parameters()
        return final_loss, value_loss, actor_loss, critic_loss
Ejemplo n.º 4
0
class Agent():
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8],
                 env=None, gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  name='actor', max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)                                       #sets the parameters of Target-network equals to the
                                                                                    #values of Target-network in the beginning

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:                                                              #different copies: exact VS soft
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                                     (1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self):
        print('.... saving models ....')
        self.actor.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \                                      #takes the batch
            self.memory.sample_buffer(self.batch_size)

        reward = T.tensor(reward, dtype=T.float).to(self.actor.device)                  #trasforms into tensors
        done = T.tensor(done).to(self.actor.device)
        state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device)
        state = T.tensor(state, dtype=T.float).to(self.actor.device)
        action = T.tensor(action, dtype=T.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(state_).view(-1)
        value_[done] = 0.0                                                              #####_ sta usando 0 per dire True? @15, 17

        actions, log_probs = self.actor.sample_normal(state, reparameterize=False)      #takes the lower Q-values from 2 Critics to the Critic
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs                                         ####_ Perchè non prende semplicemente il critic_value?
        value_loss = 0.5 * F.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state, reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = T.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = T.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_                       #The scaling factor takes into account the entropy and encourage exploration
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()
Ejemplo n.º 5
0
class Agent():
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=8,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = 0.99
        self.tau = tau
        self.memeory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha,
                                  input_dims,
                                  env.action_space.high,
                                  n_actions=n_actions)
        self.critic_1 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions,
                                      name='critic_1')
        self.critic_2 = CriticNetwork(beta,
                                      input_dims,
                                      n_actions,
                                      name='critic_2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = torch.Tensor([observation]).to(self.actor.device)
        actions, _ = self.actor.sample_normal(state, reparameterize=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memeory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_dict = dict(target_value_params)
        value_dict = dict(value_params)

        for name in target_value_dict:
            target_value_dict[name] = tau*value_dict[name].clone() + \
                (1-tau)*target_value_dict[name].clone()

        self.target_value.load_state_dict(target_value_dict)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value.save_checkpoint()
        self.target_value.save_checkpoint()

    def load_models(self):
        print('... loading models ...')
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value.load_checkpoint()
        self.target_value.load_checkpoint()

    def learn(self):
        if self.memeory.mem_cntr < self.batch_size:
            return

        states, new_states, actions, rewards, dones = self.memeory.sample_buffer(
            self.batch_size)
        states = torch.tensor(states, dtype=torch.float).to(self.actor.device)
        new_states = torch.tensor(new_states,
                                  dtype=torch.float).to(self.actor.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.actor.device)
        rewards = torch.tensor(rewards,
                               dtype=torch.float).to(self.actor.device)
        dones = torch.tensor(dones).to(self.actor.device)

        states_value = self.value(states).view(-1)
        new_states_value = self.target_value(new_states).view(-1)
        new_states_value[dones] = 0.0

        action, log_probs = self.actor.sample_normal(states,
                                                     reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * F.mse_loss(states_value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.optimizer.step()

        action, log_probs = self.actor.sample_normal(states,
                                                     reparameterize=True)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1(states, action)
        q2_new_policy = self.critic_2(states, action)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q_hat = self.scale * rewards + self.gamma * new_states_value
        q1_old_policy = self.critic_1(states, actions).view(-1)
        q2_old_policy = self.critic_2(states, actions).view(-1)
        critic1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic1_loss + critic2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters()

        return value_loss, actor_loss, critic_loss

    def train_on_env(self, env):
        rewards = []
        done = False
        observation = env.reset()
        while not done:
            action = self.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            self.remember(observation, action, reward, observation_, done)
            #if not load_checkpoints:
            self.learn()
            observation = observation_
            rewards.append(reward)
        return np.sum(rewards)

    def generate_session(self, env, t_max=1000):
        states, traj_probs, actions, rewards = [], [], [], []
        s = env.reset()
        q_t = 0
        for t in range(t_max):
            state = torch.Tensor([s]).to(self.actor.device)
            action, log_probs = self.actor.sample_normal(state,
                                                         reparameterize=False)
            action = action.cpu().detach().numpy()[0]

            new_s, r, done, info = env.step(action)

            log_probs = log_probs.cpu().detach().numpy()[0]
            #q_t *= probs
            q_t += log_probs[0]
            states.append(s.tolist())
            traj_probs.append(q_t)
            actions.append(action[0])
            rewards.append(r)

            s = new_s
            if done:
                break

        return np.array(states), np.array(traj_probs), np.array(
            actions), np.array(rewards)
Ejemplo n.º 6
0
class Agent():
    def __init__(self, input_dims, env, n_actions):
        self.memory = ReplayBuffer(input_dims)
        self.n_actions = n_actions

        self.actor_nn = ActorNetwork(input_dims,
                                     n_actions=n_actions,
                                     name=Constants.env_id + '_actor',
                                     max_action=env.action_space.n)
        self.critic_local_1_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_1')
        self.critic_local_2_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_2')
        self.critic_target_1_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_1')
        self.critic_target_2_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_2')
        self.value_nn = ValueNetwork(input_dims,
                                     name=Constants.env_id + '_value')
        self.target_value_nn = ValueNetwork(input_dims,
                                            name=Constants.env_id +
                                            '_target_value')
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(Constants.device)
        _, max_probability_action = self.actor_nn.sample_action(state)
        return max_probability_action

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_cntr < Hyper.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer()

        reward = T.tensor(reward, dtype=T.float).to(Constants.device)
        done = T.tensor(done).to(Constants.device)
        next_state = T.tensor(next_state, dtype=T.float).to(Constants.device)
        state = T.tensor(state, dtype=T.float).to(Constants.device)
        action = T.tensor(action, dtype=T.float).to(Constants.device)

        # value_from_nn = self.value_nn(state).view(-1)
        value_from_nn = self.value_nn(state)
        new_value_from_nn = self.target_value_nn(next_state).view(-1)
        new_value_from_nn[done] = 0.0

        (action_probabilities,
         log_action_probabilities), _ = self.actor_nn.sample_action(next_state)
        with T.no_grad():
            q1_new_policy = self.critic_target_1_nn(next_state)
            q2_new_policy = self.critic_target_2_nn(next_state)
            critic_value = T.min(q1_new_policy, q2_new_policy)

        self.value_nn.optimizer.zero_grad()
        # CHANGE0003 Soft state-value where actions are discrete
        inside_term = Hyper.alpha * log_action_probabilities - critic_value
        #value_target = action_probabilities * (critic_value - Hyper.alpha * log_action_probabilities)
        value_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        value_loss.backward(retain_graph=True)
        self.value_nn.optimizer.step()

        (action_probabilities,
         log_action_probabilities), _ = self.actor_nn.sample_action(state)
        with T.no_grad():
            q1_new_policy = self.critic_local_1_nn(state)
            q2_new_policy = self.critic_local_1_nn(state)
            critic_value = T.min(q1_new_policy, q2_new_policy)

        # CHANGE0005 Objective for policy
        actor_loss = action_probabilities * (
            Hyper.alpha * log_action_probabilities - critic_value)
        actor_loss = T.mean(actor_loss)
        self.actor_nn.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_nn.optimizer.step()

        self.critic_local_1_nn.optimizer.zero_grad()
        self.critic_local_2_nn.optimizer.zero_grad()
        q_hat = Hyper.reward_scale * reward + Hyper.gamma * new_value_from_nn
        action_logits1 = self.critic_local_1_nn(state)
        q1_old_policy = T.argmax(action_logits1, dim=1, keepdim=True).view(-1)
        action_logits2 = self.critic_local_2_nn(state)
        q2_old_policy = T.argmax(action_logits2, dim=1, keepdim=True).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_local_1_nn.optimizer.step()
        self.critic_local_2_nn.optimizer.step()
        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = Hyper.tau

        target_value_params = self.target_value_nn.named_parameters()
        value_params = self.value_nn.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)
        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                    (1-tau)*target_value_state_dict[name].clone()

        self.target_value_nn.load_state_dict(value_state_dict)

        self.update_network_parameters_line(
            self.critic_target_1_nn.named_parameters(),
            self.critic_local_1_nn.named_parameters(), tau)
        self.update_network_parameters_line(
            self.critic_target_2_nn.named_parameters(),
            self.critic_local_2_nn.named_parameters(), tau)

    def update_network_parameters_line(self, target_params, local_params, tau):
        for target_param, local_param in zip(target_params, local_params):
            target_param[1].data.copy_(tau * local_param[1].data +
                                       (1.0 - tau) * target_param[1].data)

    def save_models(self):
        print('.... saving models ....')
        self.actor_nn.save_checkpoint()
        self.value_nn.save_checkpoint()
        self.target_value_nn.save_checkpoint()
        self.critic_local_1_nn.save_checkpoint()
        self.critic_local_2_nn.save_checkpoint()
        self.critic_target_1_nn.save_checkpoint()
        self.critic_target_2_nn.save_checkpoint()

    def load_models(self):
        print('.... loading models ....')
        self.actor_nn.load_checkpoint()
        self.value_nn.load_checkpoint()
        self.target_value_nn.load_checkpoint()
        self.critic_local_1_nn.load_checkpoint()
        self.critic_local_2_nn.load_checkpoint()
        self.critic_target_1_nn.load_checkpoint()
        self.critic_target_2_nn.load_checkpoint()