Example no. 1
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        #self.noise = OUNoise(out_actor, scale=1.0 )
        self.noise = RNoise(out_actor, 0.5)
        
        self.epsilon = 1.
        self.epsilon_decay_rate = 0.999
        self.epsilon_min = 0.2

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=0.0)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0)
        
    def epsilon_decay(self):
        self.epsilon = max(self.epsilon_decay_rate*self.epsilon, self.epsilon_min)

    def act(self, obs, rand=0., add_noise=True):
        if np.random.random() < rand:
            action = np.random.randn(2)                      # sample a random action (action size hard-coded to 2)
            action = np.clip(action, -1, 1)                  # keep all action values between -1 and 1
            action = torch.tensor(action, dtype=torch.float)
        else:
            obs = obs.to(device)
            
            self.actor.eval()
            with torch.no_grad():
                action = self.actor(obs)
            self.actor.train()
            if add_noise:
                action += self.epsilon * self.noise.noise()
                action = action.squeeze(0)

        return action
    
    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise()
        return action
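
The class above relies on a module-level hard_update helper and on an RNoise class that are not shown in this example. Below is a minimal sketch of what they might look like: hard_update mirrors the method of the same name in Example no. 2, while RNoise is assumed to be simple scaled Gaussian noise exposing the noise()/reset() interface used in act().

import torch

def hard_update(target, source):
    # copy every parameter of the source network into the target network
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

class RNoise:
    """Assumed: zero-mean Gaussian noise with a fixed scale (returned on CPU)."""
    def __init__(self, action_size, scale=0.5):
        self.action_size = action_size
        self.scale = scale

    def noise(self):
        # tensor output so it can be added directly to the actor's action
        return self.scale * torch.randn(self.action_size)

    def reset(self):
        # the noise is stateless, so there is nothing to reset
        pass
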
Example no. 2
class DDPGAgent:
    def __init__(self, state_size, action_size, hidden_layers, gamma, tau,
                 lr_actor, lr_critic, weight_decay, seed):
        """Initialize DDPG agent."""
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.actor = Network(state_size,
                             hidden_layers[0],
                             hidden_layers[1],
                             action_size,
                             seed,
                             actor=True).to(device)
        self.critic = Network(2 * (state_size + action_size), hidden_layers[2],
                              hidden_layers[3], 1, seed).to(device)
        self.target_actor = Network(state_size,
                                    hidden_layers[0],
                                    hidden_layers[1],
                                    action_size,
                                    seed,
                                    actor=True).to(device)
        self.target_critic = Network(2 * (state_size + action_size),
                                     hidden_layers[2], hidden_layers[3], 1,
                                     seed).to(device)

        self.noise = OUNoise(action_size, seed, scale=1.0)
        '''
        # initialize targets same as original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)
        '''

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay)

    def act(self, state):
        """Calculate actions under current policy for a specific agent."""
        state = torch.from_numpy(state).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        action += self.noise.noise()
        return np.clip(action, -1, 1)

    def step_learn(self, experiences):
        """Update actor and critic using sampled experiences."""

        # states_list: list (length of num_agents) of 2D tensors (batch_size * state_size)
        # action_list: list (length of num_agents) of 2D tensors (batch_size * action_size)
        # rewards: 2D tensors (batch_size * num_agents)
        # next_states_list: list (length of num_agents) of 2D tensors (batch_size * state_size)
        # dones: 2D tensors (batch_size * num_agents)
        states_list, actions_list, rewards, next_states_list, dones = experiences

        next_full_states = torch.cat(next_states_list, dim=1).to(
            device)  # 2D tensor (batch_size * (num_agents*state_size))
        full_states = torch.cat(states_list, dim=1).to(
            device)  # 2D tensor (batch_size * (num_agents*state_size))
        full_actions = torch.cat(actions_list, dim=1).to(
            device)  # 2D tensor (batch_size * (num_agents*action_size))

        # update critic
        next_actions_list = [
            self.target_actor(next_states) for next_states in next_states_list
        ]
        next_full_actions = torch.cat(next_actions_list, dim=1).to(device)
        Q_target_next = self.target_critic(
            next_full_states, next_full_actions)  # 2D tensor (batch_size * 1)
        '''
        Q_target = rewards[:, idx_agent].view(-1, 1) + \
            self.gamma * Q_target_next * (1.0 - dones[:, idx_agent].view(-1, 1))
        '''
        Q_target = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        Q_predict = self.critic(full_states,
                                full_actions)  # 2D tensor (batch_size * 1)
        critic_loss = F.mse_loss(Q_predict, Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        '''
        action_pred = self.actor(states_list[idx_agent])     # 2D tensor (batch_size * action_size)
        actions_list_update = actions_list.copy()
        actions_list_update[idx_agent] = action_pred
        full_actions_update = torch.cat(actions_list_update, dim = 1).to(device)   # 2D tensor (batch_size * (num_agents*action_size))
        '''
        actions_pred = [self.actor(states) for states in states_list]
        actions_pred_tensor = torch.cat(actions_pred, dim=1).to(device)
        # actor_loss = -self.critic(full_states, full_actions_update).mean()
        actor_loss = -self.critic(full_states, actions_pred_tensor).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # soft update target networks
        self.soft_update(self.target_critic, self.critic, self.tau)
        self.soft_update(self.target_actor, self.actor, self.tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def reset(self):
        self.noise.reset()
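
Every example constructs an OUNoise object whose definition is not included. A plausible minimal implementation of an Ornstein-Uhlenbeck process matching the interface used above (a noise() method returning a tensor, reset(), and optional seed/scale arguments) is sketched below; the mu, theta and sigma defaults are assumptions, not values taken from these examples.

import numpy as np
import torch

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, action_dimension, seed=0, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # restart the process from its long-term mean
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # mean-reverting random walk: dx = theta*(mu - x) + sigma*N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dimension)
        self.state = x + dx
        return torch.from_numpy(self.state * self.scale).float()
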
Example no. 3
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-4, lr_critic=1.0e-4):
        super(DDPGAgent, self).__init__()
        
        self.state_size = in_actor
        self.action_size = out_actor 

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)
#         self.noise = OUNoise(action_size) #single agent only
        self.noise_scale = NOISE_START

        
        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)


#     def act(self, obs, noise=0.0):
#         obs = obs.to(device)
#         action = self.actor(obs) + noise*self.noise.noise().to(device)
#         return action
    
    def act(self, states, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""
        
        if self.noise_scale > NOISE_END:
            #self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION**(i_episode-EPISODES_BEFORE_TRAINING)
        #else keep the previous value
        
        if not add_noise:
            self.noise_scale = 0.0
                                    
#         states = torch.from_numpy(states).float().to(DEVICE)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)#.cpu().data.numpy()
        self.actor.train()
        
        #add noise
        actions += self.noise_scale*self.add_noise2() #works much better than OU Noise process
#         actions += self.noise_scale*self.noise.noise()
        
        return actions

    def add_noise2(self):
#         noise = 0.5*np.random.randn(self.action_size) # sigma of 0.5, as sigma of 1 would leave a lot of actions simply clipped
        noise = 0.5*torch.randn(self.action_size).to(device)  # Gaussian noise, matching the commented numpy version above
        return noise
    
    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise().to(device)
        return action
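
This example also depends on the module-level constants NOISE_START, NOISE_END, NOISE_REDUCTION and EPISODES_BEFORE_TRAINING, which are not shown. The values below are placeholders chosen only to illustrate the exponential decay schedule computed in act(); the author's actual values are not given here.

# Hypothetical values: exploration noise starts at full scale and decays
# exponentially per episode once training starts, never dropping below NOISE_END.
NOISE_START = 1.0
NOISE_END = 0.1
NOISE_REDUCTION = 0.999
EPISODES_BEFORE_TRAINING = 300

# The schedule used in act() is NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING);
# with the values above, episode 1300 gives 0.999 ** 1000 ≈ 0.37.
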
Example no. 4
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 hidden_in_dim,
                 hidden_out_dim,
                 extrem_out=64,
                 num_agents=2,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        critic_state_size = state_size * num_agents
        critic_action_size = action_size * num_agents
        self.actor = Network(state_size,
                             action_size,
                             hidden_in_dim,
                             hidden_out_dim,
                             hidden_extrem_out=extrem_out,
                             actor=True).to(device)
        self.critic = Network(critic_state_size,
                              critic_action_size,
                              hidden_in_dim,
                              hidden_out_dim,
                              hidden_extrem_out=extrem_out).to(device)
        self.target_actor = Network(state_size,
                                    action_size,
                                    hidden_in_dim,
                                    hidden_out_dim,
                                    hidden_extrem_out=extrem_out,
                                    actor=True).to(device)
        self.target_critic = Network(critic_state_size,
                                     critic_action_size,
                                     hidden_in_dim,
                                     hidden_out_dim,
                                     hidden_extrem_out=extrem_out).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=0)

        print("critic", self.critic, self.target_critic, "optim",
              self.critic_optimizer)
        print("actor", self.actor, self.target_actor, "optim",
              self.actor_optimizer)

    def act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.actor.eval()
        act = self.actor(obs, batch=batch).cpu().data
        no = noise * self.noise.noise()
        #print( "act" , act , "noise" , no)
        action = act + no
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.target_actor.eval()
        action = self.target_actor(
            obs, batch=batch).cpu().data + noise * self.noise.noise()
        self.target_actor.train()
        return np.clip(action, -1, 1)
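
The critic input sizes used throughout (2 * (state_size + action_size) in Example no. 2, state_size * num_agents and action_size * num_agents here) indicate these agents are meant for a multi-agent, MADDPG-style setup in which a wrapper gathers every agent's observation and action before feeding the centralized critics. A hypothetical wrapper along those lines, using only the act/target_act/reset interface of this example (Examples no. 1 and 3 expose a similar one), might look like:

class MultiAgentWrapper:
    """Hypothetical container, not part of the examples: one DDPGAgent per player."""
    def __init__(self, agents):
        self.agents = agents  # list of already-constructed DDPGAgent instances

    def act(self, all_obs, noise=0.0):
        # one observation tensor per agent in, one action per agent out
        return [agent.act(obs, noise) for agent, obs in zip(self.agents, all_obs)]

    def target_act(self, all_obs, noise=0.0):
        # used when building next-state actions for the centralized critic targets
        return [agent.target_act(obs, noise) for agent, obs in zip(self.agents, all_obs)]

    def reset(self):
        for agent in self.agents:
            agent.reset()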