Code Example #1
File: agent.py Project: alessandroleite/tennis-rl
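# NOTE: This excerpt assumes the usual imports are already in scope
# (numpy as np, torch, torch.nn.functional as F, torch.optim as optim,
# typing.Mapping), together with the project's Actor, Critic and Ornstein
# classes and the `device` constant; none of them are shown here.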
class DDPGAgent(object):
    """Interacts with and learns from the environment."""
    def __init__(self, id, state_size, action_size, seed, memory, num_agents,
                 hyperparameters: Mapping[str, float]):
        """Initialize a DDPG agent object.
        
        Params
        ======
            id (int): agent's id
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): replay buffer to store the experience of this agent
            num_agents (int): number of agents in the environment (used to size the critic's input)
            hyperparameters (Mapping[str, float]): hyperparameter values of the model. The expected keys are:
             - batch_size (int): minibatch size
             - lr_actor (float): learning rate of the actor
             - lr_critic (float): learning rate of the critic
             - gamma (float): discount factor
             - weight_decay (float): critic L2 weight decay
             - tau (float): value for soft update of target parameters
             - update_frequency (int): how many steps must be executed before learning starts
             - n_learns (int): how many learning passes per update
        """
        self.id = id
        self.__name__ = 'DDPG'
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = hyperparameters['gamma']
        self.batch_size = int(hyperparameters['batch_size'])
        self.tau = hyperparameters['tau']

        self.update_frequency = int(hyperparameters['update_frequency'])
        self.n_learns = int(hyperparameters['n_learns'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, num_agents,
                                   seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters['lr_critic'],
            weight_decay=hyperparameters['weight_decay'])

        # Noise process
        self.noise = Ornstein(action_size)

        # Replay memory
        self.memory = memory

        # Initialize the time step (learning is triggered every update_frequency steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, other_states,
             other_actions, other_next_states):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done, other_states,
                        other_actions, other_next_states)

        self.t_step = (self.t_step + 1) % self.update_frequency
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            for _ in range(self.n_learns):
                if len(self.memory) > self.batch_size:
                    experiences = self.memory.sample(self.batch_size)
                    self.learn(experiences, self.gamma)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): batch of (s, a, r, s', done, other_s, other_a, other_s') tensors
            gamma (float): discount factor
        """
        states, _, _, _, _, other_states, _, _ = experiences

        self.update_critic(experiences, gamma)
        self.update_actor(states, other_states)
        self.update_target_networks()

    def update_critic(self, experiences, gamma):
        """Update the critic network given the experiences"""

        states, actions, rewards, next_states, dones, other_states, other_actions, other_next_states = experiences

        all_states = torch.cat((states, other_states), dim=1).to(device)
        all_actions = torch.cat((actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((next_states, other_next_states),
                                    dim=1).to(device)

        local_all_next_actions = []
        local_all_next_actions.append(self.actor_target(states))
        local_all_next_actions.append(self.actor_target(other_states))
        all_next_actions = torch.cat(local_all_next_actions, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, states, other_states):
        """Update the actor (policy) network using the sampled states of both agents."""
        all_states = torch.cat((states, other_states), dim=1).to(device)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        other_actions_pred = self.actor_local(other_states)
        other_actions_pred = other_actions_pred.detach()

        actions_pred = torch.cat((actions_pred, other_actions_pred),
                                 dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):
        """Soft-update both target networks towards their local counterparts."""
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
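
A minimal usage sketch for this agent, assuming the project's ReplayBuffer class and a two-agent (Tennis-style) environment. The hyperparameter keys mirror the constructor docstring above; the concrete values, state/action sizes and the ReplayBuffer interface are illustrative assumptions rather than part of the original source.

# Illustrative setup; values and the ReplayBuffer constructor are assumptions.
hyperparameters = {'batch_size': 128, 'lr_actor': 1e-4, 'lr_critic': 1e-3,
                   'gamma': 0.99, 'weight_decay': 0.0, 'tau': 1e-3,
                   'update_frequency': 1, 'n_learns': 1}
memory = ReplayBuffer(...)  # constructor arguments depend on the project's ReplayBuffer
agent = DDPGAgent(id=0, state_size=24, action_size=2, seed=0,
                  memory=memory, num_agents=2, hyperparameters=hyperparameters)

# Inside an episode loop (environment interaction omitted):
#   action = agent.act(state)              # state: np.ndarray of shape (state_size,)
#   agent.step(state, action, reward, next_state, done,
#              other_state, other_action, other_next_state)
# agent.reset() resets the OU noise process between episodes.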
Code Example #2
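# NOTE: This example likewise assumes torch, torch.nn.functional as F and
# torch.optim as optim are imported, and that the Actor, Critic, OUNoise and
# ExperienceReplayLog classes are defined elsewhere in the project.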
class DDPGAgent:
    def __init__(self,
                 env,
                 gamma,
                 tau,
                 buffer_maxlen,
                 critic_learning_rate,
                 actor_learning_rate,
                 max_action=1):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action

        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim,
                                    self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim,
                           self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim,
                                  self.max_action).to(self.device)

        # Copy the initial parameters into the critic and actor target networks
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        # Set Optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        # Run the deterministic policy, then optionally add exploration noise.
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()

        if self.noisy:
            action = self.noise.get_action(action, self.iter)
            self.iter += 1

        return action

    def update(self, batch_size):
        """Sample a batch from the replay buffer and update the critic, the actor
        and both target networks."""
        state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(
            batch_size)
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Q info updates
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch,
                                            next_actions.detach())
        # masks (assumed to be 1 - done) zero out the bootstrapped value at terminal states
        expected_Q = reward_batch + self.gamma * masks * next_Q

        # Update Critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward()

        self.critic_optimizer.step()

        # Update Actor network
        policy_loss = -self.critic.forward(
            state_batch, self.actor.forward(state_batch)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Update Actor and Critic target networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))
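
A hedged training-loop sketch for this second agent. It assumes a Gym-style env already passed to the constructor, toggles exploration noise through the `noisy` flag, and calls update(batch_size) once the buffer holds enough transitions; the push/__len__ interface of ExperienceReplayLog and all numeric values below are assumptions.

# Illustrative training loop; hyperparameter values and the replay-buffer
# interface (push, __len__) are assumptions.
agent = DDPGAgent(env, gamma=0.99, tau=1e-3, buffer_maxlen=100000,
                  critic_learning_rate=1e-3, actor_learning_rate=1e-4)
agent.noisy = True              # add OU exploration noise in get_action
batch_size = 64

state = env.reset()
for t in range(1000):
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    agent.replay_buffer.push(state, action, reward, next_state, done)  # method name assumed
    if len(agent.replay_buffer) > batch_size:                          # __len__ assumed
        agent.update(batch_size)
    state = env.reset() if done else next_state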