Example 1
class Agent():
    """Interacts with and learns from the environment."""
    # Class-level attributes: the replay buffer and the critic network/optimizer are
    # shared across all Agent instances; actor sharing is kept commented out in __init__.
    memory = None
    actor_local = None
    actor_target = None
    actor_optimizer = None

    critic_local = None
    critic_target = None
    critic_optimizer = None

    instances = []

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Alternative: share a single class-level Actor network across all agents
        # (kept commented out for reference):
        # if Agent.actor_local is None:
        #     Agent.actor_local = Actor(state_size, action_size, random_seed).to(device)
        # if Agent.actor_target is None:
        #     Agent.actor_target = Actor(state_size, action_size, random_seed).to(device)
        # if Agent.actor_optimizer is None:
        #     Agent.actor_optimizer = optim.Adam(Agent.actor_local.parameters(), lr=LR_ACTOR)
        # self.actor_local = Agent.actor_local
        # self.actor_target = Agent.actor_target
        # self.actor_optimizer = Agent.actor_optimizer

        # Critic Network (w/ Target Network)
        # Per-instance alternative (commented out; the shared class-level version below is used):
        # self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Initialise class-level Critic network (shared by all agent instances)
        if Agent.critic_local is None:
            Agent.critic_local = Critic(state_size, action_size,
                                        random_seed).to(device)
        if Agent.critic_target is None:
            Agent.critic_target = Critic(state_size, action_size,
                                         random_seed).to(device)
        if Agent.critic_optimizer is None:
            Agent.critic_optimizer = optim.Adam(
                Agent.critic_local.parameters(),
                lr=LR_CRITIC,
                weight_decay=WEIGHT_DECAY)
        self.critic_local = Agent.critic_local
        self.critic_target = Agent.critic_target
        self.critic_optimizer = Agent.critic_optimizer

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory - only initialise once per class (shared by all agents)
        if Agent.memory is None:
            print("Initialising ReplayBuffer")
            Agent.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                        random_seed)
#         else:
#             print("Sharing ReplayBuffer %s", Agent.memory)

        # Register this instance - we need access to all agents' states while learning
        self.agent_num = len(Agent.instances)
        Agent.instances.append(self)
        print("Appended agent {} to Agent.instances".format(self.agent_num))

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        Agent.memory.add(state, action, reward, next_state, done)

        # only learn every n_time_steps
        if time_step % N_TIME_STEPS != 0:
            return

        # Learn, if enough samples are available in memory
        if len(Agent.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                experiences = Agent.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, noise_amplitude=0.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * noise_amplitude
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
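
Every example in this collection relies on an OUNoise helper that is not shown here. The sketch below is a minimal Ornstein-Uhlenbeck noise process consistent with the constructor calls used in these examples (OUNoise(size, seed) and OUNoise(size, seed, mu=..., theta=..., sigma=...)); the default theta and sigma values and the use of Gaussian increments are assumptions, not taken from the original code.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """'size' may be an int or a tuple, e.g. (num_agents, action_size)."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state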
Example 2
class DDPGAgent():
    """DDPG agent that interacts with and learns from the environment.

    The agent's model is implemented in 'ddpg_model.py'. It consists of two
    neural networks: one for the actor and one for the critic.

    The DDPGAgent class makes use of two other classes: ReplayBuffer and OUNoise.
    """
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Arguments:
            state_size (int) -- dimension of each state
            action_size (int) -- dimension of each action
            num_agents (int) -- number of agents (brains)
            random_seed (int) -- random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        ### Make neural networks (local and target) for both actor and critic, and set optimizers
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Initialize replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience in memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])

        # Learn every UPDATE_EVERY time steps
        if timestep % UPDATE_EVERY == 0:
            # If we have collected enough experience in our memory i.e. more
            # than the mini-batch size, then call the self.learn() function
            if len(self.memory) > BATCH_SIZE:
                # Number of updates per timestep
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        Arguments:
            state {numpy.ndarray} -- current state
            add_noise {bool} -- add noise (exploration) to the actions (default: {True})

        Returns:
            {numpy.ndarray} -- actions clipped to [-1, 1]
        """

        # Convert 'state' numpy array to pytorch tensor using the current device
        # i.e. GPU or CPU.
        state = torch.from_numpy(state).float().to(device)

        # Set the module in evaluation mode.
        self.actor_local.eval()
        with torch.no_grad():
            # Evaluate the network with the current state
            action = self.actor_local(state).cpu().data.numpy()

        # Set the module in training mode.
        self.actor_local.train()
        if add_noise:
            # Add noise to the actions to add exploration
            action += self.noise.sample()

        # Return the clipped actions
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Arguments:
            experiences {Tuple[torch.Tensor]} -- tuple of (s, a, r, s', done) tuples
            gamma {float} -- discount factor
        """

        # Unpack the sampled mini-batch of experiences (BATCH_SIZE tuples)
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- Update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradients
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        # Take one step with the optimizer
        self.critic_optimizer.step()

        # ---------------------------- Update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- Update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Arguments:
            local_model -- PyTorch model (weights will be copied from)
            target_model -- PyTorch model (weights will be copied to)
            tau (float) -- interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
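
The ReplayBuffer these agents sample from is likewise defined elsewhere. Below is a minimal sketch matching the way it is used in the examples (constructed as ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed), with add(), sample() and len()); returning the batch as torch tensors on a module-level device is an assumption based on how learn() consumes the sample.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return it as torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)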
Example 3
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
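
For context, a training loop driving an agent like DDPG_Agent above typically follows an act / step / reset cycle. The sketch below assumes a Gym-style environment (env.reset() returning a state, env.step(action) returning next_state, reward, done, info); the environment API and episode limits are assumptions, not part of the original code.

from collections import deque

import numpy as np


def train_ddpg(env, agent, n_episodes=1000, max_t=1000):
    """Run DDPG training episodes and return the per-episode scores."""
    scores = []
    scores_window = deque(maxlen=100)
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()  # reset the OU noise process each episode
        score = 0.0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        scores_window.append(score)
        print("Episode {}\tAverage score (last 100): {:.2f}".format(
            i_episode, np.mean(scores_window)))
    return scores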
Example 4
class Agent():
    def __init__(self, n_state, n_action, n_agents, random_seed, device="cpu"):
        """Initialize an Agent object.
        
        Params
        ------
            n_state : int
                dimension of each state
            n_action : int
                dimension of each action
            n_agents : int
                number of agents
            random_seed : int
                random seed
            device : str
                which device is used, "cpu" or "cuda".
        """
        self.n_state = n_state
        self.n_action = n_action
        self.n_agents = n_agents
        np.random.seed(random_seed)
        self.random_seed = random_seed  # keep the seed value (np.random.seed() returns None)
        self.device = device

        # Networks for the first agent
        # Local Actor, Local Critic, Target Actor, Target Critic
        self.actor_local1 = Actor(self.n_state, self.n_action,
                                  self.random_seed).to(self.device)
        self.actor_local1.apply(initialize_weights)
        self.critic_local1 = Critic(self.n_state * self.n_agents,
                                    self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local1.apply(initialize_weights)
        self.actor_target1 = Actor(self.n_state, self.n_action,
                                   self.random_seed).to(self.device)
        self.actor_target1.apply(initialize_weights)
        self.actor_target1.eval()
        self.critic_target1 = Critic(self.n_state * self.n_agents,
                                     self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target1.apply(initialize_weights)
        self.critic_target1.eval()

        # Networks for the second agent
        # Local Actor, Local Critic, Target Actor, Target Critic
        self.actor_local2 = Actor(self.n_state, self.n_action,
                                  self.random_seed).to(self.device)
        self.actor_local2.apply(initialize_weights)
        self.critic_local2 = Critic(self.n_state * self.n_agents,
                                    self.n_action * self.n_agents,
                                    self.random_seed).to(self.device)
        self.critic_local2.apply(initialize_weights)
        self.actor_target2 = Actor(self.n_state, self.n_action,
                                   self.random_seed).to(self.device)
        self.actor_target2.apply(initialize_weights)
        self.actor_target2.eval()
        self.critic_target2 = Critic(self.n_state * self.n_agents,
                                     self.n_action * self.n_agents,
                                     self.random_seed).to(self.device)
        self.critic_target2.apply(initialize_weights)
        self.critic_target2.eval()

        # optimizers
        self.actor_optimizer1 = optim.Adam(self.actor_local1.parameters(),
                                           lr=LR_ACTOR)
        self.actor_optimizer2 = optim.Adam(self.actor_local2.parameters(),
                                           lr=LR_ACTOR)
        self.critic_optimizer1 = optim.Adam(self.critic_local1.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)
        self.critic_optimizer2 = optim.Adam(self.critic_local2.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(n_action * 2,
                             random_seed + 1,
                             mu=0.,
                             theta=THETA,
                             sigma=SIGMA)

        # Replay Buffer
        self.memory = ReplayBuffer(n_action, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed + 2, self.device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and learn every UPDATE_EVERY steps."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        # Learn, if enough samples are available in memory
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(N_LEARNING):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state0 = torch.from_numpy(state[0]).unsqueeze(dim=0).float().to(
            self.device)
        state1 = torch.from_numpy(state[1]).unsqueeze(dim=0).float().to(
            self.device)

        self.actor_local1.eval()
        self.actor_local2.eval()
        with torch.no_grad():
            action0 = self.actor_local1(state0).cpu().data.numpy()
            action1 = self.actor_local2(state1).cpu().data.numpy()

        action = np.vstack([action0, action1])
        self.actor_local1.train()
        self.actor_local2.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            # The first 24 observation columns belong to agent 1, the rest to agent 2
            actions_next1 = self.actor_target1(next_states[:, 0:24])
            actions_next2 = self.actor_target2(next_states[:, 24:])

            actions_next = torch.cat((actions_next1, actions_next2), dim=1)
            Q_targets_next1 = self.critic_target1(next_states, actions_next)
            Q_targets_next2 = self.critic_target2(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets1 = rewards[:, 0].unsqueeze(
            dim=1) + (gamma * Q_targets_next1 *
                      (1 - dones[:, 0].unsqueeze(dim=1)))
        Q_targets2 = rewards[:, 1].unsqueeze(
            dim=1) + (gamma * Q_targets_next2 *
                      (1 - dones[:, 1].unsqueeze(dim=1)))

        # Compute critic loss
        Q_expected1 = self.critic_local1(states, actions)
        Q_expected2 = self.critic_local2(states, actions)

        critic_loss1 = F.mse_loss(Q_expected1, Q_targets1.detach())
        critic_loss2 = F.mse_loss(Q_expected2, Q_targets2.detach())
        # Minimize the loss
        self.critic_optimizer1.zero_grad()
        critic_loss1.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local1.parameters(), 1)
        self.critic_optimizer1.step()

        self.critic_optimizer2.zero_grad()
        critic_loss2.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local2.parameters(), 1)
        self.critic_optimizer2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred1 = self.actor_local1(states[:, 0:24])
        actions_pred2 = self.actor_local2(states[:, 24:])
        actions_pred = torch.cat((actions_pred1, actions_pred2), dim=1)

        actor_loss1 = -self.critic_local1(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer1.zero_grad()
        actor_loss1.backward(retain_graph=True)
        self.actor_optimizer1.step()

        actor_loss2 = -self.critic_local2(states, actions_pred).mean()
        self.actor_optimizer2.zero_grad()
        actor_loss2.backward(retain_graph=True)
        self.actor_optimizer2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local1, self.critic_target1, TAU)
        self.soft_update(self.actor_local1, self.actor_target1, TAU)
        self.soft_update(self.critic_local2, self.critic_target2, TAU)
        self.soft_update(self.actor_local2, self.actor_target2, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters

        Arguments
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
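
Example 4 initialises its networks with an initialize_weights function that is not shown. A common choice in DDPG implementations is uniform fan-in initialisation of the linear layers; the sketch below is one plausible version (an assumption, not the original helper).

import numpy as np
import torch.nn as nn


def initialize_weights(layer):
    """Uniform fan-in initialisation for Linear layers (applied via module.apply)."""
    if isinstance(layer, nn.Linear):
        fan_in = layer.weight.data.size(1)  # number of input units
        lim = 1.0 / np.sqrt(fan_in)
        layer.weight.data.uniform_(-lim, lim)
        if layer.bias is not None:
            layer.bias.data.uniform_(-lim, lim)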
Example 5
class DDPGAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 memory,
                 device='cpu',
                 params=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): Memory buffer to sample
            device (str): device string, e.g. 'cuda:0' or 'cpu'
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        self.seed = random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0],
                                 params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0],
                                  params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, params['seed'],
                                   params['critic_units'][0],
                                   params['critic_units'][1]).to(device)
        self.critic_target = Critic(state_size, action_size, params['seed'],
                                    params['critic_units'][0],
                                    params['critic_units'][1]).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size,
                             params['seed'],
                             theta=params['noise_theta'],
                             sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_weights(self, filenames):
        """Store weights of Actor/Critic

        Params
        ======
            filenames (list): filenames for storing the actor and critic weights
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        torch.save(self.actor_local.state_dict(), filenames[0])
        torch.save(self.critic_local.state_dict(), filenames[1])

    def load_weights(self, filenames):
        """Load weights of Actor/Critic

        Params
        ======
            filenames (list): filenames to load the actor and critic weights from
                              filenames[0] = actor weights
                              filenames[1] = critic weights
        """
        self.actor_local.load_state_dict(torch.load(filenames[0]))
        self.critic_local.load_state_dict(torch.load(filenames[1]))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(
                self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
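
Example 5 reads every hyperparameter from a params dict. The keys below are exactly the ones its constructor accesses; the values are illustrative defaults only. Note that the memory object passed in must expose a get_batch_size() method, as used in step().

params = {
    'seed': 0,
    'update_every': 20,          # learn every N environment steps
    'gamma': 0.99,               # discount factor
    'tau': 1e-3,                 # soft-update interpolation factor
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'actor_units': (400, 300),   # hidden-layer sizes of the actor
    'critic_units': (400, 300),  # hidden-layer sizes of the critic
    'noise_theta': 0.15,         # OU noise parameters
    'noise_sigma': 0.2,
}

# agent = DDPGAgent(state_size, action_size, memory, device='cuda:0', params=params)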
Example 6
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = seed  # keep the seed value itself (random.seed() returns None)
        self.t_step = 0  # counter for activating learning every few steps
        self.TAU = 1e-2
        self.gamma = 0.99
        self.BUFFER_SIZE = int(1e6)
        self.BATCH_SIZE = 1024
        self.LR_CRITIC = 1e-3
        self.LR_ACTOR = 1e-3
        self.WEIGHT_DECAY = 0.0
        self.EPSILON = 1.0
        self.EPSILON_DECAY = 0.99

        # Actor network (w/ target network)
        self.actor_local = Actor(self.state_size, self.action_size,
                                 seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)

    def act(self, state, add_noise=True):
        """ Given a state choose an action
        Params
        ======
            state (float ndarray): state of the environment        
        """

        state = torch.from_numpy(state).unsqueeze(0).float().to(device)
        # Set the network to eval mode; this only affects certain modules (Dropout, BatchNorm, etc.)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().squeeze(0).data.numpy()

        self.actor_local.train()  # set the network back to train mode
        if add_noise:
            action += self.noise.sample() * self.EPSILON

        return np.clip(action, -1, 1)

    def reset(self):

        self.noise.reset()

    def learn(self, experiences):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Policy loss = -(1/n) * Σ Q_local(s, actor_local(s))  (deterministic policy, no log prob)
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        self.EPSILON *= self.EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
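
One detail all of these snippets leave implicit is how the target networks are initialised: soft updates only interpolate, so many DDPG implementations also copy the local weights into the targets once at construction time so both start identical. A minimal way to do that (an addition, not something present in the examples) is:

def hard_update(local_model, target_model):
    """Copy local network weights into the target network (equivalent to tau = 1)."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(local_param.data)

# e.g. at the end of __init__:
# hard_update(self.actor_local, self.actor_target)
# hard_update(self.critic_local, self.critic_target)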
Example 7
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Prioritized replay memory
        self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE,
                                                    seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): 'train' or 'test'; in train mode, OU noise is added to the action
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # add new experience in memory
        self.prioritized_memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn only once the replay buffer is completely full
            if len(self.prioritized_memory) >= BUFFER_SIZE:
                for _ in range(10):  # update 10 times per learning
                    idxes, experiences, is_weights = self.prioritized_memory.sample(
                        device)
                    self.learn(experiences,
                               GAMMA,
                               is_weights=is_weights,
                               leaf_idxes=idxes)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, is_weights, leaf_idxes):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Policy loss = -(1/n) * Σ Q_local(s, actor_local(s))  (deterministic policy, no log prob)

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            is_weights (tensor array): importance-sampling weights for prioritized experience replay
            leaf_idxes (numpy array): indexes for update priorities in SumTree
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        rewards = rewards  # TODO: clip rewards to [-1, 1] (currently a no-op)

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss

        Q_expected = self.critic_local(states, actions)
        # Squash TD-errors into (-1, 1) with tanh (soft clipping)
        td_errors = (Q_targets - Q_expected).tanh()
        abs_errors = td_errors.abs().cpu().data.numpy()  # pull back to cpu
        self.prioritized_memory.batch_update(
            leaf_idxes, abs_errors)  # update priorities in SumTree

        # Weight the squared TD loss by the importance-sampling weights
        c_loss = (is_weights * (td_errors ** 2)).mean()
        self.running_c_loss += float(c_loss.cpu().data.numpy())
        self.training_cnt += 1

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        c_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       1)  # clip gradient to max 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        a_loss = self.critic_local(states, actions_pred)
        a_loss = -a_loss.mean()
        self.running_a_loss += float(a_loss.cpu().data.numpy())

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(),
                                       1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
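
Example 7 depends on a PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE, seed) class that is not shown, offering add(), sample(device) returning (idxes, experiences, is_weights), batch_update(idxes, abs_errors) and len(). The sketch below matches that interface but uses a plain priority array with proportional sampling instead of a SumTree; the alpha, beta and epsilon constants are assumptions.

import random
from collections import namedtuple

import numpy as np
import torch


class PrioritizedMemory:
    """Simplified proportional prioritized replay (array-based, no SumTree)."""

    def __init__(self, batch_size, buffer_size, seed, alpha=0.6, beta=0.4, eps=1e-5):
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data = []                                  # stored experience tuples
        self.priorities = np.zeros(buffer_size, dtype=np.float64)
        self.pos = 0
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)
        np.random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store an experience with maximal priority so it is sampled at least once."""
        max_prio = self.priorities[:len(self.data)].max() if self.data else 1.0
        e = self.experience(state, action, reward, next_state, done)
        if len(self.data) < self.buffer_size:
            self.data.append(e)
        else:
            self.data[self.pos] = e
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.buffer_size

    def sample(self, device):
        """Sample proportionally to priority; return indices, tensor batch and IS weights."""
        prios = self.priorities[:len(self.data)] ** self.alpha
        probs = prios / prios.sum()
        idxes = np.random.choice(len(self.data), self.batch_size, p=probs)
        batch = [self.data[i] for i in idxes]

        weights = (len(self.data) * probs[idxes]) ** (-self.beta)
        weights = weights / weights.max()
        is_weights = torch.from_numpy(weights.reshape(-1, 1)).float().to(device)

        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)

        return idxes, (states, actions, rewards, next_states, dones), is_weights

    def batch_update(self, idxes, abs_errors):
        """Refresh priorities of the sampled transitions with their new |TD-error|."""
        for i, err in zip(idxes, np.asarray(abs_errors).flatten()):
            self.priorities[i] = float(err) + self.eps

    def __len__(self):
        return len(self.data)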
Example 8
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, mode):
        '''Returns actions for given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): 'train' or 'test'; in train mode, OU noise is added to the action
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.running_c_loss += float(critic_loss.cpu().data.numpy())
        self.training_cnt += 1
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.running_a_loss += float(actor_loss.cpu().data.numpy())
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
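
Examples 7 and 8 (like several of the others) reference module-level constants and a device object that are defined elsewhere in their projects. A typical set of definitions, with values chosen here purely as an illustration, is:

import torch

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # mini-batch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay
LEARN_EVERY_STEP = 20    # environment steps between learning phases

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")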
Example 9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, num_agents, state_size, action_size, random_seed,
                 buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic,
                 weight_decay, a_hidden_sizes, c_hidden_sizes):
        """Initialize an Agent object.
        
        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay:
                training hyperparameters
            a_hidden_sizes, c_hidden_sizes: hidden-layer sizes for the actor and critic
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.ACTOR_HL_SIZE = a_hidden_sizes
        self.CRITIC_HL_SIZE = c_hidden_sizes
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local_1 = Actor(state_size, action_size, random_seed,
                                   self.ACTOR_HL_SIZE).to(device)
        self.actor_target_1 = Actor(state_size, action_size, random_seed,
                                    self.ACTOR_HL_SIZE).to(device)
        self.actor_optimizer_1 = optim.Adam(self.actor_local_1.parameters(),
                                            lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local_1 = Critic(state_size, action_size, random_seed,
                                     self.CRITIC_HL_SIZE).to(device)
        self.critic_target_1 = Critic(state_size, action_size, random_seed,
                                      self.CRITIC_HL_SIZE).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_local_1.parameters(),
                                             lr=self.LR_CRITIC,
                                             weight_decay=self.WEIGHT_DECAY)

        # Actor Network (w/ Target Network)
        self.actor_local_2 = Actor(state_size, action_size, random_seed,
                                   self.ACTOR_HL_SIZE).to(device)
        self.actor_target_2 = Actor(state_size, action_size, random_seed,
                                    self.ACTOR_HL_SIZE).to(device)
        self.actor_optimizer_2 = optim.Adam(self.actor_local_2.parameters(),
                                            lr=self.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local_2 = Critic(state_size, action_size, random_seed,
                                     self.CRITIC_HL_SIZE).to(device)
        self.critic_target_2 = Critic(state_size, action_size, random_seed,
                                      self.CRITIC_HL_SIZE).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
                                             lr=self.LR_CRITIC,
                                             weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(states.shape[0]):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, self.GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local_1.eval()
        self.actor_local_2.eval()
        action_values = [None, None]  # one action per agent (two agents)
        with torch.no_grad():
            action_values[0] = self.actor_local_1(states[0]).cpu().data.numpy()
            action_values[1] = self.actor_local_2(states[1]).cpu().data.numpy()
        self.actor_local_1.train()
        self.actor_local_2.train()

        #print (action_values)
        if add_noise:
            action_values += self.noise.sample()
        #print (action_values)
        #print (np.clip(action_values, -1, 1))
        return np.clip(action_values, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next_1 = self.actor_target_1(next_states)
        actions_next_2 = self.actor_target_2(next_states)
        Q_targets_next_1 = self.critic_target_1(next_states,
                                                actions_next_1.detach())
        Q_targets_next_2 = self.critic_target_2(next_states,
                                                actions_next_2.detach())
        # Compute Q targets for current states (y_i)
        Q_targets_1 = rewards + (gamma * Q_targets_next_1 * (1 - dones))
        Q_targets_2 = rewards + (gamma * Q_targets_next_2 * (1 - dones))
        # Compute critic loss
        Q_expected_1 = self.critic_local_1(states, actions)
        Q_expected_2 = self.critic_local_2(states, actions)
        critic_loss_1 = F.mse_loss(Q_expected_1, Q_targets_1.detach())
        critic_loss_2 = F.mse_loss(Q_expected_2, Q_targets_2.detach())
        # Minimize the loss
        self.critic_optimizer_1.zero_grad()
        self.critic_optimizer_2.zero_grad()
        critic_loss_1.backward()
        critic_loss_2.backward()
        # Optional gradient clipping to stabilise learning:
        # torch.nn.utils.clip_grad_norm_(self.critic_local_1.parameters(), 1)
        # torch.nn.utils.clip_grad_norm_(self.critic_local_2.parameters(), 1)
        self.critic_optimizer_1.step()
        self.critic_optimizer_2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred_1 = self.actor_local_1(states)
        actions_pred_2 = self.actor_local_2(states)
        actor_loss_1 = -self.critic_local_1(states, actions_pred_1).mean()
        actor_loss_2 = -self.critic_local_2(states, actions_pred_2).mean()
        # Minimize the loss
        self.actor_optimizer_1.zero_grad()
        self.actor_optimizer_2.zero_grad()
        actor_loss_1.backward()
        actor_loss_2.backward()
        self.actor_optimizer_1.step()
        self.actor_optimizer_2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local_1, self.critic_target_1, self.TAU)
        self.soft_update(self.critic_local_2, self.critic_target_2, self.TAU)
        self.soft_update(self.actor_local_1, self.actor_target_1, self.TAU)
        self.soft_update(self.actor_local_2, self.actor_target_2, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
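The class above leaves the training loop to the caller. Below is a minimal sketch of how it could be driven; the environment interface (returning stacked per-agent observations, rewards, and done flags, one row per agent) and the train function itself are assumptions, not part of the original example.

import numpy as np

def train(agent, env, n_episodes=2000, max_t=1000):
    # Hypothetical driver; `env` is assumed to expose a gym-like interface
    # that returns one row per agent, matching the two-agent layout above.
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                      # shape: (2, state_size)
        agent.reset()                             # reset the OU noise process
        episode_scores = np.zeros(2)
        for _ in range(max_t):
            actions = agent.act(states)           # one row of actions per agent
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            episode_scores += np.array(rewards)
            if np.any(dones):
                break
        scores.append(np.max(episode_scores))
    return scores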
Example n. 10
class Agent:
    def __init__(self, state_size=OBS_DIM, action_size=ACT_DIM, random_seed=0):
        """Initialize an Agent object.
        Params
        =====
            state_size (int): dimension of the observation
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state):
        """Save an experience in the replay buffer and use random samples from the buffer to learn."""
        self.memory.add(state, action, reward, next_state)

        # begin to learn once enough samples are available in the replay buffer
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return actions for given state as per current policy."""
        state = state[None, :]
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples
        Q_target = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q_value

        Params
        =====
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s')
            gamma (float): discount factor
        """
        states, actions, rewards, next_states = experiences

        # ----------------- update critic network weights ---------------- #
        # get predicted next_state actions and Q_values from target models
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        # compute Q targets for current states
        q_targets = rewards + gamma * q_targets_next
        # compute critic loss
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)
        # minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------- update actor network weights ---------------- #
        # compute the loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------- update target networks ------------------ #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ----------------- update noise -------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters
        θ_target = τ * θ_local + (1 - τ) * θ_target
        Params
        =====
            local_model: Network weights to be copied from
            target_model: Network weights to be copied to
            tau(float): interpolation parameter
        """
        for local_param, target_param in zip(local_model.parameters(),
                                             target_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def restore(self, save_path):
        actor_checkpoint = torch.load(save_path + '/checkpoint_actor.pth')
        self.actor_local.load_state_dict(actor_checkpoint)
        critic_checkpoint = torch.load(save_path + '/checkpoint_critic.pth')
        self.critic_local.load_state_dict(critic_checkpoint)
        print('Successfully loaded network weights!')
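The restore method expects two checkpoint files on disk, but the example never shows how they are written. A minimal sketch of the saving side, assuming the same file names (the save_checkpoints helper is not part of the original code):

import os
import torch

def save_checkpoints(agent, save_path):
    # Hypothetical counterpart to Agent.restore: writes the two files it loads.
    os.makedirs(save_path, exist_ok=True)
    torch.save(agent.actor_local.state_dict(),
               os.path.join(save_path, 'checkpoint_actor.pth'))
    torch.save(agent.critic_local.state_dict(),
               os.path.join(save_path, 'checkpoint_critic.pth'))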
Example n. 11
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 brain_name,
                 seed,
                 params=default_params,
                 device=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        params = self._fill_params(params)

        # implementation and identity
        self.device = device if device is not None else torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.name = params['name']
        self.brain_name = brain_name

        # set environment information
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 seed,
                                 fc1_units=params['layers_actor'][0],
                                 fc2_units=params['layers_actor'][1]).to(
                                     self.device)

        self.actor_target = Actor(state_size,
                                  action_size,
                                  seed,
                                  fc1_units=params['layers_actor'][0],
                                  fc2_units=params['layers_actor'][1]).to(
                                      self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=params['lr_actor'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   seed,
                                   fcs1_units=params['layers_critic'][0],
                                   fc2_units=params['layers_critic'][1]).to(
                                       self.device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    seed,
                                    fcs1_units=params['layers_critic'][0],
                                    fc2_units=params['layers_critic'][1]).to(
                                        self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['lr_critic'],
                                           weight_decay=params['weight_decay'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   params['buffer_size'],
                                   params['batch_size'],
                                   seed,
                                   device=self.device)

        # save params
        self.params = params

    def _fill_params(self, src_params):
        keys = [
            'name', 'buffer_size', 'batch_size', 'layers_actor',
            'layers_critic', 'lr_actor', 'lr_critic', 'gamma', 'tau',
            'weight_decay'
        ]
        return {
            key: self._get_param_or_default(key, src_params, default_params)
            for key in keys
        }

    def display_params(self, force_print=False):
        if force_print:
            print(self.params)
        return self.params

    def _get_param_or_default(self, key, src_params, default_params):
        if key in src_params:
            return src_params[key]
        else:
            return default_params[key]

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def start_learn(self):
        # Learn, if enough samples are available in memory
        # decoupled from step method to allow multiple steps per learning pass
        if len(self.memory) > self.params['batch_size']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['gamma'])

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.params['tau'])
        self.soft_update(self.actor_local, self.actor_target,
                         self.params['tau'])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
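As a usage note, DDPG_Agent merges whatever keys the caller supplies with default_params, so a partial override dictionary is enough. The sketch below is illustrative only: the state/action sizes, brain name, and override values are placeholders, and default_params is assumed to define every remaining key.

overrides = {
    'name': 'ddpg-small',
    'layers_actor': (128, 64),
    'layers_critic': (128, 64),
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
}

# Keys missing from `overrides` (buffer_size, gamma, tau, ...) fall back to
# default_params inside _fill_params.
agent = DDPG_Agent(state_size=33,
                   action_size=4,
                   brain_name='ReacherBrain',
                   seed=0,
                   params=overrides)
agent.display_params(force_print=True)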
Example n. 12
class DDPGAgent:
    def __init__(self,
                 action_size=4,
                 state_size=33,
                 num_agents=20,
                 max_steps=1000,
                 seed=0,
                 train_mode=True):
        self.train_mode = train_mode
        self.action_size = action_size
        self.state_size = state_size
        self.num_agents = num_agents
        self.max_steps = max_steps

        self.step_count = 0
        self.scores = np.zeros(self.num_agents)
        self.states, self.actions, self.rewards, self.next_states, self.dones = None, None, None, None, None

        self.noise = OUNoise(self.action_size, seed)
        self.memory = AgentMemory(batch_size=BATCH_SIZE,
                                  buffer_size=MEMORY_BUFFER,
                                  seed=seed)

        self.actor = Actor(self.state_size, self.action_size, seed)
        self.critic = Critic(self.state_size, self.action_size, seed)

        self.target_actor = Actor(self.state_size, self.action_size, seed)
        self.target_critic = Critic(self.state_size, self.action_size, seed)

        self.actor_opt = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        hard_update(self.actor, self.target_actor)
        hard_update(self.critic, self.target_critic)

    def reset(self):
        self.noise.reset()
        self.step_count = 0
        self.scores = np.zeros(self.num_agents)
        self.states, self.actions, self.rewards, self.next_states, self.dones = None, None, None, None, None

    def step(self):
        self.scores += np.array(self.rewards)
        self.step_count += 1
        self.memory.add(self.states, self.actions, self.rewards,
                        self.next_states, self.dones)

        if self.memory.has_enough_memory():
            for _ in range(UPDATE_FREQUENCY_PER_STEP):
                states, actions, rewards, next_states, dones = self.memory.sample()
                self.learn(states, actions, rewards, next_states, dones)
                self.soft_update()

    def act(self, add_noise=True):
        states = array_to_tensor(self.states)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)
            actions = actions.cpu().data.numpy()
        self.actor.train()

        if add_noise:
            noise = self.noise.sample()
            actions += noise

        actions = np.clip(actions, -1, 1)
        return actions

    def learn(self, states, actions, rewards, next_states, dones):
        # Update critic
        self.critic_opt.zero_grad()
        critic_loss = ddpg_compute_critic_loss(states, actions, rewards,
                                               next_states, dones,
                                               self.target_actor,
                                               self.target_critic, self.critic)
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_opt.step()

        # Update actor
        self.actor_opt.zero_grad()
        actor_loss = ddpg_compute_actor_loss(states, self.actor, self.critic)
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_opt.step()

        # Update target nets
        self.soft_update()

    def soft_update(self):
        soft_update(self.actor, self.target_actor, TAU)
        soft_update(self.critic, self.target_critic, TAU)
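This last example relies on several helpers (array_to_tensor, hard_update, soft_update, ddpg_compute_critic_loss, ddpg_compute_actor_loss) that are not shown. The following is a sketch of typical DDPG-style implementations consistent with how they are called above; it is a guess at the originals, and the gamma default in particular is an assumption.

import numpy as np
import torch
import torch.nn.functional as F

def array_to_tensor(arr, device='cpu'):
    # Convert a numpy array (or list) to a float tensor on the given device.
    return torch.from_numpy(np.asarray(arr, dtype=np.float32)).to(device)

def hard_update(source, target):
    # Copy weights from source into target verbatim (used at construction time).
    target.load_state_dict(source.state_dict())

def soft_update(source, target, tau):
    # Blend target weights towards source:
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

def ddpg_compute_critic_loss(states, actions, rewards, next_states, dones,
                             target_actor, target_critic, critic, gamma=0.99):
    # MSE between the local critic's Q(s, a) and the bootstrapped TD target.
    with torch.no_grad():
        next_actions = target_actor(next_states)
        q_targets = rewards + gamma * target_critic(next_states,
                                                    next_actions) * (1 - dones)
    q_expected = critic(states, actions)
    return F.mse_loss(q_expected, q_targets)

def ddpg_compute_actor_loss(states, actor, critic):
    # Deterministic policy gradient surrogate: maximise Q(s, pi(s)).
    return -critic(states, actor(states)).mean()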