Example #1
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.MU = 0
        self.THETA = 0.15
        self.SIGMA = 0.10
        self.GAMMA = 0.99
        self.TAU = 0.001
        self.BATCHS = 256
        self.MAX_REWARD = -999999999

        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        self.noiseObj = Noise(self.action_size, self.MU, self.THETA,
                              self.SIGMA)
        self.replayObj = Replay(self.BATCHS)
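
The `Noise` object constructed above is not shown in this snippet. Below is a minimal sketch of an Ornstein-Uhlenbeck noise process with the constructor signature and the `reset()`/`sample()` methods the agent assumes; the class internals are an assumption, not the project's actual code.

import numpy as np


class Noise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the OU dynamics one step and return the new state as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state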
Example #2
File: agent.py Project: shintay/rl
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.125  # other values tried: 0.14, 0.1
        self.exploration_sigma = 0.0009  # other values tried: 0.001, 0.2
        self.noise = OUNoise(self.action_size,
                             self.exploration_mu,
                             self.exploration_theta,
                             self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size,
                                   self.batch_size)

        # Algorithm parameters
        self.gamma = 0.998  # discount factor (other values tried: 0.99, 0.9)
        self.tau = 0.099  # soft-update rate for target parameters (other values tried: 0.001, 0.01, 0.1, 0.05)

        # Score tracker
        self.best_score = -np.inf
        self.score = 0
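
The `ReplayBuffer` referenced here (and by the other Keras-based examples below) is not shown. What the agents assume is an `add(...)` method, a `sample()` that returns a list of experience namedtuples with `.state`, `.action`, `.reward`, `.next_state`, `.done` fields, and `__len__`. A minimal sketch of that interface; the internals are assumptions:

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples, sampled uniformly at random."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # drop the oldest experience when full
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a new experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a random batch of experiences."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        """Current number of stored experiences."""
        return len(self.memory)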
Example #3
    def __init__(self, state_size, action_size, single_rotor_control=False, buffer_size=int(1e5),
                 batch_size=128, gamma=0.98, tau=1e-3, lr_actor=1e-4, lr_critic=1e-3, random_seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # rotor control mode
        self.single_rotor_control = single_rotor_control

        if self.single_rotor_control:
            action_size = 1

        # define hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        random.seed(random_seed)  # random.seed() returns None, so there is nothing useful to store from it

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(buffer_size, random_seed)

        # counter for time steps
        self.time_step = 0
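
This snippet relies on module-level names defined elsewhere in the file: `device`, `WEIGHT_DECAY`, and the usual imports. A plausible header is sketched below; the `WEIGHT_DECAY` value is illustrative, not the project's actual setting.

import random

import numpy as np
import torch
import torch.optim as optim

# run on the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

WEIGHT_DECAY = 0.0  # L2 penalty for the critic optimizer (illustrative value)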
Example #4
    def __init__(self, task, gym=False):
        self.task = task

        if gym:
            self.state_size = np.prod(task.observation_space.shape)
            self.action_size = np.prod(task.action_space.shape)
            self.action_low = task.action_space.low
            self.action_high = task.action_space.high
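            # NOTE: alpha and beta are only read from the task in the non-gym branch below,
            # yet the Actor/Critic constructors further down always use self.alpha/self.beta.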

        else:
            self.state_size = task.state_size
            self.action_size = task.action_size
            self.action_low = task.action_low
            self.action_high = task.action_high
            self.alpha = task.alpha
            self.beta = task.beta

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.alpha)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.alpha)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.beta)
        self.critic_target = Critic(self.state_size, self.action_size, self.beta)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0                 # unchanged from 0
        self.exploration_theta = 0.15           # unchanged from 0.15
        self.exploration_sigma = 3.2            # changed from 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000        # was 100000 originally
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor, 0.99
        self.tau = .001  # for soft update of target parameters, 0.01 originally
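
When `gym=True`, the constructor flattens the observation and action spaces of a Gym environment. A small standalone illustration of what `np.prod(space.shape)` and the `low`/`high` bounds look like for `Box` spaces; the shapes and bounds below are made up for the example.

import numpy as np
from gym import spaces

observation_space = spaces.Box(low=-1.0, high=1.0, shape=(3, 4), dtype=np.float32)
action_space = spaces.Box(low=0.0, high=900.0, shape=(4,), dtype=np.float32)

state_size = np.prod(observation_space.shape)   # 12: the flattened observation length
action_size = np.prod(action_space.shape)       # 4
print(state_size, action_size)
print(action_space.low, action_space.high)      # per-dimension bounds, as used by the Actor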
Example #5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15  # default: 0.15
        self.exploration_sigma = 0.25  # increased from the default 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.9  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        self.total_reward = 0
        self.count = 0
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.count += 1
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
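
The `Actor` and `Critic` classes this agent drives expose a Keras model plus two extras used in `learn()`: `Critic.get_action_gradients` (gradient of Q with respect to the actions) and `Actor.train_fn` (applies those gradients to the policy). Below is a compact sketch of that interface, written against the standalone Keras 2 API with a TensorFlow 1.x graph-mode backend; the layer sizes, optimizer settings, and internals are placeholders, not the original project's code.

from keras import layers, models, optimizers
from keras import backend as K


class Actor:
    """Deterministic policy: maps states to actions in [action_low, action_high]."""

    def __init__(self, state_size, action_size, action_low, action_high):
        self.state_size, self.action_size = state_size, action_size
        self.action_low, self.action_range = action_low, action_high - action_low

        states = layers.Input(shape=(self.state_size,), name='states')
        net = layers.Dense(32, activation='relu')(states)
        net = layers.Dense(64, activation='relu')(net)
        raw_actions = layers.Dense(self.action_size, activation='sigmoid')(net)
        actions = layers.Lambda(lambda x: x * self.action_range + self.action_low)(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Policy-gradient step: move the actions in the direction that increases Q,
        # using the action gradients supplied by the critic.
        action_gradients = layers.Input(shape=(self.action_size,))
        loss = K.mean(-action_gradients * actions)
        updates_op = optimizers.Adam(lr=1e-4).get_updates(
            params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[], updates=updates_op)


class Critic:
    """Action-value function Q(s, a), trained with an MSE loss on TD targets."""

    def __init__(self, state_size, action_size):
        states = layers.Input(shape=(state_size,), name='states')
        actions = layers.Input(shape=(action_size,), name='actions')
        net = layers.Concatenate()([layers.Dense(32, activation='relu')(states),
                                    layers.Dense(32, activation='relu')(actions)])
        net = layers.Dense(64, activation='relu')(net)
        Q_values = layers.Dense(1, name='q_values')(net)
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(lr=1e-3), loss='mse')

        # Gradient of Q with respect to the actions, consumed by Actor.train_fn.
        action_gradients = K.gradients(Q_values, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)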
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, single_rotor_control=False, buffer_size=int(1e5),
                 batch_size=128, gamma=0.98, tau=1e-3, lr_actor=1e-4, lr_critic=1e-3, random_seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # rotor control mode
        self.single_rotor_control = single_rotor_control

        if self.single_rotor_control:
            action_size = 1

        # define hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        random.seed(random_seed)  # random.seed() returns None, so there is nothing useful to store from it

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(buffer_size, random_seed)

        # counter for time steps
        self.time_step = 0

        #self.soft_update(self.critic_local, self.critic_target, 1.0)
        #self.soft_update(self.actor_local, self.actor_target, 1.0)
    
    def step(self,state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        
        # Save experience / reward
        if self.single_rotor_control:
            self.memory.add(state, action[0], reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences, self.gamma)

        # increase time step count
        self.time_step += 1

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states.unsqueeze(0)).cpu().data.numpy()[0]
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()

        if self.single_rotor_control:
            # repeat the single learned rotor speed across all rotors
            actions = np.repeat(actions, self.action_size)

        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        
        states, actions, rewards, next_states, dones, idxs, is_weights = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Q value predictions
        Q_expected = self.critic_local(states, actions)

        # compute td error
        td_error = Q_targets - Q_expected
        # update td error in Replay buffer
        self.memory.update_priorities(idxs,td_error.detach().cpu().numpy().squeeze())

        # compute critic loss
        critic_loss = ((is_weights*td_error)**2).mean()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -(is_weights * self.critic_local(states, actions_pred).squeeze()).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
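
Unlike the uniform buffer sketched earlier, the `ReplayBuffer` assumed here is prioritized: `sample(batch_size)` returns tensors plus the sampled indices and importance-sampling weights, and `update_priorities(idxs, td_errors)` feeds the TD errors back. A minimal proportional-prioritization sketch of that interface (O(n) sampling, fixed beta; all internals are assumptions). The weights are returned with shape (batch_size, 1) so they broadcast cleanly against the (batch_size, 1) TD errors in the critic loss.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Proportional prioritized experience replay (simple list-based version)."""

    def __init__(self, buffer_size, seed, alpha=0.6, beta=0.4, eps=1e-5):
        self.memory = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)
        np.random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # new experiences get the current maximum priority so they are sampled at least once
        max_priority = max(self.priorities) if self.priorities else 1.0
        self.memory.append(self.experience(state, action, reward, next_state, done))
        self.priorities.append(max_priority)

    def sample(self, batch_size):
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idxs = np.random.choice(len(self.memory), batch_size, p=probs)
        experiences = [self.memory[i] for i in idxs]

        # importance-sampling weights, normalized by their maximum
        weights = (len(self.memory) * probs[idxs]) ** (-self.beta)
        weights /= weights.max()

        to_tensor = lambda x: torch.from_numpy(np.vstack(x).astype(np.float32)).to(device)
        states = to_tensor([e.state for e in experiences])
        actions = to_tensor([e.action for e in experiences])
        rewards = to_tensor([e.reward for e in experiences])
        next_states = to_tensor([e.next_state for e in experiences])
        dones = to_tensor([float(e.done) for e in experiences])
        is_weights = to_tensor(weights.reshape(-1, 1))
        return states, actions, rewards, next_states, dones, idxs, is_weights

    def update_priorities(self, idxs, td_errors):
        # larger absolute TD error -> higher sampling priority
        for i, err in zip(idxs, np.atleast_1d(td_errors)):
            self.priorities[i] = abs(float(err)) + self.eps

    def __len__(self):
        return len(self.memory)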
Example #8
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor Policy Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic Value Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15  # default: 0.15
        self.exploration_sigma = 0.25  # increased from the default 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        #Replay Memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99
        self.tau = 0.01

    def reset_episode(self):
        #reset of the episode
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        #Save experience/reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        #last state and action
        self.last_state = next_state

    def act(self, state):
        #Returns actions for given state(s) as per current policy
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        #add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        #Update policy and value parameters using given batch of experience tuples

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        #Get predicted next-state actions and Q values from target models
        #Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        #Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        #Train actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))

        #Training function
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):

        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        #Local model parameters and target model parameters should have the same size
        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #9
class Agent(object):
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Params
        self.MU = 0
        self.THETA = 0.15
        self.SIGMA = 0.10
        self.GAMMA = 0.99
        self.TAU = 0.001
        self.BATCHS = 256
        self.MAX_REWARD = -999999999

        #Actor Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        #init
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        #Critic Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        #init
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        #Noise process
        self.noiseObj = Noise(self.action_size, self.MU, self.THETA,
                              self.SIGMA)

        #Replay memory
        self.replayObj = Replay(self.BATCHS)

    def reset_episode(self):
        self.count = 0
        self.total_reward = 0
        self.noiseObj.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.replayObj.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1
        if self.total_reward > self.MAX_REWARD:
            self.MAX_REWARD = self.total_reward

        if len(self.replayObj) > self.BATCHS:
            experiences = self.replayObj.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        action = self.actor_local.model.predict(
            np.reshape(states, [-1, self.state_size]))[0]
        return list(action + self.noiseObj.sample())

    def learn(self, experiences):
        states = np.array([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).reshape(-1, 1)

        next_state = np.array(
            [e.next_state for e in experiences if e is not None])
        # Get the predicted next-state actions and the target model's Q values
        next_actions = self.actor_target.model.predict_on_batch(next_state)

        next_Q_targets = self.critic_target.model.predict_on_batch(
            [next_state, next_actions])
        Q_targets = rewards + self.GAMMA * next_Q_targets * (1 - dones)

        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)
        # Train the actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        new_weights = self.TAU * local_weights + (1 - self.TAU) * target_weights
        target_model.set_weights(new_weights)
Example #10
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 seed=23520,
                 GRADIENT_CLIP=1,
                 ACTIVATION=F.relu,
                 BOOTSTRAP_SIZE=5,
                 GAMMA=0.99,
                 TAU=1e-3,
                 LR_CRITIC=5e-4,
                 LR_ACTOR=5e-4,
                 UPDATE_EVERY=1,
                 TRANSFER_EVERY=2,
                 UPDATE_LOOP=10,
                 ADD_NOISE_EVERY=5,
                 WEIGHT_DECAY=0,
                 MEMORY_SIZE=5e4,
                 BATCH_SIZE=64):
        """Initialize an Agent object.
        
        Params
        ======
            state_size  : dimension of each state
            action_size : dimension of each action
            num_agents  : number of running agents
            device: cpu or cuda:0 if available
            -----These are hyperparameters----
            BOOTSTRAP_SIZE      : How far ahead to bootstrap
            GAMMA               : Discount factor
            TAU                 : Parameter for performing soft updates of target parameters
            LR_CRITIC, LR_ACTOR : Learning rate of the networks
            UPDATE_EVERY        : How often to update the networks
            TRANSFER_EVERY      : How often to transfer the weights from local to target
            UPDATE_LOOP         : Number of iterations for network update
            ADD_NOISE_EVERY     : How often to add noise to favor exploration
            WEIGHT_DECAY        : L2 weight decay for critic optimizer
            GRADIENT_CLIP       : Gradient-norm clipping threshold, used to avoid exploding gradients
        """

        # Actor networks
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR,
                                      weight_decay=WEIGHT_DECAY)
        hard_update(self.actor_local, self.actor_target)

        #critic networks
        self.critic_local = Critic(state_size * 2, action_size).to(device)
        self.critic_target = Critic(state_size * 2, action_size).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)
        hard_update(self.critic_local, self.critic_target)

        self.device = device
        self.num_agents = num_agents

        # Noise : using simple noise instead of OUNoise
        self.noise = [
            SimpleNoise(action_size, scale=1) for i in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, device, int(MEMORY_SIZE),
                                   BATCH_SIZE, seed)

        # Initialize time steps (for updating every UPDATE_EVERY steps)
        self.u_step = 0
        self.n_step = 0

        #keeping hyperparameters within the instance
        self.BOOTSTRAP_SIZE = BOOTSTRAP_SIZE
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LR_CRITIC = LR_CRITIC
        self.LR_ACTOR = LR_ACTOR
        self.UPDATE_EVERY = UPDATE_EVERY
        self.TRANSFER_EVERY = TRANSFER_EVERY
        self.UPDATE_LOOP = UPDATE_LOOP
        self.ADD_NOISE_EVERY = ADD_NOISE_EVERY
        self.GRADIENT_CLIP = GRADIENT_CLIP

        # rolling buffers holding the last BOOTSTRAP_SIZE timesteps, needed to build the n-step returns
        self.rewards = deque(maxlen=BOOTSTRAP_SIZE)
        self.states = deque(maxlen=BOOTSTRAP_SIZE)
        self.actions = deque(maxlen=BOOTSTRAP_SIZE)
        self.gammas = np.array([[GAMMA**i for j in range(num_agents)]
                                for i in range(BOOTSTRAP_SIZE)])

        self.loss_function = torch.nn.SmoothL1Loss()
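
`hard_update`, `soft_update`, and `SimpleNoise` are module-level helpers defined (or imported) elsewhere in this project. The sketch below shows what the calls above assume; the uniform-noise choice inside `SimpleNoise` is a guess, as only its constructor, `reset()`, and `sample()` signatures are implied by the agent code.

import numpy as np


def hard_update(local_model, target_model):
    """Copy the local network weights into the target network (used once at construction)."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(local_param.data)


def soft_update(local_model, target_model, tau):
    """Blend local weights into the target: theta_target = tau*theta_local + (1 - tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


class SimpleNoise:
    """Stateless exploration noise: one sample per action dimension, scaled by `scale`."""

    def __init__(self, size, scale=1.0):
        self.size = size
        self.scale = scale

    def reset(self):
        pass  # nothing to reset for stateless noise

    def sample(self):
        return self.scale * np.random.uniform(-1.0, 1.0, self.size)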
Example #11
class MADDPG():
    """
    Class definition of MADDPG agent. Interacts with and learns from the environment
    Comprises of a pair of Actor-Critic network ad implements centralized training and decentralized exeution (learn function)
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 device,
                 seed=23520,
                 GRADIENT_CLIP=1,
                 ACTIVATION=F.relu,
                 BOOTSTRAP_SIZE=5,
                 GAMMA=0.99,
                 TAU=1e-3,
                 LR_CRITIC=5e-4,
                 LR_ACTOR=5e-4,
                 UPDATE_EVERY=1,
                 TRANSFER_EVERY=2,
                 UPDATE_LOOP=10,
                 ADD_NOISE_EVERY=5,
                 WEIGHT_DECAY=0,
                 MEMORY_SIZE=5e4,
                 BATCH_SIZE=64):
        """Initialize an Agent object.
        
        Params
        ======
            state_size  : dimension of each state
            action_size : dimension of each action
            num_agents  : number of running agents
            device: cpu or cuda:0 if available
            -----These are hyperparameters----
            BOOTSTRAP_SIZE      : How far ahead to bootstrap
            GAMMA               : Discount factor
            TAU                 : Parameter for performing soft updates of target parameters
            LR_CRITIC, LR_ACTOR : Learning rate of the networks
            UPDATE_EVERY        : How often to update the networks
            TRANSFER_EVERY      : How often to transfer the weights from local to target
            UPDATE_LOOP         : Number of iterations for network update
            ADD_NOISE_EVERY     : How often to add noise to favor exploration
            WEIGHT_DECAY        : L2 weight decay for critic optimizer
            GRADIENT_CLIP       : Gradient-norm clipping threshold, used to avoid exploding gradients
        """

        # Actor networks
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(),
                                      lr=LR_ACTOR,
                                      weight_decay=WEIGHT_DECAY)
        hard_update(self.actor_local, self.actor_target)

        #critic networks
        self.critic_local = Critic(state_size * 2, action_size).to(device)
        self.critic_target = Critic(state_size * 2, action_size).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)
        hard_update(self.critic_local, self.critic_target)

        self.device = device
        self.num_agents = num_agents

        # Noise : using simple noise instead of OUNoise
        self.noise = [
            SimpleNoise(action_size, scale=1) for i in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, device, int(MEMORY_SIZE),
                                   BATCH_SIZE, seed)

        # Initialize time steps (for updating every UPDATE_EVERY steps)
        self.u_step = 0
        self.n_step = 0

        #keeping hyperparameters within the instance
        self.BOOTSTRAP_SIZE = BOOTSTRAP_SIZE
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LR_CRITIC = LR_CRITIC
        self.LR_ACTOR = LR_ACTOR
        self.UPDATE_EVERY = UPDATE_EVERY
        self.TRANSFER_EVERY = TRANSFER_EVERY
        self.UPDATE_LOOP = UPDATE_LOOP
        self.ADD_NOISE_EVERY = ADD_NOISE_EVERY
        self.GRADIENT_CLIP = GRADIENT_CLIP

        # rolling buffers holding the last BOOTSTRAP_SIZE timesteps, needed to build the n-step returns
        self.rewards = deque(maxlen=BOOTSTRAP_SIZE)
        self.states = deque(maxlen=BOOTSTRAP_SIZE)
        self.actions = deque(maxlen=BOOTSTRAP_SIZE)
        self.gammas = np.array([[GAMMA**i for j in range(num_agents)]
                                for i in range(BOOTSTRAP_SIZE)])

        self.loss_function = torch.nn.SmoothL1Loss()

    def reset(self):
        if self.noise:
            for n in self.noise:
                n.reset()

    def set_noise(self, noise):
        self.noise = noise

    def save(self, filename):
        torch.save(self.actor_local.state_dict(),
                   "{}_actor_local.pth".format(filename))
        torch.save(self.actor_target.state_dict(),
                   "{}_actor_target.pth".format(filename))
        torch.save(self.critic_local.state_dict(),
                   "{}_critic_local.pth".format(filename))
        torch.save(self.critic_target.state_dict(),
                   "{}_critic_target.pth".format(filename))

    def load(self, path):
        self.actor_local.load_state_dict(torch.load(path + "_actor_local.pth"))
        self.actor_target.load_state_dict(
            torch.load(path + "_actor_target.pth"))
        self.critic_local.load_state_dict(
            torch.load(path + "_critic_local.pth"))
        self.critic_target.load_state_dict(
            torch.load(path + "_critic_target.pth"))

    def act(self, states, noise=0.0):
        """
        Returns actions of each actor for given states.
        
        Params
        ======
            state    : current states
            add_noise: Introduce some noise in agent's action or not. During training, this is necessary to promote the exploration but should not be used during validation
        """
        actions = None

        self.n_step = (self.n_step + 1) % self.ADD_NOISE_EVERY

        with torch.no_grad():
            self.actor_local.eval()
            states = torch.from_numpy(states).float().unsqueeze(0).to(
                self.device)
            actions = self.actor_local(states).squeeze().cpu().data.numpy()
            self.actor_local.train()
            if self.n_step == 0:
                for i in range(len(actions)):
                    actions[i] += noise * self.noise[i].sample()
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        """
        Take a step for the current episode
        1. Save the experience
        2. Bootstrap the rewards
        3. If the update conditions are satisfied, run the required number of learning loops
        """

        # Save experience in replay memory
        self.rewards.append(rewards)
        self.states.append(states)
        self.actions.append(actions)

        if len(self.rewards) == self.BOOTSTRAP_SIZE:
            # get the bootstrapped sum of rewards per agents
            reward = np.sum(self.rewards * self.gammas, axis=-2)
            self.memory.add(self.states[0], self.actions[0], reward,
                            next_states, dones)

        if np.any(dones):
            self.rewards.clear()
            self.states.clear()
            self.actions.clear()

        # Learn every UPDATE_EVERY timesteps
        self.u_step = (self.u_step + 1) % self.UPDATE_EVERY

        t_step = 0
        if len(self.memory) > self.memory.batch_size and self.u_step == 0:
            for _ in range(self.UPDATE_LOOP):
                self.learn()
                # transfer the weights as specified
                t_step = (t_step + 1) % self.TRANSFER_EVERY
                if t_step == 0:
                    soft_update(self.actor_local, self.actor_target, self.TAU)
                    soft_update(self.critic_local, self.critic_target,
                                self.TAU)

    def transform_states(self, states):
        """
        Transforms states to full states so that both agents can see each other's state via the critic network
        """
        batch_size = states.shape[0]
        state_size = states.shape[-1]
        num_agents = states.shape[-2]
        transformed_states = torch.zeros(
            (batch_size, num_agents, state_size * num_agents)).to(self.device)
        for i in range(num_agents):
            start = 0
            for j in range(num_agents):
                transformed_states[:, i, start:start + state_size] += states[:, j]
                start += state_size
        return transformed_states

    def learn(self):
        """
        Update the network parameters using the experiences. The algorithm is described in detail in readme

        Params
        ======
            experiences : List of (s, a, r, s', done) tuples
        """
        # sample the memory at random to break the correlation between consecutive experiences
        states, actions, rewards, next_states, dones = self.memory.sample()
        full_states = self.transform_states(states)

        # The critic should estimate the value of the states to be equal to rewards plus
        # the estimation of the next_states value according to the critic_target and actor_target
        with torch.no_grad():
            self.actor_target.eval()
            self.critic_target.eval()
            # obtain next actions as given by the target network and get transformed states for the critic
            next_actions = self.actor_target(next_states)
            next_full_states = self.transform_states(next_states)
            # calculate Q value using transformed next states and next actions, basically predict what the next value is from target's perspective
            q_next = self.critic_target(next_full_states,
                                        next_actions).squeeze(-1)
            # calculate the target's value
            targeted_value = rewards + (
                self.GAMMA**self.BOOTSTRAP_SIZE) * q_next * (1 - dones)

        current_value = self.critic_local(full_states, actions).squeeze(-1)
        loss = self.loss_function(current_value, targeted_value)
        # During optimization the critic learns how far its value estimate is from the bootstrapped
        # target and adjusts its network to close that gap; later, the same critic scores the actions
        # proposed by the actor, telling it how good or bad they are via the Q-value.

        # backpropagate the critic loss, clipping the gradient norm to avoid exploding gradients
        self.critic_optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       self.GRADIENT_CLIP)
        self.critic_optim.step()

        # optimize the actor by having the critic evaluating the value of the actor's decision
        self.actor_optim.zero_grad()
        actions_pred = self.actor_local(states)
        mean = self.critic_local(full_states, actions_pred).mean()
        (-mean).backward()
        self.actor_optim.step()
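
A standalone illustration of the n-step reward folding that `step()` performs with `self.gammas`: the last BOOTSTRAP_SIZE per-agent rewards are discounted row by row and summed over the time axis before being stored in the replay buffer. The reward values below are placeholders.

import numpy as np

BOOTSTRAP_SIZE, GAMMA, num_agents = 5, 0.99, 2
gammas = np.array([[GAMMA ** i for _ in range(num_agents)]
                   for i in range(BOOTSTRAP_SIZE)])        # shape: (BOOTSTRAP_SIZE, num_agents)

rewards = np.ones((BOOTSTRAP_SIZE, num_agents))            # pretend every step paid reward 1.0
n_step_return = np.sum(rewards * gammas, axis=-2)          # shape: (num_agents,)
print(n_step_return)   # [4.90099501 4.90099501] = 1 + 0.99 + ... + 0.99**4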