Example 1
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 agents=None,
                 new_hyperparameters=None,
                 seed=0,
                 device="cpu",
                 model_output_dir=None,
                 enable_logger=False,
                 logger_path=None,
                 logger_comment=None,
                 opt_soft_update=False):
        """Initialize a MADDPGAgent wrapper.
       
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): the number of agents in the environment
        """
        raise NotImplementedError()

        super(MADDPGAgent, self).__init__(
            new_hyperparameters=new_hyperparameters,
            enable_logger=enable_logger,
            logger_path=logger_path,
            logger_comment=logger_comment
        )

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.time_step = 0

        if agents:
            self.agents = agents
        else:
            self.agents = [DDPGAgent(state_size, action_size, agent_id=i+1, handler=self) for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE, self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update

        self.model_output_dir = model_output_dir
Example 2
class ReplayBufferTest(unittest.TestCase):
    def setUp(self):
        self.batch_size = 2
        self.replay_buffer = ReplayBuffer(10, self.batch_size, "cpu")
        self.populate_replay_buffer()

    def populate_replay_buffer(self, n=5):
        for _ in range(n):
            self.replay_buffer.add(0.0, 0.0, 0.0, 0.0, 0.0)

    def test_add(self):
        l1 = len(self.replay_buffer)
        self.replay_buffer.add(0.0, 0.0, 0.0, 0.0, 0.0)
        l2 = len(self.replay_buffer)
        self.assertNotEqual(l1, l2)

    def test_sample(self):
        s, a, r, ns, d = self.replay_buffer.sample()
        self.assertEqual(s.shape[0], self.batch_size)
        self.assertEqual(a.shape[0], self.batch_size)
        self.assertEqual(r.shape[0], self.batch_size)
        self.assertEqual(ns.shape[0], self.batch_size)
        self.assertEqual(d.shape[0], self.batch_size)
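
The tests above exercise only the ReplayBuffer interface: add(state, action, reward, next_state, done), sample(), and __len__(). A minimal sketch of a buffer that would satisfy them, assuming the ReplayBuffer(buffer_size, batch_size, device, seed=0) signature used throughout these examples; the namedtuple layout and stacking details are illustrative, not the original implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size experience buffer (illustrative sketch, not the original code)."""

    Experience = namedtuple("Experience",
                            ["state", "action", "reward", "next_state", "done"])

    def __init__(self, buffer_size, batch_size, device, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # Store a single experience tuple.
        self.memory.append(self.Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a batch and stack each field into a (batch_size, ...) tensor.
        batch = random.sample(self.memory, k=self.batch_size)

        def to_tensor(values):
            return torch.tensor(np.vstack(values), dtype=torch.float32, device=self.device)

        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([float(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)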
Example 3
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 qnetwork_local=None,
                 qnetwork_target=None,
                 optimizer=None,
                 new_hyperparameters=None,
                 seed: int = 0,
                 device: str = "cpu",
                 model_output_dir: str = None,
                 opt_soft_update: bool = False,
                 opt_ddqn: bool = False):
        """Initialize an DQNAgent object.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            qnetwork_local (torch.nn.Module): Local Q-Network model.
            qnetwork_target (torch.nn.Module): Target Q-Network model.
            optimizer (torch.optim): Local Q-Network optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved to.
            opt_soft_update (bool): Use soft update instead of hard update.
            opt_ddqn (bool): Use Double DQN for `expected_Q`.
        
        Returns:
            An instance of DQNAgent.
        """
        super(DQNAgent, self).__init__(new_hyperparameters=new_hyperparameters)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.time_step = 0

        if qnetwork_local:
            self.qnetwork_local = qnetwork_local
        else:
            self.qnetwork_local = QNetwork(state_size,
                                           action_size).to(self.device)

        if qnetwork_target:
            self.qnetwork_target = qnetwork_target
        else:
            self.qnetwork_target = QNetwork(state_size,
                                            action_size).to(self.device)

        if optimizer:
            self.optimizer = optimizer
        else:
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.LEARNING_RATE)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.opt_ddqn = opt_ddqn

        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.qnetwork_local, "qnetwork_local_params"),
            (self.optimizer, "optimizer_params"),
        ]

        # Ensure local and target networks have the same initial weight
        hard_update(self.qnetwork_local, self.qnetwork_target)
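
The DQN constructor above falls back to QNetwork(state_size, action_size), which is not shown in these examples. A minimal fully connected sketch consistent with that call signature and with the (batch, action_size) output expected by act() and learn(); the hidden-layer sizes are an assumption, not values from the original code.

import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Maps a state vector to one Q-value per action (illustrative sketch)."""

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)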
Example 4
class DQNAgent(Agent):
    """DQN Agent implementation."""

    # TODO: Consider how to extend this to accept multiple agents?
    # TODO: Add noise to DQN?

    # TODO: Ensure that this cannot be changed in other ways
    # TODO: Look up original value for these params
    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e7),
        "batch_size": 32,
        "gamma": 0.99,
        "learning_rate": 2.5e-4,
        "tau": 1e-3,
        "learn_every": 4,
        "hard_update_every": 10000
    }

    ALGORITHM = "DQN"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 qnetwork_local=None,
                 qnetwork_target=None,
                 optimizer=None,
                 new_hyperparameters=None,
                 seed: int = 0,
                 device: str = "cpu",
                 model_output_dir: str = None,
                 opt_soft_update: bool = False,
                 opt_ddqn: bool = False):
        """Initialize an DQNAgent object.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            qnetwork_local (torch.nn.Module): Local Q-Network model.
            qnetwork_target (torch.nn.Module): Target Q-Network model.
            optimizer (torch.optim): Local Q-Network optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved to.
            opt_soft_update (bool): Use soft update instead of hard update.
            opt_ddqn (bool): Use Double DQN for `expected_Q`.
        
        Returns:
            An instance of DQNAgent.
        """
        super(DQNAgent, self).__init__(new_hyperparameters=new_hyperparameters)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.time_step = 0

        if qnetwork_local:
            self.qnetwork_local = qnetwork_local
        else:
            self.qnetwork_local = QNetwork(state_size,
                                           action_size).to(self.device)

        if qnetwork_target:
            self.qnetwork_target = qnetwork_target
        else:
            self.qnetwork_target = QNetwork(state_size,
                                            action_size).to(self.device)

        if optimizer:
            self.optimizer = optimizer
        else:
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.LEARNING_RATE)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.opt_ddqn = opt_ddqn

        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.qnetwork_local, "qnetwork_local_params"),
            (self.optimizer, "optimizer_params"),
        ]

        # Ensure local and target networks have the same initial weight
        hard_update(self.qnetwork_local, self.qnetwork_target)

    def __str__(self) -> str:
        """Helper to output network architecture for the agent.
        
        Returns:
            A string representation of this algorithm.
        """
        return ("{}\n{}\n{}\n{}".format("Q-Network (Local):",
                                        self.qnetwork_local,
                                        "Q-Network (Target):",
                                        self.qnetwork_target))

    def origin(self) -> str:
        """Helper to get the original paper for this algorithm.

        Returns: 
            The original paper for this algorithm.
        """
        return 'https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf'

    def description(self) -> str:
        """Helper to get a brief description of this algorithm.

        Returns:
            A brief description of this algorithm.
        """
        description = (
            'DQN is an algorithm created by DeepMind that brings together the power '
            'of the Q-Learning algorithm with the advantages of generalization through '
            'function approximation. It uses a deep neural network to estimate a Q-value '
            'function. As such, the input to the network is the current state of the '
            'environment, and the output is the Q-value for each possible action.'
        )
        return description

    def step(self,
             state,
             action,
             reward,
             next_state,
             done,
             logger=None) -> None:
        """Saves experience to replay memory and updates model weights.

        Args:
            state: Environment states.
            action: Environment actions.
            reward: Rewards for the actions above.
            next_state: Next environment states.
            done (bool): Boolean indicating if the environment has terminated. 
            logger (Logger): An instance of Logger.
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, logger=logger)

    def act(self, state, eps=0.0, add_noise=False, logger=None):
        """Returns actions for given state as per current policy.

        Args:
            state: The current state of the environment.
            eps (float): Epsilon, for Epsilon-greedy action selection.
            add_noise (bool): Controls addition of noise.
            logger (Logger): An instance of Logger.

        Returns: 
            Actions for given state as per current policy.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, logger=None) -> None:
        """Updates value parameters using given batch of experience tuples.

        Args:
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples.
            logger (Logger): An instance of Logger.
        """
        states, actions, rewards, next_states, dones = experiences

        if self.opt_ddqn:
            # Double DQN
            non_final_next_states = next_states * (1 - dones)
            # Get the actions themselves, not their output value
            _, next_state_actions = self.qnetwork_local(
                non_final_next_states).max(1, keepdim=True)
            next_Q_targets = self.qnetwork_target(
                non_final_next_states).gather(1, next_state_actions)
            target_Q = rewards + (self.GAMMA * next_Q_targets * (1 - dones))
        else:
            # Vanilla DQN
            next_max_a = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
            target_Q = rewards + (self.GAMMA * next_max_a * (1 - dones))

        expected_Q = self.qnetwork_local(states)
        if len(actions.shape) == 1:
            actions = actions.unsqueeze(1)
        expected_Q = torch.gather(expected_Q, 1, actions.long())

        # Compute and minimize the loss
        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        if self.opt_soft_update:
            soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)
        elif self.time_step % self.HARD_UPDATE_EVERY == 0:
            hard_update(self.qnetwork_local, self.qnetwork_target)

        if logger:
            loss = loss.cpu().detach().item()
            logger.add_scalar('loss', loss, self.time_step)
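
For context, a minimal interaction-loop sketch showing how the act()/step() interface above is typically driven. The env object and its Gym-style reset()/step() signature, the state/action sizes, and the epsilon schedule are assumptions for illustration, not part of the original code.

import numpy as np

# Hypothetical environment with a Gym-style API:
#   env.reset() -> state, env.step(action) -> (next_state, reward, done, info)
agent = DQNAgent(state_size=8, action_size=4, device="cpu")

eps = 1.0                                   # epsilon-greedy exploration rate
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(np.asarray(state, dtype=np.float32), eps=eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)            # decay exploration over episodes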
Example 5
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 num_agents: int,
                 actor_local=None,
                 actor_target=None,
                 actor_optimizer=None,
                 critic_local=None,
                 critic_target=None,
                 critic_optimizer=None,
                 new_hyperparameters=None,
                 seed: int = 0,
                 device: str = "cpu",
                 model_output_dir: str = None,
                 enable_logger: bool = False,
                 logger_path: str = None,
                 logger_comment: str = None,
                 opt_soft_update: bool = False):
        """Initialize an DDPGAgent object.

        Args:
            state_size (int): dimension of each state.
            action_size (int): dimension of each action.
            num_agents (int): number of agents in the environment.
            actor_local (torch.nn.Module): Local Actor model.
            actor_target (torch.nn.Module): Target Actor model.
            actor_optimizer (torch.optim): Actor optimizer.
            critic_local (torch.nn.Module): Local Critic model.
            critic_target (torch.nn.Module): Target Critic model.
            critic_optimizer (torch.optim): Critic optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved to.
            opt_soft_update (bool): Use soft update instead of hard update.

        Returns:
            An instance of DDPGAgent.
        """
        super(DDPGAgent,
              self).__init__(new_hyperparameters=new_hyperparameters,
                             enable_logger=enable_logger,
                             logger_path=logger_path,
                             logger_comment=logger_comment)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.time_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = actor_local if actor_local else Actor(
            state_size, action_size, seed).to(device)
        self.actor_target = actor_target if actor_target else Actor(
            state_size, action_size, seed).to(device)
        self.actor_optimizer = actor_optimizer if actor_optimizer else optim.Adam(
            self.actor_local.parameters(), lr=self.LEARNING_RATE_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = critic_local if critic_local else Critic(
            state_size, action_size, seed).to(device)
        self.critic_target = critic_target if critic_target else Critic(
            state_size, action_size, seed).to(device)
        self.critic_optimizer = critic_optimizer if critic_optimizer else optim.Adam(
            self.critic_local.parameters(),
            lr=self.LEARNING_RATE_CRITIC,
            weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update

        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.actor_local, "actor_local_params"),
            (self.actor_optimizer, "actor_optimizer_params"),
            (self.critic_local, "critic_local_params"),
            (self.critic_optimizer, "critic_optimizer_params"),
        ]

        # Ensure local and target networks have the same initial weight
        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target)
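
The DDPG constructor builds an OUNoise(action_size, seed) helper whose definition is not included in these examples, but whose reset()/sample() methods are used by the agent. A standard Ornstein-Uhlenbeck sketch consistent with that interface; the mu, theta, and sigma defaults are the values commonly paired with DDPG and are assumed rather than taken from the original code.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (illustrative sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Restart the internal state at the long-running mean.
        self.state = self.mu.copy()

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state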
Example 6
class DDPGAgent(Agent):
    """DDPG Agent implementation."""

    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e6),
        "batch_size": 64,
        "gamma": 0.99,
        "tau": 1e-3,
        "learning_rate_actor": 1e-4,
        "learning_rate_critic": 1e-3,
        "weight_decay": 1e-2,
        "learn_every": 4,
        "hard_update_every": 4
    }

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 num_agents: int,
                 actor_local=None,
                 actor_target=None,
                 actor_optimizer=None,
                 critic_local=None,
                 critic_target=None,
                 critic_optimizer=None,
                 new_hyperparameters=None,
                 seed: int = 0,
                 device: str = "cpu",
                 model_output_dir: str = None,
                 enable_logger: bool = False,
                 logger_path: str = None,
                 logger_comment: str = None,
                 opt_soft_update: bool = False):
        """Initialize an DDPGAgent object.

        Args:
            state_size (int): dimension of each state.
            action_size (int): dimension of each action.
            num_agents (int): number of agents in the environment.
            actor_local (torch.nn.Module): Local Actor model.
            actor_target (torch.nn.Module): Target Actor model.
            actor_optimizer (torch.optim): Actor optimizer.
            critic_local (torch.nn.Module): Local Critic model.
            critic_target (torch.nn.Module): Target Critic model.
            critic_optimizer (torch.optim): Critic optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved to.
            opt_soft_update (bool): Use soft update instead of hard update.

        Returns:
            An instance of DDPGAgent.
        """
        super(DDPGAgent,
              self).__init__(new_hyperparameters=new_hyperparameters,
                             enable_logger=enable_logger,
                             logger_path=logger_path,
                             logger_comment=logger_comment)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.time_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = actor_local if actor_local else Actor(
            state_size, action_size, seed).to(device)
        self.actor_target = actor_target if actor_target else Actor(
            state_size, action_size, seed).to(device)
        self.actor_optimizer = actor_optimizer if actor_optimizer else optim.Adam(
            self.actor_local.parameters(), lr=self.LEARNING_RATE_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = critic_local if critic_local else Critic(
            state_size, action_size, seed).to(device)
        self.critic_target = critic_target if critic_target else Critic(
            state_size, action_size, seed).to(device)
        self.critic_optimizer = critic_optimizer if critic_optimizer else optim.Adam(
            self.critic_local.parameters(),
            lr=self.LEARNING_RATE_CRITIC,
            weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update

        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.actor_local, "actor_local_params"),
            (self.actor_optimizer, "actor_optimizer_params"),
            (self.critic_local, "critic_local_params"),
            (self.critic_optimizer, "critic_optimizer_params"),
        ]

        # Ensure local and target networks have the same initial weight
        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target)

    def __str__(self) -> str:
        """Helper to output network architecture for the agent.
        
        Returns:
            A string representation of this algorithm.
        """
        return ("{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}".format(
            "Actor (Local):", self.actor_local, "Actor (Target):",
            self.actor_target, "Critic (Local):", self.critic_local,
            "Critic (Target):", self.critic_target))

    def origin(self) -> str:
        """Helper to get the original paper for this algorithm.

        Returns: 
            The original paper for this algorithm.
        """
        return 'https://arxiv.org/pdf/1509.02971.pdf'

    def description(self) -> str:
        """Helper to get a brief description of this algorithm.

        Returns:
            A brief description of this algorithm.
        """
        description = (
            'DDPG was introduced as an actor-critic method that performs well '
            'in environments with a continuous action space, which is a known '
            'limitation of the popular DQN algorithm. It improves on the '
            'deterministic policy gradient (DPG) algorithm by using a neural '
            'network to take advantage of generalization and function approximation.'
        )
        return description

    def step(self,
             states,
             actions,
             rewards,
             next_states,
             dones,
             logger=None) -> None:
        """Save experience in replay memory, and use random sample from buffer to learn.

        Args:
            states: Environment states.
            actions: Environment actions.
            rewards: Rewards for the actions above.
            next_states: Next environment states.
            dones (bool): Boolean indicating if the environment has terminated. 
            logger (Logger): An instance of Logger.
        """
        if self.num_agents == 1:
            self.memory.add(states, actions, rewards, next_states, dones)
        else:
            # TODO: Refactor this to not assume that the objects come in correct shape
            for i in range(self.num_agents):
                self.memory.add(states[i], actions[i], rewards[i],
                                next_states[i], dones[i])

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, logger=logger)

    def act(self, state, add_noise: bool = True, logger=None):
        """Chooses an action for the current state based on the current policy.

        Args:
            state: The current state of the environment.
            add_noise (bool): Controls addition of noise.
            logger (Logger): An instance of Logger.

        Returns: 
            Actions for given state as per current policy.
        """
        state = torch.from_numpy(state).float().to(self.device)

        if self.num_agents == 1:
            self.actor_local.eval()
            with torch.no_grad():
                action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()

            if add_noise:
                action += self.noise.sample()

            # TODO: Have parameter that controls this?
            # return np.clip(action, -1, 1)
            return action
        else:
            actions = np.zeros((self.num_agents, self.action_size))
            self.actor_local.eval()
            with torch.no_grad():
                for i, s in enumerate(state):
                    # Populate list of actions one state at a time
                    actions[i, :] = self.actor_local(s).cpu().data.numpy()
            self.actor_local.train()

            if add_noise:
                actions += self.noise.sample()

            # TODO: Have parameter that controls this?
            # return np.clip(action, -1, 1)
            return actions

    def learn(self, experiences, logger=None) -> None:
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Args:
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples.
            logger (Logger): An instance of Logger.
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # adds gradient clipping to stabilize learning
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        if self.opt_soft_update:
            soft_update(self.actor_local, self.actor_target, self.TAU)
            soft_update(self.critic_local, self.critic_target, self.TAU)
        elif self.time_step % self.HARD_UPDATE_EVERY == 0:
            hard_update(self.actor_local, self.actor_target)
            hard_update(self.critic_local, self.critic_target)

        if logger:
            actor_loss = actor_loss.cpu().detach().item()
            critic_loss = critic_loss.cpu().detach().item()
            logger.add_scalars('loss', {
                "actor loss": actor_loss,
                "critic loss": critic_loss,
            }, self.time_step)
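
Both the DQN and DDPG agents call soft_update and hard_update helpers that are not shown in these examples. A standard sketch of the usual Polyak-style updates, matching the (local, target) argument order used above; this is an assumed implementation, not the original one.

def soft_update(local_model, target_model, tau):
    # Blend target parameters toward local ones:
    # theta_target = tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


def hard_update(local_model, target_model):
    # Copy the local network's parameters into the target network verbatim.
    target_model.load_state_dict(local_model.state_dict())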
Example 7
class MADDPGAgent(Agent):
    """MADDPG implementation."""

    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e6),
        "batch_size": 64,
        "gamma": 0.99,
        "tau": 1e-3,
        "learning_rate_actor": 1e-4,
        "learning_rate_critic": 1e-3,
        "weight_decay": 1e-2,
        "learn_every": 4,
        "hard_update_every": 5
    }

    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 agents=None,
                 new_hyperparameters=None,
                 seed=0,
                 device="cpu",
                 model_output_dir=None,
                 enable_logger=False,
                 logger_path=None,
                 logger_comment=None,
                 opt_soft_update=False):
        """Initialize a MADDPGAgent wrapper.
       
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): the number of agents in the environment
        """
        raise NotImplementedError()

        super(MADDPGAgent, self).__init__(
            new_hyperparameters=new_hyperparameters,
            enable_logger=enable_logger,
            logger_path=logger_path,
            logger_comment=logger_comment
        )

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = seed
        random.seed(seed)
        self.device = device
        self.time_step = 0

        if agents:
            self.agents = agents
        else:
            self.agents = [DDPGAgent(state_size, action_size, agent_id=i+1, handler=self) for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE, self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update

        self.model_output_dir = model_output_dir

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False, logger=None):
        """Picks an action for each agent given their individual observations
        and the current policy."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, observations, actions, rewards, next_observations, dones, logger=None):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        observations = observations.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_observations = next_observations.reshape(1, -1)

        self.memory.add(observations, actions, rewards, next_observations, dones)

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                for a_i, agent in enumerate(self.agents):
                    experiences = self.memory.sample()
                    self.learn(experiences, a_i, logger=logger)

    def learn(self, experiences, agent_number, logger=None):
        """Helper to pick actions from each agent for the `experiences` tuple that
        will be used to update the weights to agent with ID = `agent_number`.
        Each observation in the `experiences` tuple contains observations from each
        agent, so before using the tuple of update the weights of an agent, we need
        all agents to contribute in generating `next_actions` and `actions_pred`.
        This happens because the critic will take as its input the combined
        observations and actions from all agents."""
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences

        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)

            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)

            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(self.device)
        actions_pred = torch.cat(actions_pred, dim=1).to(self.device)

        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred, logger=logger)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(self.device)
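
The learn() docstring above describes how every agent contributes observations and actions before one agent's weights are updated. A short, self-contained shape walk-through of that reshaping and concatenation; the batch size, agent count, and state/action sizes are illustrative assumptions, not values from the original code.

import torch

batch_size, num_agents, state_size, action_size = 64, 2, 24, 2

# The buffer stores the flattened joint observation of all agents for each experience.
next_states = torch.zeros(batch_size, num_agents * state_size)
next_states = next_states.reshape(-1, num_agents, state_size)              # (64, 2, 24)

# Each agent's target actor only sees its own slice of the joint observation...
agent_0_next = next_states.index_select(1, torch.tensor([0])).squeeze(1)   # (64, 24)

# ...and the per-agent actions are concatenated so the centralized critic sees them all.
per_agent_actions = [torch.zeros(batch_size, action_size) for _ in range(num_agents)]
joint_next_actions = torch.cat(per_agent_actions, dim=1)                   # (64, 4)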