Exemple #1
0
def test_replay_buffer_get_state_with_data():
    """Buffer state includes data by default and reports the buffer's configuration."""
    # Assign
    batch_size, buffer_size = 10, 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)

    # One sample more than the capacity is pushed into the buffer.
    for sample in generate_sample_SARS(buffer_size + 1):
        obs, action, reward, next_obs, done = sample
        buffer.add(state=obs,
                   action=action,
                   reward=reward,
                   next_state=next_obs,
                   done=done)

    # Act
    default_state: BufferState = buffer.get_state()
    explicit_state: BufferState = buffer.get_state(include_data=True)

    # Assert
    assert default_state == explicit_state, "Default option is to include all data"
    assert default_state.type == ReplayBuffer.type
    assert default_state.batch_size == batch_size
    assert default_state.buffer_size == buffer_size
    assert len(default_state.data) == buffer_size

    # Every stored record may only expose the known experience keys.
    allowed_keys = ("state", "action", "reward", "done", "next_state")
    for record in default_state.data:
        assert not any(key not in allowed_keys
                       for key in record.get_dict().keys())
Exemple #2
0
def test_replay_buffer_add():
    """Adding one sample to an empty buffer makes its length equal to one."""
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=5)
    assert len(buffer) == 0

    # Act
    samples = generate_sample_SARS(1, dict_type=True)
    for sample in samples:
        buffer.add(**sample)

    # Assert
    assert len(buffer) == 1
Exemple #3
0
def test_buffer_size():
    """The buffer length stays at `buffer_size` after one extra sample is added."""
    # Assign
    buffer_size = 10
    buffer = ReplayBuffer(batch_size=5, buffer_size=buffer_size)

    # Act
    for sample in generate_sample_SARS(buffer_size + 1):
        obs, action, reward, next_obs, done = sample
        buffer.add(state=obs,
                   action=action,
                   reward=reward,
                   next_state=next_obs,
                   done=done)

    # Assert
    stored = len(buffer)
    assert stored == buffer_size
Exemple #4
0
def test_replay_buffer_dump_serializable():
    """A dump requested with `serialize=True` survives a JSON round trip unchanged."""
    import json
    import torch

    # Assign
    filled_buffer = 8
    buffer = ReplayBuffer(batch_size=5, buffer_size=10)

    for sample in generate_sample_SARS(filled_buffer, dict_type=True):
        # States are stored as tensors before being added to the buffer.
        for key in ('state', 'next_state'):
            sample[key] = torch.tensor(sample[key])
        buffer.add(**sample)

    # Act
    dump = list(buffer.dump_buffer(serialize=True))

    # Assert
    encoded = json.dumps(dump)
    assert isinstance(encoded, str)
    decoded = json.loads(encoded)
    assert decoded == dump
Exemple #5
0
def test_replay_buffer_dump():
    """Dumping a partially filled buffer yields one record per added sample.

    Each record is expected to expose at least the state, action, reward
    and next_state entries.
    """
    import torch
    # Assign
    filled_buffer = 8
    prop_keys = ["state", "action", "reward", "next_state"]
    buffer = ReplayBuffer(batch_size=5, buffer_size=10)
    # Samples are unpacked as (state, action, reward, next_state, done), so
    # each value is stored under its matching keyword.
    for (state, action, reward, next_state,
         done) in generate_sample_SARS(filled_buffer):
        buffer.add(state=torch.tensor(state),
                   action=action,
                   reward=reward,
                   next_state=torch.tensor(next_state),
                   done=done)

    # Act
    dump = list(buffer.dump_buffer())

    # Assert
    assert len(dump) == filled_buffer
    assert all(key in dump[0] for key in prop_keys)
Exemple #6
0
def test_replay_buffer_sample():
    """Sampling returns `batch_size` entries under every experience key."""
    # Assign
    batch_size = 5
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=10)

    # Act
    for sample in generate_sample_SARS(20):
        obs, action, reward, next_obs, done = sample
        buffer.add(state=obs,
                   action=action,
                   reward=reward,
                   next_state=next_obs,
                   done=done)

    # Assert
    samples = buffer.sample()
    # Checked in order: states, actions, rewards, next_states, dones
    for key in ("state", "action", "reward", "next_state", "done"):
        assert len(samples[key]) == batch_size
Exemple #7
0
def test_replay_buffer_get_state_without_data():
    """Buffer state requested without data keeps the configuration and has `data` set to None."""
    # Assign
    batch_size, buffer_size = 10, 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)

    for sample in generate_sample_SARS(buffer_size + 1):
        obs, action, reward, next_obs, done = sample
        buffer.add(state=obs,
                   action=action,
                   reward=reward,
                   next_state=next_obs,
                   done=done)

    # Act
    buffer_state: BufferState = buffer.get_state(include_data=False)

    # Assert
    assert buffer_state.type == ReplayBuffer.type
    assert buffer_state.batch_size == batch_size
    assert buffer_state.buffer_size == buffer_size
    assert buffer_state.data is None
Exemple #8
0
def test_replay_buffer_seed():
    """Two buffers with the same seed sample identically; an unseeded one differs."""
    # Assign
    batch_size = 4
    unseeded = ReplayBuffer(batch_size)
    seeded_a = ReplayBuffer(batch_size, seed=32167)
    seeded_b = ReplayBuffer(batch_size, seed=32167)
    buffers = (unseeded, seeded_a, seeded_b)

    # Act
    for sars in generate_sample_SARS(400, dict_type=True):
        # Each buffer receives its own deep copy of the sample.
        for buf in buffers:
            buf.add(**copy.deepcopy(sars))

    # Assert
    for _ in range(10):
        drawn_unseeded, drawn_a, drawn_b = [buf.sample() for buf in buffers]

        assert drawn_unseeded != drawn_a
        assert drawn_unseeded != drawn_b
        assert drawn_a == drawn_b
Exemple #9
0
class DDPGAgent(AgentBase):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of popular Ornstein-Uhlenbeck (OU) process for noise this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 actor_lr: float = 2e-3,
                 critic_lr: float = 2e-3,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 **kwargs):
        """
        Parameters:
            state_size (int): Number of input dimensions.
            action_size (int): Number of output dimensions.
            actor_lr (float): Learning rate for the actor (policy). Default: 0.002.
            critic_lr (float): Learning rate for the critic (value function). Default: 0.002.
            noise_scale (float): Value passed as `scale` to the Gaussian noise. Default: 0.2.
            noise_sigma (float): Value passed as `sigma` to the Gaussian noise. Default: 0.1.

        Keyword parameters:
            device: Device on which networks are placed. Default: `DEVICE`.
            hidden_layers (tuple of ints): Hidden dimensions of actor and critic nets. Default: (128, 128).
            max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 10.
            max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 10.
            action_min (float): Minimum returned action value. Default: -1.
            action_max (float): Maximum returned action value. Default: 1.
            action_scale (float): Multiplier applied to the action before clamping. Default: 1.
            gamma (float): Discount value. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.02.
            batch_size (int): Number of samples used in learning. Default: 64.
            buffer_size (int): Maximum number of samples to store. Default: 1e6.
            warm_up (int): Number of steps before any learning step. Default: 0.
            update_freq (int): Number of steps between each learning step. Default: 1.
            number_updates (int): How many times to learn in the learning phase. Default: 1.

        """
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)
        self.state_size = state_size
        self.action_size = action_size

        # Reason sequence initiation.
        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (128, 128)))
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=hidden_layers,
                               gate_out=torch.tanh).to(self.device)
        self.critic = CriticBody(state_size,
                                 action_size,
                                 hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size,
                                      action_size,
                                      hidden_layers=hidden_layers,
                                      gate_out=torch.tanh).to(self.device)
        self.target_critic = CriticBody(state_size,
                                        action_size,
                                        hidden_layers=hidden_layers).to(
                                            self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size, ),
                                   mu=1e-8,
                                   sigma=noise_sigma,
                                   scale=noise_scale,
                                   device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_lr = float(
            self._register_param(kwargs, 'actor_lr', actor_lr))
        self.critic_lr = float(
            self._register_param(kwargs, 'critic_lr', critic_lr))
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.max_grad_norm_actor = float(
            self._register_param(kwargs, "max_grad_norm_actor", 10.0))
        self.max_grad_norm_critic = float(
            self._register_param(kwargs, "max_grad_norm_critic", 10.0))
        self.action_min = float(self._register_param(kwargs, 'action_min', -1))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1))
        self.action_scale = float(
            self._register_param(kwargs, 'action_scale', 1))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))

        # Final initialisation of networks and bookkeeping values.
        # NOTE(review): reset_agent() resets all four networks after the
        # hard_update calls above; presumably this leaves targets out of sync
        # with their online networks - confirm against reset_parameters().
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    def reset_agent(self) -> None:
        """Call `reset_parameters()` on the actor, critic and both target networks."""
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    @property
    def loss(self) -> Dict[str, float]:
        """Most recently stored actor and critic losses, keyed by 'actor' and 'critic'."""
        return {'actor': self._loss_actor, 'critic': self._loss_critic}

    @loss.setter
    def loss(self, value):
        # A dict sets each loss separately; any other value is used for both.
        if isinstance(value, dict):
            self._loss_actor = value['actor']
            self._loss_critic = value['critic']
        else:
            self._loss_actor = value
            self._loss_critic = value

    @torch.no_grad()
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        Parameters:
            obs: Observation passed through `to_tensor` and the actor network.
            noise (float): Multiplier for the sampled noise added to the action. Default: 0.

        Returns:
            action: (list float) Action values.
        """
        obs = to_tensor(obs).float().to(self.device)
        action = self.actor(obs)
        action += noise * self.noise.sample()
        # Scaling is applied before clamping, so the result stays within
        # [action_min, action_max].
        action = torch.clamp(action * self.action_scale, self.action_min,
                             self.action_max)
        return action.cpu().numpy().tolist()

    def step(self, state, action, reward, next_state, done) -> None:
        """Store a transition and, when conditions are met, run learning updates.

        Learning happens only once `warm_up` iterations have passed, the buffer
        holds more than `batch_size` samples and the iteration is a multiple of
        `update_freq`.
        """
        self.iteration += 1
        self.buffer.add(state=state,
                        action=action,
                        reward=reward,
                        next_state=next_state,
                        done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration %
                                                   self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample())

    def compute_value_loss(self, states, actions, next_states, rewards, dones):
        """Compute the critic loss as MSE between Q(s, a) and the bootstrapped target.

        The target is `rewards + gamma * Q_target(s', Actor_target(s')) * (1 - dones)`.
        """
        next_actions = self.target_actor.act(next_states)
        assert next_actions.shape == actions.shape
        Q_target_next = self.target_critic.act(next_states, next_actions)
        Q_target = rewards + self.gamma * Q_target_next * (1 - dones)
        Q_expected = self.critic(states, actions)
        assert Q_expected.shape == Q_target.shape == Q_target_next.shape
        return mse_loss(Q_expected, Q_target)

    def compute_policy_loss(self, states) -> torch.Tensor:
        """Compute Policy loss based on provided states.

        Loss = Mean(-Q(s, _a) ),
        where _a is actor's estimate based on state, _a = Actor(s).

        Returns:
            loss: Negated mean of the critic's output.
        """
        pred_actions = self.actor(states)
        return -self.critic(states, pred_actions).mean()

    def learn(self, experiences) -> None:
        """Update critics and actors

        Parameters:
            experiences: Mapping with 'reward', 'done', 'state', 'action' and
                'next_state' entries, each convertible with `to_tensor`.
        """
        rewards = to_tensor(experiences['reward']).float().to(
            self.device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(
            self.device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(
            self.device)
        assert rewards.shape == dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size,
                                                     self.state_size)
        assert actions.shape == (self.batch_size, self.action_size)

        # Value (critic) optimization
        loss_critic = self.compute_value_loss(states, actions, next_states,
                                              rewards, dones)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(),
                                 self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.item())

        # Policy (actor) optimization
        loss_actor = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(),
                                 self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = loss_actor.item()

        # Soft update target weights
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def state_dict(self) -> Dict[str, dict]:
        """Describes agent's networks.

        Returns:
            state: (dict) Provides actors and critics states.

        """
        return {
            "actor": self.actor.state_dict(),
            "target_actor": self.target_actor.state_dict(),
            "critic": self.critic.state_dict(),
            "target_critic": self.target_critic.state_dict()
        }

    def log_metrics(self,
                    data_logger: DataLogger,
                    step: int,
                    full_log: bool = False):
        """Log actor and critic losses; with `full_log` also log layer histograms.

        Parameters:
            data_logger (DataLogger): Logger receiving values and histograms.
            step (int): Step value attached to every logged entry.
            full_log (bool): When True, weights and biases of the actor's and
                critic's layers are logged as histograms. Default: False.
        """
        data_logger.log_value("loss/actor", self._loss_actor, step)
        data_logger.log_value("loss/critic", self._loss_critic, step)

        if full_log:
            for idx, layer in enumerate(self.actor.layers):
                if hasattr(layer, "weight"):
                    data_logger.create_histogram(f"actor/layer_weights_{idx}",
                                                 layer.weight, step)
                if hasattr(layer, "bias") and layer.bias is not None:
                    data_logger.create_histogram(f"actor/layer_bias_{idx}",
                                                 layer.bias, step)

            for idx, layer in enumerate(self.critic.layers):
                if hasattr(layer, "weight"):
                    data_logger.create_histogram(f"critic/layer_weights_{idx}",
                                                 layer.weight, step)
                if hasattr(layer, "bias") and layer.bias is not None:
                    data_logger.create_histogram(f"critic/layer_bias_{idx}",
                                                 layer.bias, step)

    def get_state(self) -> AgentState:
        """Collect the agent's config, buffer state and network weights into an `AgentState`."""
        net = dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
        )
        network_state: NetworkState = NetworkState(net=net)
        return AgentState(model=self.name,
                          state_space=self.state_size,
                          action_space=self.action_size,
                          config=self._config,
                          buffer=self.buffer.get_state(),
                          network=network_state)

    def save_state(self, path: str) -> None:
        """Save the result of `get_state()` under `path` using `torch.save`."""
        agent_state = self.get_state()
        torch.save(agent_state, path)

    def load_state(self,
                   *,
                   path: Optional[str] = None,
                   agent_state: Optional[dict] = None):
        """Load config and network weights from a file or from a provided state.

        Parameters:
            path (str): Location of a state readable with `torch.load`.
                Used only when `agent_state` is not provided.
            agent_state (dict): Mapping with 'actor', 'critic', 'target_actor',
                'target_critic' and optionally 'config' entries.

        Raises:
            ValueError: When neither `path` nor `agent_state` is provided.
        """
        if path is None and agent_state is None:
            raise ValueError(
                "Either `path` or `agent_state` must be provided to load agent's state."
            )
        if agent_state is None:
            # NOTE(review): torch.load unpickles the file; only load trusted paths.
            agent_state = torch.load(path)
        # NOTE(review): save_state() stores an AgentState whose weights sit under
        # `network`, while this method indexes top-level 'actor'/'critic' keys;
        # confirm the two formats are meant to be compatible.
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)

        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
Exemple #10
0
class DDPGAgent(AgentType):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of popular Ornstein-Uhlenbeck (OU) process for noise this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 hidden_layers: Sequence[int] = (128, 128),
                 actor_lr: float = 2e-3,
                 actor_lr_decay: float = 0,
                 critic_lr: float = 2e-3,
                 critic_lr_decay: float = 0,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 clip: Tuple[int, int] = (-1, 1),
                 config=None,
                 device=None,
                 **kwargs):
        """
        Parameters:
            state_size (int): Number of input dimensions.
            action_size (int): Number of output dimensions.
            hidden_layers (sequence of ints): Hidden dimensions of the networks,
                unless overridden by `config['hidden_layers']`. Default: (128, 128).
            actor_lr (float): Learning rate for the actor. Default: 0.002.
            actor_lr_decay (float): Passed as `weight_decay` to the actor's Adam. Default: 0.
            critic_lr (float): Learning rate for the critic. Default: 0.002.
            critic_lr_decay (float): Passed as `weight_decay` to the critic's Adam. Default: 0.
            noise_scale (float): Value passed as `scale` to the Gaussian noise. Default: 0.2.
            noise_sigma (float): Value passed as `sigma` to the Gaussian noise. Default: 0.1.
            clip (tuple): Minimum and maximum action values. Default: (-1, 1).
            config (dict): Optional overrides for 'hidden_layers', 'action_scale',
                'gamma', 'tau', 'batch_size', 'buffer_size', 'warm_up',
                'update_freq' and 'number_updates'.
            device: Device for networks and noise. Default: `DEVICE`.
        """
        config = config if config is not None else dict()
        self.device = device if device is not None else DEVICE

        # Reason sequence initiation.
        # The value from `config` takes precedence and is the one the networks use.
        self.hidden_layers = config.get('hidden_layers', hidden_layers)
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=self.hidden_layers).to(
                                   self.device)
        self.critic = CriticBody(state_size,
                                 action_size,
                                 hidden_layers=self.hidden_layers).to(
                                     self.device)
        self.target_actor = ActorBody(state_size,
                                      action_size,
                                      hidden_layers=self.hidden_layers).to(
                                          self.device)
        self.target_critic = CriticBody(state_size,
                                        action_size,
                                        hidden_layers=self.hidden_layers).to(
                                            self.device)

        # Noise sequence initiation
        # Uses the resolved device so the noise matches the networks even when
        # the `device` argument was left as None.
        self.noise = GaussianNoise(shape=(action_size, ),
                                   mu=1e-8,
                                   sigma=noise_sigma,
                                   scale=noise_scale,
                                   device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=actor_lr,
                                    weight_decay=actor_lr_decay)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=critic_lr,
                                     weight_decay=critic_lr_decay)
        self.action_min = clip[0]
        self.action_max = clip[1]
        self.action_scale = config.get('action_scale', 1)

        self.gamma: float = float(config.get('gamma', 0.99))
        self.tau: float = float(config.get('tau', 0.02))
        self.batch_size: int = int(config.get('batch_size', 64))
        self.buffer_size: int = int(config.get('buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(config.get('warm_up', 0))
        self.update_freq: int = int(config.get('update_freq', 1))
        self.number_updates: int = int(config.get('number_updates', 1))

        # Final initialisation of networks and bookkeeping values.
        self.reset_agent()
        self.iteration = 0
        # Defined up front so log_writer() works before the first learn() call.
        self.actor_loss = 0.
        self.critic_loss = 0.

    def reset_agent(self) -> None:
        """Call `reset_parameters()` on the actor, critic and both target networks."""
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    def act(self, obs, noise: float = 0.0):
        """Return the actor's action for `obs` as a float32 numpy array.

        The action plus `noise`-scaled sampled noise is clamped to
        [action_min, action_max] and then multiplied by `action_scale`.
        `obs` must provide `astype` (e.g. a numpy array).
        """
        with torch.no_grad():
            obs = torch.tensor(obs.astype(np.float32)).to(self.device)
            action = self.actor(obs)
            action += noise * self.noise.sample()
            return self.action_scale * torch.clamp(
                action, self.action_min, self.action_max).cpu().numpy().astype(
                    np.float32)

    def target_act(self, obs, noise: float = 0.0):
        """Return the target actor's clamped action for `obs` as a float32 numpy array."""
        with torch.no_grad():
            obs = torch.tensor(obs).to(self.device)
            action = self.target_actor(obs) + noise * self.noise.sample()
            return torch.clamp(action, self.action_min,
                               self.action_max).cpu().numpy().astype(
                                   np.float32)

    def step(self, state, action, reward, next_state, done):
        """Store a transition and, when conditions are met, run learning updates.

        Learning happens only once `warm_up` iterations have passed, the buffer
        holds more than `batch_size` samples and the iteration is a multiple of
        `update_freq`.
        """
        self.iteration += 1
        self.buffer.add(state=state,
                        action=action,
                        reward=reward,
                        next_state=next_state,
                        done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration %
                                                   self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample_sars())

    def learn(self, samples):
        """Update the critic and the actor, then soft-update the target networks.

        Parameters:
            samples: Tuple of tensors unpacked as
                (states, actions, rewards, next_states, dones).
        """

        states, actions, rewards, next_states, dones = samples
        rewards = rewards.to(self.device)
        dones = dones.type(torch.int).to(self.device)
        states = states.to(self.device)
        next_states = next_states.to(self.device)
        actions = actions.to(self.device)

        # critic loss
        next_actions = self.target_actor(next_states)
        Q_target_next = self.target_critic(next_states, next_actions)
        Q_target = rewards + (self.gamma * Q_target_next * (1 - dones))
        Q_expected = self.critic(states, actions)
        critic_loss = mse_loss(Q_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping is currently disabled:
        # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self.critic_loss = critic_loss.item()

        # Compute actor loss
        pred_actions = self.actor(states)
        actor_loss = -self.critic(states, pred_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.actor_loss = actor_loss.item()

        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def describe_agent(self) -> Tuple[Any, Any, Any, Any]:
        """
        Returns network's weights in order:
        Actor, TargetActor, Critic, TargetCritic
        """
        return (self.actor.state_dict(), self.target_actor.state_dict(),
                self.critic.state_dict(), self.target_critic.state_dict())

    def log_writer(self, writer, episode):
        """Write the latest actor and critic losses to `writer` for `episode`."""
        writer.add_scalar("loss/actor", self.actor_loss, episode)
        writer.add_scalar("loss/critic", self.critic_loss, episode)

    def save_state(self, path: str):
        """Save the state dicts of all four networks under `path` using `torch.save`."""
        agent_state = dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
        )
        torch.save(agent_state, path)

    def load_state(self, path: str):
        """Load the state dicts of all four networks from `path`."""
        # NOTE(review): torch.load unpickles the file; only load trusted paths.
        agent_state = torch.load(path)
        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
Exemple #11
0
class TD3Agent(AgentBase):
    """
    Twin Delayed Deep Deterministic (TD3) Policy Gradient.

    In short, it's a slightly modified/improved version of the DDPG. Compared to the DDPG in this package,
    which uses Guassian noise, this TD3 uses Ornstein–Uhlenbeck process as the noise.
    """

    name = "TD3"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 **kwargs):
        """
        Parameters:
            state_size (int): Number of input dimensions.
            action_size (int): Number of output dimensions
            noise_scale (float): Value passed as `scale` to the OU noise process. Default: 0.2.
            noise_sigma (float): Value passed as `sigma` to the OU noise process. Default: 0.1.

        Keyword parameters:
            device: Device on which networks and noise are placed. Default: `DEVICE`.
            hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets,
                used for both the actor and the critic. Default: (128, 128).
            actor_lr (float): Learning rate for the actor (policy). Default: 0.003.
            critic_lr (float): Learning rate for the critic (value function). Default: 0.003.
            gamma (float): Discount value. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.02.
            max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 100.
            max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 100.
            batch_size (int): Number of samples used in learning. Default: 64.
            buffer_size (int): Maximum number of samples to store. Default: 1e6.
            warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
            update_freq (int): Number of steps between each learning step. Default 1.
            update_policy_freq (int): Number of steps between each policy update. Default: 1.
            number_updates (int): How many times to use learning step in the learning phase. Default: 1.
            noise_reset_freq (int): Number of steps between noise state resets. Default: 10000.
            action_min (float): Minimum returned action value. Default: -1.
            action_max (float): Maximum returned action value. Default: 1.
            action_scale (float): Multiplier value for action. Default: 1.

        """
        super().__init__(**kwargs)
        self.device = self._register_param(
            kwargs, "device", DEVICE)  # Default device is CUDA if available

        # Reason sequence initiation.
        self.state_size = state_size
        self.action_size = action_size

        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (128, 128)))
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=hidden_layers).to(self.device)
        self.critic = DoubleCritic(state_size,
                                   action_size,
                                   CriticBody,
                                   hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size,
                                      action_size,
                                      hidden_layers=hidden_layers).to(
                                          self.device)
        self.target_critic = DoubleCritic(state_size,
                                          action_size,
                                          CriticBody,
                                          hidden_layers=hidden_layers).to(
                                              self.device)

        # Noise sequence initiation
        self.noise = OUProcess(shape=action_size,
                               scale=noise_scale,
                               sigma=noise_sigma,
                               device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-3))
        critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-3))
        self.actor_optimizer = AdamW(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = AdamW(self.critic.parameters(), lr=critic_lr)
        # Read through _register_param like every other hyperparameter here.
        self.max_grad_norm_actor: float = float(
            self._register_param(kwargs, "max_grad_norm_actor", 100))
        self.max_grad_norm_critic: float = float(
            self._register_param(kwargs, "max_grad_norm_critic", 100))
        self.action_min = float(self._register_param(kwargs, 'action_min',
                                                     -1.))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1.))
        self.action_scale = float(
            self._register_param(kwargs, 'action_scale', 1.))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.update_policy_freq = int(
            self._register_param(kwargs, 'update_policy_freq', 1))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))
        self.noise_reset_freq = int(
            self._register_param(kwargs, 'noise_reset_freq', 10000))

        # Final initialisation of networks and bookkeeping values.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    @property
    def loss(self) -> Dict[str, float]:
        """Stored actor and critic losses, keyed by 'actor' and 'critic'."""
        return dict(actor=self._loss_actor, critic=self._loss_critic)

    @loss.setter
    def loss(self, value):
        # Any non-dict value is used for both losses.
        if not isinstance(value, dict):
            self._loss_actor = self._loss_critic = value
            return

        # A dict has to provide both entries.
        self._loss_actor = value['actor']
        self._loss_critic = value['critic']

    def reset_agent(self) -> None:
        """Call `reset_parameters()` on the actor, critic and both target networks."""
        network_names = ("actor", "critic", "target_actor", "target_critic")
        for network_name in network_names:
            getattr(self, network_name).reset_parameters()

    def act(self,
            state,
            epsilon: float = 0.0,
            training_mode=True) -> List[float]:
        """
        Agent acting on observations.

        When the training_mode is True (default) a noise is added to each action.

        Parameters:
            state: Observation passed through `to_tensor` and the actor network.
            epsilon (float): Probability threshold for returning a uniformly
                random action instead of the actor's output. Default: 0.
            training_mode (bool): Whether to add sampled noise to the action.

        Returns:
            action: (list float) Action values.
        """
        # Epsilon greedy
        if self._rng.random() < epsilon:
            # Map U[0, 1) samples onto [action_min, action_max).
            # NOTE(review): unlike the policy branch below, this branch does not
            # apply `action_scale` - confirm whether that is intended.
            action_span = self.action_max - self.action_min
            rnd_actions = torch.rand(
                self.action_size) * action_span + self.action_min
            return rnd_actions.tolist()

        with torch.no_grad():
            state = to_tensor(state).float().to(self.device)
            action = self.actor(state)
            if training_mode:
                action += self.noise.sample()
            return (self.action_scale * torch.clamp(action, self.action_min,
                                                    self.action_max)).tolist()

    def target_act(self, staten, noise: float = 0.0):
        """Return the target actor's clamped action for `staten` as a float32 numpy array.

        Sampled noise multiplied by `noise` is added before clamping to
        [action_min, action_max].
        """
        with torch.no_grad():
            observation = to_tensor(staten).float().to(self.device)
            raw_action = self.target_actor(observation)
            noisy_action = raw_action + noise * self.noise.sample()
            clamped = torch.clamp(noisy_action, self.action_min,
                                  self.action_max)
            return clamped.cpu().numpy().astype(np.float32)

    def step(self, state, action, reward, next_state, done):
        """Store a transition, reset noise periodically and trigger learning when due.

        Learning runs `number_updates` times when the warm-up has passed, the
        buffer holds more than `batch_size` samples and the iteration is a
        multiple of `update_freq` or of `update_policy_freq`.
        """
        self.iteration += 1
        self.buffer.add(state=state,
                        action=action,
                        reward=reward,
                        next_state=next_state,
                        done=done)

        if (self.iteration % self.noise_reset_freq) == 0:
            self.noise.reset_states()

        # Still warming up, or not enough samples collected yet.
        if self.iteration < self.warm_up or len(
                self.buffer) <= self.batch_size:
            return

        # Neither the value function nor the policy is due for an update.
        if (self.iteration % self.update_freq) != 0 and (
                self.iteration % self.update_policy_freq) != 0:
            return

        # Note: Inside `learn` there's a delayed policy update.
        #       Every `update_policy_freq` it will learn `number_updates` times.
        for _ in range(self.number_updates):
            self.learn(self.buffer.sample())

    def learn(self, experiences):
        """Update critics and actors

        The value function is updated when `iteration` is a multiple of
        `update_freq`; the policy and both target networks are updated when it
        is a multiple of `update_policy_freq`.
        """
        device = self.device
        rewards = to_tensor(experiences['reward']).float().to(device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(device)
        actions = to_tensor(experiences['action']).to(device)
        next_states = to_tensor(experiences['next_state']).float().to(device)

        if self.iteration % self.update_freq == 0:
            self._update_value_function(states, actions, rewards, next_states,
                                        dones)

        if self.iteration % self.update_policy_freq != 0:
            return

        self._update_policy(states)

        # Targets trail the online networks only after a policy update.
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def _update_value_function(self, states, actions, rewards, next_states,
                               dones):
        """Run one optimisation step of the critic on a batch.

        The bootstrap target is `rewards + gamma * min(...) * (1 - dones)`,
        where the minimum is taken over the values returned by
        `target_critic.act` for the target actor's next actions. The loss is
        the sum of the MSE of both critic outputs against that target.
        """
        # critic loss
        target_actions = self.target_actor.act(next_states)
        target_q_values = self.target_critic.act(next_states, target_actions)
        q_next = torch.min(*target_q_values)
        q_target = rewards + (self.gamma * q_next * (1 - dones))

        q1, q2 = self.critic(states, actions)
        critic_loss = mse_loss(q1, q_target) + mse_loss(q2, q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(),
                                 self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(critic_loss.item())

    def _update_policy(self, states):
        """Run one optimisation step of the actor on a batch of states.

        The loss is the negated mean of the first value the critic returns for
        the actor's own actions.
        """
        # Compute actor loss
        proposed_actions = self.actor(states)
        first_q = self.critic(states, proposed_actions)[0]
        actor_loss = -first_q.mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(),
                                 self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = actor_loss.item()

    def state_dict(self) -> Dict[str, dict]:
        """Describes agent's networks.

        Returns:
            state: (dict) The `state_dict()` of each network, keyed by
                "actor", "target_actor", "critic" and "target_critic".

        """
        return {
            "actor": self.actor.state_dict(),
            "target_actor": self.target_actor.state_dict(),
            "critic": self.critic.state_dict(),
            # Previously the target critic was *called* (a forward pass with no
            # inputs) instead of being asked for its state dict.
            "target_critic": self.target_critic.state_dict(),
        }

    def log_metrics(self,
                    data_logger: DataLogger,
                    step: int,
                    full_log: bool = False):
        """Log the latest actor and critic losses at `step`.

        `full_log` is accepted but not used by this method.
        """
        metrics = (
            ("loss/actor", self._loss_actor),
            ("loss/critic", self._loss_critic),
        )
        for metric_name, metric_value in metrics:
            data_logger.log_value(metric_name, metric_value, step)

    def get_state(self):
        """Return a dict with the state dicts of all four networks and the agent's config."""
        return {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict(),
            'config': self._config,
        }

    def save_state(self, path: str):
        """Write the result of `get_state()` to `path` with `torch.save`."""
        torch.save(self.get_state(), path)

    def load_state(self, path: str):
        """Restore config and network weights from a file read with `torch.load`.

        Every entry of the stored config is also written onto the instance as
        an attribute. A missing 'config' entry is treated as an empty config.
        """
        # NOTE(review): torch.load may unpickle arbitrary objects; only load
        # files from trusted sources.
        loaded = torch.load(path)
        self._config = loaded.get('config', {})
        self.__dict__.update(**self._config)

        for net_name in ('actor', 'critic', 'target_actor', 'target_critic'):
            getattr(self, net_name).load_state_dict(loaded[net_name])
Exemple #12
0
class MADDPGAgent(MultiAgentType):

    name = "MADDPG"

    def __init__(self, state_size: int, action_size: int, num_agents: int, **kwargs):
        """Initiation of the Multi Agent DDPG.

        All keywords are also passed to DDPG agents.

        Parameters:
            state_size (int): Dimensionality of the state.
            action_size (int): Dimensionality of the action.
            num_agents (int): Number of agents.

        Keyword Arguments:
            agent_names (iterable of str): Names of the agents.
                Default: "0", "1", ..., str(num_agents - 1).
            device: Device for the shared critic networks. Default: DEVICE.
            hidden_layers (tuple of ints): Shape for fully connected hidden layers. Default: (100, 100).
            noise_scale (float): Default: 0.5. Noise amplitude.
            noise_sigma (float): Default: 1.0. Noise variance.
            actor_lr (float): Default: 0.0003. Learning rate for actor network.
            critic_lr (float): Default: 0.0003. Learning rate for critic network.
            gamma (float): Default: 0.99. Discount value
            tau (float): Default: 0.02. Soft copy value.
            gradient_clip (optional float): Max norm for learning gradient. If None then no clip.
            batch_size (int): Default: 64. Number of samples per learning.
            buffer_size (int): Default: 1e6. Number of previous samples to remember.
            warm_up (int): Default: 0. Number of samples to see before start learning.
            update_freq (int): Default: 1. How many samples between learning sessions.
            number_updates (int): Default: 1. How many learning cycles per learning session.

        """

        self.device = self._register_param(kwargs, "device", DEVICE, update=True)
        self.state_size: int = state_size
        self.action_size = action_size
        self.num_agents: int = num_agents
        # Materialise the names into a list. The default used to be a one-shot
        # `map` iterator that was exhausted while building `self.agents`, so
        # `self._loss_actor` below ended up empty and `loss` raised KeyError.
        self.agent_names: List[str] = list(kwargs.get("agent_names", map(str, range(self.num_agents))))

        hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (100, 100), update=True))
        noise_scale = float(self._register_param(kwargs, 'noise_scale', 0.5))
        noise_sigma = float(self._register_param(kwargs, 'noise_sigma', 1.0))
        actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
        critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))

        # Merge the explicit values into the forwarded keywords instead of
        # passing them next to `**kwargs`.
        # NOTE(review): if `_register_param` leaves these keys in `kwargs`,
        # passing both would raise "got multiple values for keyword argument".
        agent_kwargs = dict(kwargs)
        agent_kwargs.update(
            actor_lr=actor_lr, critic_lr=critic_lr,
            noise_scale=noise_scale, noise_sigma=noise_sigma,
        )
        self.agents: Dict[str, DDPGAgent] = OrderedDict({
            agent_name: DDPGAgent(state_size, action_size, **agent_kwargs)
            for agent_name in self.agent_names
        })

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.gradient_clip: Optional[float] = self._register_param(kwargs, 'gradient_clip')

        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

        # One critic shared by all agents; it sees every agent's state and action.
        self.critic = CriticBody(num_agents*state_size, num_agents*action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_critic = CriticBody(num_agents*state_size, num_agents*action_size, hidden_layers=hidden_layers).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        hard_update(self.target_critic, self.critic)

        # Per-agent transitions staged by `step` until `commit` is called.
        self._step_data = {}
        self._loss_critic: float = float('inf')
        self._loss_actor: Dict[str, float] = {name: float('inf') for name in self.agent_names}
        self.reset()

    @property
    def loss(self) -> Dict[str, float]:
        """Flat dict of losses.

        For each agent it holds every entry of the agent's own `loss` under
        "<agent>_<name>", with "<agent>_actor" then replaced by the actor loss
        tracked by this class. "critic" holds the shared critic loss.
        """
        out = {}
        for agent_name, agent in self.agents.items():
            out.update({
                f"{agent_name}_{loss_name}": loss_value
                for loss_name, loss_value in agent.loss.items()
            })
            out[f"{agent_name}_actor"] = self._loss_actor[agent_name]
        out["critic"] = self._loss_critic
        return out

    def reset(self):
        """Set the iteration counter back to zero and reset all networks via `reset_agents`."""
        self.iteration = 0
        self.reset_agents()

    def reset_agents(self):
        """Reset every agent, then the shared critic and its target."""
        for agent in self.agents.values():
            agent.reset_agent()
        for shared_net in (self.critic, self.target_critic):
            shared_net.reset_parameters()

    def step(self, agent_name: str, state: StateType, action: ActionType, reward, next_state, done) -> None:
        """Stage one agent's transition; it reaches the buffer only when `commit` is called.

        A transition staged earlier for the same agent is overwritten.
        """
        self._step_data[agent_name] = {
            'state': state,
            'action': action,
            'reward': reward,
            'next_state': next_state,
            'done': done,
        }

    def commit(self):
        """Push the staged transitions of all agents to the buffer and learn when due.

        Values are gathered per field, in the order of `self.agents`, so a
        transition must have been staged with `step` for every agent
        (otherwise a KeyError is raised). Staged data is cleared afterwards.
        """
        fields = ('state', 'action', 'reward', 'next_state', 'done')
        step_data = defaultdict(list)
        for agent_name in self.agents:
            agent_data = self._step_data[agent_name]
            for field in fields:
                step_data[field].append(agent_data[field])

        self.buffer.add(**step_data)
        self._step_data = {}
        self.iteration += 1
        if self.iteration < self.warm_up:
            return

        if len(self.buffer) <= self.batch_size or (self.iteration % self.update_freq) != 0:
            return

        for _ in range(self.number_updates):
            # Every agent learns from the same sampled batch.
            samples = self.buffer.sample()
            for agent_name in self.agents:
                self.learn(samples, agent_name)
        self.update_targets()

    @torch.no_grad()
    def act(self, agent_name: str, states: List[StateType], noise: float=0.0) -> List[float]:
        """Get the action of a single, named agent.

        Parameters:
            agent_name: Key in `self.agents` of the agent that acts.
            states: States handed to the agent; converted to a tensor and
                flattened to one dimension first.
            noise: Scale for the noise to include

        Returns:
            action: Whatever the selected agent's `act` returns.

        """
        flat_states = torch.tensor(states).reshape(-1)
        return self.agents[agent_name].act(flat_states, noise)

    def __flatten_actions(self, actions):
        """View `actions` as rows of width `num_agents * action_size`."""
        joint_width = self.num_agents * self.action_size
        return actions.view(-1, joint_width)

    def learn(self, experiences, agent_name: str) -> None:
        """Update the shared critic and the named agent's actor from one batch.

        Parameters:
            experiences: Mapping with 'reward', 'done', 'state', 'action' and
                'next_state' entries; the shape asserts below require one value
                per agent per sample.
            agent_name: Key in `self.agents` of the agent to train.

        """

        # TODO: Just look at this mess.
        # Position of this agent along the per-agent axis of the batch.
        agent_number = list(self.agents).index(agent_name)
        agent_rewards = to_tensor(experiences['reward']).select(1, agent_number).unsqueeze(-1).float().to(self.device)
        agent_dones = to_tensor(experiences['done']).select(1, agent_number).unsqueeze(-1).type(torch.int).to(self.device)
        # `.float()` mirrors `next_states`, so both state tensors reach the
        # critics with the same dtype.
        states = to_tensor(experiences['state']).float().to(self.device).view(self.batch_size, self.num_agents, self.state_size)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(self.device).view(self.batch_size, self.num_agents, self.state_size)
        flat_states = states.view(-1, self.num_agents*self.state_size)
        flat_next_states = next_states.view(-1, self.num_agents*self.state_size)
        flat_actions = actions.view(-1, self.num_agents*self.action_size)
        assert agent_rewards.shape == agent_dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size, self.num_agents, self.state_size)
        assert actions.shape == (self.batch_size, self.num_agents, self.action_size)
        assert flat_actions.shape == (self.batch_size, self.num_agents*self.action_size)

        agent = self.agents[agent_name]

        # The bootstrap target is a constant for the critic update, so build it
        # without autograd. This leaves the critic's gradients unchanged and
        # stops gradients from accumulating in the target critic.
        with torch.no_grad():
            # Only this agent's action is replaced by its target actor's output;
            # the other agents keep the actions stored in the batch.
            next_actions = actions.detach().clone()
            next_actions.data[:, agent_number] = agent.target_actor(next_states[:, agent_number, :])
            assert next_actions.shape == (self.batch_size, self.num_agents, self.action_size)

            # critic loss
            Q_target_next = self.target_critic(flat_next_states, self.__flatten_actions(next_actions))
            Q_target = agent_rewards + (self.gamma * Q_target_next * (1 - agent_dones))
        Q_expected = self.critic(flat_states, flat_actions)
        loss_critic = F.mse_loss(Q_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        if self.gradient_clip:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.mean().item())

        # Compute actor loss
        # Writing through `.data` keeps the actor's output out of the autograd
        # graph, exactly as before.
        pred_actions = actions.detach().clone()
        pred_actions.data[:, agent_number] = agent.actor(states[:, agent_number, :])

        loss_actor = -self.critic(flat_states, self.__flatten_actions(pred_actions)).mean()
        agent.actor_optimizer.zero_grad()
        loss_actor.backward()
        agent.actor_optimizer.step()
        self._loss_actor[agent_name] = loss_actor.mean().item()

    def update_targets(self):
        """soft update targets

        Applies `soft_update` with `self.tau` to every agent's target actor and
        to the shared target critic.
        """
        tau = self.tau
        for agent in self.agents.values():
            soft_update(agent.target_actor, agent.actor, tau)
        soft_update(self.target_critic, self.critic, tau)

    def log_metrics(self, data_logger: DataLogger, step: int, full_log: bool=False):
        """Log the shared critic loss and each agent's own `loss` dict at `step`.

        `full_log` is accepted but not used by this method.
        """
        data_logger.log_value('loss/critic', self._loss_critic, step)
        for agent_name in self.agents:
            agent_loss = self.agents[agent_name].loss
            data_logger.log_values_dict(f"{agent_name}/loss", agent_loss, step)

    def get_state(self) -> Dict[str, dict]:
        """Returns agents' internal states

        The result holds this object's config under 'config' and, per agent
        name, a dict with the agent's `state_dict()` and its config.
        """
        agents_state = {'config': self._config}
        agents_state.update({
            agent_name: {"state": agent.state_dict(), "config": agent._config}
            for agent_name, agent in self.agents.items()
        })
        return agents_state

    def save_state(self, path: str):
        """Saves the result of :meth:`get_state` for this Multi Agent instance.

        The state is stored via PyTorch's :func:`save <torch.save>` function.

        Parameters:
            path: (str) String path to a location where the state is stored.

        """
        torch.save(self.get_state(), path)

    def load_state(self, *, path: Optional[str]=None, agent_state: Optional[dict]=None) -> None:
        """Loads the state into the Multi Agent.

        The state can be provided either via path to a file that contains the state,
        see :meth:`save_state <self.save_state>`, or directly via `agent_state`.
        When both are given, `agent_state` is used and `path` is ignored.

        Parameters:
            path: (str) A path where the state was saved via `save_state`.
            agent_state: (dict) Already loaded state kept in memory.

        Raises:
            ValueError: If neither `path` nor `agent_state` is provided.

        """
        # The guard used to test `agent_state` for truthiness, which rejected a
        # valid in-memory state and let the "nothing provided" case through.
        if path is None and agent_state is None:
            raise ValueError("Either `path` or `agent_state` must be provided to load agent's state.")
        if agent_state is None:
            # NOTE(review): torch.load may unpickle arbitrary objects; only load
            # files from trusted sources.
            agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)
        for agent_name, agent in self.agents.items():
            _agent_state = agent_state[agent_name]
            agent.load_state(agent_state=_agent_state["state"])
            # Set the config after `load_state`, which assigns the agent's
            # config itself.
            agent._config = _agent_state['config']
            agent.__dict__.update(**agent._config)

    def seed(self, seed: int) -> None:
        """Pass the same `seed` to the `seed` method of every agent."""
        for agent in self.agents.values():
            agent.seed(seed)

    def state_dict(self) -> Dict[str, Any]:
        """Map each agent's name to the result of that agent's `state_dict()`."""
        states = {}
        for name, agent in self.agents.items():
            states[name] = agent.state_dict()
        return states
Exemple #13
0
class DDPGAgent(AgentBase):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of popular Ornstein-Uhlenbeck (OU) process for noise this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 noise_scale: float = 0.2,
                 noise_sigma: float = 0.1,
                 **kwargs):
        """
        Parameters:
            state_size: Number of input dimensions.
            action_size: Number of output dimensions
            noise_scale (float): Added noise amplitude. Default: 0.2.
            noise_sigma (float): Added noise variance. Default: 0.1.

        Keyword parameters:
            device: Device that the networks and the noise are placed on. Default: DEVICE.
            hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (64, 64).
            gamma (float): Discount value. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.02.
            actor_lr (float): Learning rate for the actor (policy). Default: 0.0003.
            critic_lr (float): Learning rate for the critic (value function). Default: 0.0003.
            max_grad_norm_actor (float) Maximum norm value for actor gradient. Default: 10.
            max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 10.
            batch_size (int): Number of samples used in learning. Default: 64.
            buffer_size (int): Maximum number of samples to store. Default: 1e6.
            warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
            update_freq (int): Number of steps between each learning step. Default 1.
            number_updates (int): How many times to use learning step in the learning phase. Default: 1.
            action_min (float): Minimum returned action value. Default: -1.
            action_max (float): Maximum returned action value. Default: 1.
            action_scale (float): Multipler value for action. Default: 1.

        """
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)
        self.state_size = state_size
        self.action_size = action_size

        # Networks: online actor/critic and their targets, all of the same shape.
        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (64, 64)))
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=hidden_layers,
                               gate_out=torch.tanh).to(self.device)
        self.critic = CriticBody(state_size,
                                 action_size,
                                 hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size,
                                      action_size,
                                      hidden_layers=hidden_layers,
                                      gate_out=torch.tanh).to(self.device)
        self.target_critic = CriticBody(state_size,
                                        action_size,
                                        hidden_layers=hidden_layers).to(
                                            self.device)

        # Exploration noise: Gaussian, one value per action dimension.
        self.noise = GaussianNoise(shape=(action_size, ),
                                   mu=1e-8,
                                   sigma=noise_sigma,
                                   scale=noise_scale,
                                   device=self.device)

        # Start the targets as copies of the online networks.
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimizers, gradient clipping limits and action bounds.
        self.actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
        self.critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.critic_lr)
        self.max_grad_norm_actor = float(
            self._register_param(kwargs, "max_grad_norm_actor", 10.0))
        self.max_grad_norm_critic = float(
            self._register_param(kwargs, "max_grad_norm_critic", 10.0))
        self.action_min = float(self._register_param(kwargs, 'action_min', -1))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1))
        self.action_scale = float(
            self._register_param(kwargs, 'action_scale', 1))

        # Learning hyper-parameters and the replay buffer.
        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))

        # NOTE(review): reset_agent() resets the target networks separately from
        # the online ones, after the hard_update above. If reset_parameters()
        # is random, the targets no longer match the online networks - confirm.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    def reset_agent(self) -> None:
        """Call `reset_parameters()` on the actor, critic and both target networks."""
        networks = (self.actor, self.critic, self.target_actor,
                    self.target_critic)
        for network in networks:
            network.reset_parameters()

    @property
    def loss(self) -> Dict[str, float]:
        """Latest actor and critic losses under the keys 'actor' and 'critic'."""
        latest = {}
        latest['actor'] = self._loss_actor
        latest['critic'] = self._loss_critic
        return latest

    @loss.setter
    def loss(self, value):
        """Set the stored losses.

        A dict must provide the 'actor' and 'critic' keys; any other value is
        stored as both the actor and the critic loss.
        """
        if not isinstance(value, dict):
            self._loss_actor = self._loss_critic = value
            return
        self._loss_actor = value['actor']
        self._loss_critic = value['critic']

    def __eq__(self, o: object) -> bool:
        """Compare with `o` on base-class equality, config, buffer and network state.

        NOTE(review): `o` is assumed to expose `_config`, `buffer` and
        `get_network_state()`; for other objects an AttributeError is raised
        unless the base-class comparison already returned a falsy value.
        """
        return super().__eq__(o) \
            and self._config == o._config \
            and self.buffer == o.buffer \
            and self.get_network_state() == o.get_network_state()

    @torch.no_grad()
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        A sample from `self.noise`, scaled by `noise`, is added to the actor's
        output. The result is multiplied by `action_scale` and then clamped to
        [action_min, action_max].

        Returns:
            action: (list float) Action values.
        """
        obs_tensor = to_tensor(obs).float().to(self.device)
        raw_action = self.actor(obs_tensor)
        raw_action += noise * self.noise.sample()
        scaled_action = raw_action * self.action_scale
        bounded = torch.clamp(scaled_action, self.action_min, self.action_max)
        return bounded.cpu().numpy().tolist()

    def step(self, state, action, reward, next_state, done) -> None:
        """Record one transition and run learning updates when they are due.

        Learning runs `number_updates` times once warm-up has passed, the
        buffer holds more than `batch_size` samples and `iteration` is a
        multiple of `update_freq`.
        """
        self.iteration += 1
        transition = dict(state=state,
                          action=action,
                          reward=reward,
                          next_state=next_state,
                          done=done)
        self.buffer.add(**transition)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) <= self.batch_size or (
                self.iteration % self.update_freq) != 0:
            return

        for _ in range(self.number_updates):
            self.learn(self.buffer.sample())

    def compute_value_loss(self, states, actions, next_states, rewards, dones):
        """Compute the critic loss for a batch.

        Loss = MSE(Q(s, a), r + gamma * Q'(s', Actor'(s')) * (1 - done)),
        where Q' and Actor' are the target critic and the target actor.
        """
        target_actions = self.target_actor.act(next_states)
        assert target_actions.shape == actions.shape
        q_next = self.target_critic.act(next_states, target_actions)
        q_target = rewards + self.gamma * q_next * (1 - dones)
        q_expected = self.critic(states, actions)
        assert q_expected.shape == q_target.shape == q_next.shape
        return mse_loss(q_expected, q_target)

    def compute_policy_loss(self, states) -> torch.Tensor:
        """Compute Policy loss based on provided states.

        Loss = Mean(-Q(s, _a) ),
        where _a is actor's estimate based on state, _a = Actor(s).

        Returns:
            The loss, i.e. the negated mean of the critic's output. (The
            signature used to be annotated `-> None` although the loss is
            returned and used by `learn`.)
        """
        pred_actions = self.actor(states)
        return -self.critic(states, pred_actions).mean()

    def learn(self, experiences) -> None:
        """Update critics and actors

        Runs one critic step, one actor step and a soft update of both target
        networks on the given batch. The batch must hold exactly `batch_size`
        samples (enforced by the shape asserts).
        """
        device = self.device
        rewards = to_tensor(experiences['reward']).float().to(device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(device)
        actions = to_tensor(experiences['action']).to(device)
        next_states = to_tensor(experiences['next_state']).float().to(device)

        column_shape = (self.batch_size, 1)
        state_shape = (self.batch_size, self.state_size)
        assert rewards.shape == dones.shape == column_shape
        assert states.shape == next_states.shape == state_shape
        assert actions.shape == (self.batch_size, self.action_size)

        # Value (critic) optimization
        critic_loss = self.compute_value_loss(states, actions, next_states,
                                              rewards, dones)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(),
                                 self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(critic_loss.item())

        # Policy (actor) optimization
        actor_loss = self.compute_policy_loss(states)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(),
                                 self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = actor_loss.item()

        # Soft update target weights
        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def state_dict(self) -> Dict[str, dict]:
        """Describes agent's networks.

        Returns:
            state: (dict) The `state_dict()` of each network, keyed by
                "actor", "target_actor", "critic" and "target_critic".

        """
        networks = (
            ("actor", self.actor),
            ("target_actor", self.target_actor),
            ("critic", self.critic),
            ("target_critic", self.target_critic),
        )
        return {key: network.state_dict() for key, network in networks}

    def log_metrics(self,
                    data_logger: DataLogger,
                    step: int,
                    full_log: bool = False):
        """Log the latest losses and, optionally, histograms of network layers.

        With `full_log` set, a histogram is created for the weight and the
        (non-None) bias of every layer in `actor.layers` and `critic.layers`
        that has such an attribute.
        """
        data_logger.log_value("loss/actor", self._loss_actor, step)
        data_logger.log_value("loss/critic", self._loss_critic, step)

        if not full_log:
            return

        for layer_idx, actor_layer in enumerate(self.actor.layers):
            if hasattr(actor_layer, "weight"):
                data_logger.create_histogram(
                    f"actor/layer_weights_{layer_idx}", actor_layer.weight, step)
            has_bias = hasattr(actor_layer, "bias") and actor_layer.bias is not None
            if has_bias:
                data_logger.create_histogram(
                    f"actor/layer_bias_{layer_idx}", actor_layer.bias, step)

        for layer_idx, critic_layer in enumerate(self.critic.layers):
            if hasattr(critic_layer, "weight"):
                data_logger.create_histogram(
                    f"critic/layer_weights_{layer_idx}", critic_layer.weight, step)
            has_bias = hasattr(critic_layer, "bias") and critic_layer.bias is not None
            if has_bias:
                data_logger.create_histogram(
                    f"critic/layer_bias_{layer_idx}", critic_layer.bias, step)

    def get_state(self) -> AgentState:
        """Build an AgentState from the agent's name, sizes, config, buffer and networks.

        The buffer state and the network state are deep-copied; the config is
        passed as is.
        """
        buffer_snapshot = copy.deepcopy(self.buffer.get_state())
        network_snapshot = copy.deepcopy(self.get_network_state())
        return AgentState(
            model=self.name,
            state_space=self.state_size,
            action_space=self.action_size,
            config=self._config,
            buffer=buffer_snapshot,
            network=network_snapshot,
        )

    def get_network_state(self) -> NetworkState:
        """Wrap the state dicts of all four networks in a NetworkState."""
        net = {
            'actor': self.actor.state_dict(),
            'target_actor': self.target_actor.state_dict(),
            'critic': self.critic.state_dict(),
            'target_critic': self.target_critic.state_dict(),
        }
        return NetworkState(net=net)

    @staticmethod
    def from_state(state: AgentState) -> AgentBase:
        """Create a DDPGAgent from an AgentState.

        The agent is constructed from a shallow copy of `state.config` extended
        with the state and action sizes. The network and the buffer are
        restored only when present in `state`.
        """
        config = copy.copy(state.config)
        config['state_size'] = state.state_space
        config['action_size'] = state.action_space

        agent = DDPGAgent(**config)
        if state.network is not None:
            agent.set_network(state.network)
        if state.buffer is not None:
            agent.set_buffer(state.buffer)
        return agent

    def set_buffer(self, buffer_state: BufferState) -> None:
        """Replace the agent's buffer with one built by `BufferFactory.from_state`."""
        self.buffer = BufferFactory.from_state(buffer_state)

    def set_network(self, network_state: NetworkState) -> None:
        """Load the weights of all four networks from `network_state.net`.

        Only the actor's entry is deep-copied before loading.
        """
        net = network_state.net
        actor_weights = copy.deepcopy(net['actor'])
        self.actor.load_state_dict(actor_weights)
        self.target_actor.load_state_dict(net['target_actor'])
        self.critic.load_state_dict(net['critic'])
        self.target_critic.load_state_dict(net['target_critic'])

    def save_state(self, path: str) -> None:
        """Write the result of `get_state()` to `path` with `torch.save`."""
        torch.save(self.get_state(), path)

    def load_state(self,
                   *,
                   path: Optional[str] = None,
                   agent_state: Optional[dict] = None):
        """Restore config and network weights from a file or an in-memory state.

        When both arguments are given, `agent_state` is used and `path` is
        ignored. Every entry of the stored config is also written onto the
        instance as an attribute; a missing 'config' entry is treated as an
        empty config.

        Parameters:
            path: (str) Path to a file readable with `torch.load`.
            agent_state: (dict) Already loaded state with the keys 'actor',
                'critic', 'target_actor', 'target_critic' and, optionally,
                'config'.

        Raises:
            ValueError: If neither `path` nor `agent_state` is provided.
        """
        # The guard used to test `agent_state` for truthiness, which rejected a
        # valid in-memory state and let the "nothing provided" case through.
        if path is None and agent_state is None:
            raise ValueError(
                "Either `path` or `agent_state` must be provided to load agent's state."
            )
        if agent_state is None:
            # NOTE(review): torch.load may unpickle arbitrary objects; only load
            # files from trusted sources.
            agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)

        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
Exemple #14
0
class SACAgent(AgentType):
    """
    Soft Actor-Critic.

    Uses stochastic policy and dual value network (two critics).

    Based on
    "Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor"
    by Haarnoja et al. (2018) (http://arxiv.org/abs/1801.01290).
    """

    name = "SAC"

    def __init__(self,
                 state_size: int,
                 action_size: int,
                 hidden_layers: Sequence[int] = (128, 128),
                 actor_lr: float = 2e-3,
                 critic_lr: float = 2e-3,
                 clip: Tuple[int, int] = (-1, 1),
                 alpha: float = 0.2,
                 device=None,
                 **kwargs):
        """
        Parameters:
            state_size: Size of the state passed to the actor and critics.
            action_size: Size of the action produced by the actor.
            hidden_layers: Hidden layer sizes shared by actor and critics.
            actor_lr: Learning rate of the actor (and policy std) optimizer.
            critic_lr: Learning rate of the double-critic optimizer.
            clip: ``(min, max)`` bounds applied to every returned action.
            alpha: Initial entropy coefficient; stored as ``log_alpha``.
            device: Torch device; falls back to the module-level ``DEVICE``.

        Keyword Arguments:
            alpha_lr: Learning rate for ``log_alpha``. Alpha is only tuned
                when this is provided (default: None).
            action_scale: Multiplier applied to actions before clipping (default: 1).
            max_grad_norm_alpha: Gradient norm limit for alpha (default: 1.0).
            max_grad_norm_actor: Gradient norm limit for the actor (default: 20.0).
            max_grad_norm_critic: Gradient norm limit for the critics (default: 20.0).
            gamma: Discount factor (default: 0.99).
            tau: Soft-update coefficient for the target critics (default: 0.02).
            batch_size: Number of samples per learning step (default: 64).
            buffer_size: Capacity of the experience buffer (default: 1e6).
            warm_up: Number of initial iterations without learning (default: 0).
            update_freq: Learn every this many iterations (default: 1).
            number_updates: Learning passes per learning iteration (default: 1).
        """
        self.device = device if device is not None else DEVICE
        self.action_size = action_size

        # Reason sequence initiation.
        # `hidden_layers` and `alpha` are named parameters, so they can never
        # arrive through **kwargs; the named values are used directly.
        self.hidden_layers = hidden_layers
        self.policy = GaussianPolicy(action_size).to(self.device)
        self.actor = ActorBody(state_size,
                               action_size,
                               hidden_layers=hidden_layers).to(self.device)

        self.double_critic = DoubleCritic(state_size, action_size,
                                          hidden_layers).to(self.device)
        self.target_double_critic = DoubleCritic(state_size, action_size,
                                                 hidden_layers).to(self.device)
        # The target critics are synchronised with the online critics inside
        # `reset_agent()`, which is called at the end of this constructor.

        # Optimization sequence initiation.
        self.target_entropy = -action_size
        self.alpha_lr = kwargs.get("alpha_lr")
        self.log_alpha = torch.tensor(np.log(alpha),
                                      device=self.device,
                                      requires_grad=True)

        self.actor_params = list(self.actor.parameters()) + [self.policy.std]
        self.critic_params = list(self.double_critic.parameters())
        self.actor_optimizer = optim.Adam(self.actor_params, lr=actor_lr)
        self.critic_optimizer = optim.Adam(list(self.critic_params),
                                           lr=critic_lr)
        if self.alpha_lr is not None:
            self.alpha_optimizer = optim.Adam([self.log_alpha],
                                              lr=self.alpha_lr)
        self.action_min = clip[0]
        self.action_max = clip[1]
        self.action_scale = kwargs.get('action_scale', 1)
        self.max_grad_norm_alpha: float = float(
            kwargs.get("max_grad_norm_alpha", 1.0))
        self.max_grad_norm_actor: float = float(
            kwargs.get("max_grad_norm_actor", 20.0))
        self.max_grad_norm_critic: float = float(
            kwargs.get("max_grad_norm_critic", 20.0))

        self.gamma: float = float(kwargs.get('gamma', 0.99))
        self.tau: float = float(kwargs.get('tau', 0.02))
        self.batch_size: int = int(kwargs.get('batch_size', 64))
        self.buffer_size: int = int(kwargs.get('buffer_size', int(1e6)))
        self.memory = Buffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(kwargs.get('warm_up', 0))
        self.update_freq: int = int(kwargs.get('update_freq', 1))
        self.number_updates: int = int(kwargs.get('number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0

        self.actor_loss = np.nan
        self.critic_loss = np.nan

    @property
    def alpha(self):
        """Entropy coefficient, i.e. ``exp(log_alpha)``."""
        return self.log_alpha.exp()

    def reset_agent(self) -> None:
        """Re-initialise actor and critics and re-sync the target critics."""
        self.actor.reset_parameters()
        self.double_critic.reset_parameters()
        self.target_double_critic.reset_parameters()
        # Resetting the two critic networks independently would leave the
        # target out of sync with the online critics, so copy explicitly.
        hard_update(self.target_double_critic, self.double_critic)

    def describe_agent(self) -> Sequence[dict]:
        """
        Returns network's weights in order:
        Actor, DoubleCritic, TargetDoubleCritic
        """
        return (self.actor.state_dict(), self.double_critic.state_dict(),
                self.target_double_critic.state_dict())

    def act(self, state, epsilon: float = 0.0, deterministic=False):
        """Select an action for ``state``.

        Parameters:
            state: Observation; reshaped to a single row and cast to float32,
                so it must provide ``reshape`` and ``astype``.
            epsilon: Probability of returning a random action instead of the
                policy's action. The random action is drawn with
                ``np.random.random`` (values in [0, 1)) and then scaled.
            deterministic: When True, return the actor's output without
                sampling from the policy distribution.

        Returns:
            Flat numpy array scaled by ``action_scale`` and clipped to
            ``[action_min, action_max]``.
        """
        if np.random.random() < epsilon:
            return np.clip(
                self.action_scale * np.random.random(size=self.action_size),
                self.action_min, self.action_max)

        with torch.no_grad():
            state = torch.tensor(state.reshape(1, -1).astype(np.float32)).to(
                self.device)
            action_mu = self.actor.act(state.detach())

            if deterministic:
                action = action_mu
            else:
                action = self.policy(action_mu).sample()

            action = action.cpu().numpy().flatten()
            return np.clip(action * self.action_scale, self.action_min,
                           self.action_max)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, when due, run the learning updates.

        Learning is skipped while ``iteration < warm_up``; afterwards it runs
        ``number_updates`` times whenever the memory holds more than
        ``batch_size`` entries and the iteration is a multiple of ``update_freq``.
        """
        self.iteration += 1
        self.memory.add(
            state=state,
            action=action,
            reward=reward,
            next_state=next_state,
            done=done,
        )

        if self.iteration < self.warm_up:
            return

        if len(self.memory) > self.batch_size and (self.iteration %
                                                   self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.memory.sample())

    def _update_value_function(self, states, actions, rewards, next_states,
                               dones):
        """Run one optimisation step of both critics on the soft Bellman target."""
        # The whole target is a constant w.r.t. the critic loss, so the next
        # action is sampled under no_grad as well (no autograd graph needed).
        with torch.no_grad():
            action_mu = self.actor(next_states)
            dist = self.policy(action_mu)
            next_actions = dist.rsample()
            # log_prob = dist.log_prob(next_actions).sum(-1, keepdim=True)
            log_prob = dist.log_prob(next_actions).unsqueeze(1)

            # Bootstrap from the slowly moving *target* critics. Evaluating the
            # online critics here would leave the soft-updated target network
            # unused.
            Q_target_next, Q2_target_next = self.target_double_critic.act(
                next_states, next_actions)
            V_target = torch.min(Q_target_next,
                                 Q2_target_next) - self.alpha * log_prob
            Q_target = rewards + self.gamma * V_target * (1 - dones)
            Q_target = Q_target.type(torch.float32)

        Q_expected, Q2_expected = self.double_critic(states, actions)
        critic_loss = mse_loss(Q_expected, Q_target) + mse_loss(
            Q2_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_params, self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self.critic_loss = critic_loss.item()

    def _update_policy(self, states):
        """Run one optimisation step of the actor and, if enabled, of alpha."""
        # Compute actor loss
        actions_mu = self.actor(states)
        dist = self.policy(actions_mu)
        pred_actions = dist.rsample()
        log_prob = dist.log_prob(pred_actions).unsqueeze(1)

        Q_actor = torch.min(*self.double_critic(states, pred_actions))
        actor_loss = (self.alpha * log_prob - Q_actor).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        clip_grad_norm_(self.actor_params, self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self.actor_loss = actor_loss.item()

        # Update alpha (only when an alpha learning rate was configured)
        if self.alpha_lr is not None:
            self.alpha_optimizer.zero_grad()
            # log_prob is detached so that only log_alpha receives gradient.
            alpha_loss = (self.alpha *
                          (-log_prob - self.target_entropy).detach()).mean()
            alpha_loss.backward()
            clip_grad_norm_(self.log_alpha, self.max_grad_norm_alpha)
            self.alpha_optimizer.step()

    def learn(self, samples):
        """Update critics, actor and target critics from one sampled batch.

        Parameters:
            samples: Mapping with the keys ``reward``, ``done``, ``state``,
                ``next_state`` and ``action``.
        """
        rewards = torch.tensor(samples['reward'],
                               device=self.device).unsqueeze(1)
        dones = torch.tensor(samples['done'],
                             dtype=torch.int,
                             device=self.device).unsqueeze(1)
        states = torch.tensor(samples['state'],
                              dtype=torch.float32,
                              device=self.device)
        next_states = torch.tensor(samples['next_state'],
                                   dtype=torch.float32,
                                   device=self.device)
        actions = torch.tensor(samples['action'],
                               dtype=torch.float32,
                               device=self.device)

        self._update_value_function(states, actions, rewards, next_states,
                                    dones)
        self._update_policy(states)

        soft_update(self.target_double_critic, self.double_critic, self.tau)

    def log_writer(self, writer, episode):
        """Write losses, the current alpha and each policy std entry to ``writer``."""
        writer.add_scalar("loss/actor", self.actor_loss, episode)
        writer.add_scalar("loss/critic", self.critic_loss, episode)
        writer.add_scalar("loss/alpha", self.alpha, episode)

        for idx, std in enumerate(self.policy.std):
            writer.add_scalar(f"policy/std_{idx}", std, episode)

    def save_state(self, path: str):
        """Save network weights, policy parameters and ``log_alpha`` to ``path``."""
        agent_state = dict(
            actor=self.actor.state_dict(),
            double_critic=self.double_critic.state_dict(),
            target_double_critic=self.target_double_critic.state_dict(),
            # The policy (its std is trained with the actor) and the entropy
            # coefficient are part of the learned state as well.
            policy=self.policy.state_dict(),
            log_alpha=self.log_alpha.detach().clone(),
        )
        torch.save(agent_state, path)

    def load_state(self, path: str):
        """Load a state written by ``save_state``.

        Files that lack the ``policy`` / ``log_alpha`` entries are still
        accepted; those parts then keep their current values.
        """
        agent_state = torch.load(path)
        self.actor.load_state_dict(agent_state['actor'])
        self.double_critic.load_state_dict(agent_state['double_critic'])
        self.target_double_critic.load_state_dict(
            agent_state['target_double_critic'])
        if 'policy' in agent_state:
            self.policy.load_state_dict(agent_state['policy'])
        if 'log_alpha' in agent_state:
            # Copy in place so the alpha optimizer keeps tracking this tensor.
            with torch.no_grad():
                self.log_alpha.copy_(agent_state['log_alpha'])
Exemple #15
0
class PPOAgent(AgentType):
    """
    Agent trained with a clipped probability-ratio policy loss (PPO).

    Transitions are collected into a rollout memory; every ``rollout_length``
    steps the actor and critic are updated and the memory is cleared.
    """

    name = "PPO"

    def __init__(self, state_size: int, action_size: int, hidden_layers=(300, 200), config=None, device=None, **kwargs):
        """
        Parameters:
            state_size: Size of the state passed to the actor and critic.
            action_size: Size of the action produced by the actor.
            hidden_layers: Hidden layer sizes; ``config['hidden_layers']`` wins when present.
            config: Optional mapping with hyperparameters. Recognised keys:
                actor_lr (3e-4), critic_lr (1e-3), gamma (0.99), ppo_ratio_clip (0.2),
                rollout_length (48), batch_size (rollout_length // 2), number_updates (5),
                entropy_weight (0.0005), value_loss_weight (1.0), action_scale (1),
                action_min (-2), action_max (2), max_grad_norm_actor (100.0),
                max_grad_norm_critic (100.0), hidden_layers.
            device: Torch device; falls back to the module-level ``DEVICE``.
            **kwargs: Accepted but not used by this constructor.
        """
        config = config if config is not None else {}
        self.device = device if device is not None else DEVICE

        self.state_size = state_size
        self.action_size = action_size
        self.iteration = 0

        self.actor_lr = float(config.get('actor_lr', 3e-4))
        self.critic_lr = float(config.get('critic_lr', 1e-3))
        self.gamma: float = float(config.get("gamma", 0.99))
        self.ppo_ratio_clip: float = float(config.get("ppo_ratio_clip", 0.2))

        self.rollout_length: int = int(config.get("rollout_length", 48))  # "Much less than the episode length"
        self.batch_size: int = int(config.get("batch_size", self.rollout_length // 2))
        self.number_updates: int = int(config.get("number_updates", 5))
        self.entropy_weight: float = float(config.get("entropy_weight", 0.0005))
        self.value_loss_weight: float = float(config.get("value_loss_weight", 1.0))

        # Holds the value and log-probability computed in `act` until `step` stores them.
        self.local_memory_buffer = {}
        self.memory = ReplayBuffer(batch_size=self.batch_size, buffer_size=self.rollout_length)

        self.action_scale: float = float(config.get("action_scale", 1))
        self.action_min: float = float(config.get("action_min", -2))
        self.action_max: float = float(config.get("action_max", 2))
        self.max_grad_norm_actor: float = float(config.get("max_grad_norm_actor", 100.0))
        self.max_grad_norm_critic: float = float(config.get("max_grad_norm_critic", 100.0))

        self.hidden_layers = config.get('hidden_layers', hidden_layers)
        self.actor = ActorBody(state_size, action_size, self.hidden_layers).to(self.device)
        self.critic = CriticBody(state_size, action_size, self.hidden_layers).to(self.device)
        self.policy = GaussianPolicy(action_size).to(self.device)

        self.actor_params = list(self.actor.parameters()) + [self.policy.std]
        # `parameters()` returns a one-shot iterator. It must be materialised,
        # otherwise the optimizer below exhausts it and the gradient clipping
        # in `learn` silently operates on nothing.
        self.critic_params = list(self.critic.parameters())
        self.actor_opt = torch.optim.SGD(self.actor_params, lr=self.actor_lr)
        self.critic_opt = torch.optim.SGD(self.critic_params, lr=self.critic_lr)

        # Defined up front so `log_writer` works before the first `learn` call.
        self.actor_loss = np.nan
        self.critic_loss = np.nan

    def __clear_memory(self):
        """Drop all collected transitions by creating a fresh rollout memory."""
        self.memory = ReplayBuffer(batch_size=self.batch_size, buffer_size=self.rollout_length)

    def act(self, state, noise=0):
        """Sample an action for ``state`` and cache its value and log-probability.

        Parameters:
            state: Observation; reshaped to a single row and cast to float32,
                so it must provide ``reshape`` and ``astype``.
            noise: Unused.

        Returns:
            Flat numpy array scaled by ``action_scale`` and clipped to
            ``[action_min, action_max]``.
        """
        with torch.no_grad():
            state = torch.tensor(state.reshape(1, -1).astype(np.float32)).to(self.device)
            action_mu = self.actor(state)
            value = self.critic(state, action_mu)

            dist = self.policy(action_mu)
            action = dist.sample()
            logprob = dist.log_prob(action)

            # Picked up by the following `step` call.
            self.local_memory_buffer['value'] = value
            self.local_memory_buffer['logprob'] = logprob

            action = action.cpu().numpy().flatten()
            return np.clip(action*self.action_scale, self.action_min, self.action_max)

    def step(self, states, actions, rewards, next_state, done, **kwargs):
        """Store a transition; update and clear the memory every ``rollout_length`` steps.

        ``next_state`` and ``**kwargs`` are accepted but not stored. The value
        and log-probability come from the most recent ``act`` call.
        """
        self.iteration += 1

        self.memory.add(
            state=states, action=actions, reward=rewards, done=done,
            logprob=self.local_memory_buffer['logprob'], value=self.local_memory_buffer['value']
        )

        if self.iteration % self.rollout_length == 0:
            self.update()
            self.__clear_memory()

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield ``batch_size // mini_batch_size`` random mini-batches.

        Indices are drawn without replacement from ``range(self.batch_size)``.
        NOTE(review): this presumes every input holds ``batch_size`` rows - confirm
        against what ``ReplayBuffer.sample`` returns.
        """
        all_indices = np.arange(self.batch_size)
        for _ in range(self.batch_size // mini_batch_size):
            rand_ids = np.random.choice(all_indices, mini_batch_size, replace=False)
            yield states[rand_ids], actions[rand_ids], log_probs[rand_ids], returns[rand_ids], advantage[rand_ids]

    def _unpack_experiences(self, experiences):
        """Regroup an iterable of experience objects into per-field lists.

        Returns:
            Mapping with the keys ``rewards``, ``dones``, ``values``,
            ``states``, ``actions`` and ``logprobs``.
        """
        unpacked_experiences = defaultdict(list)
        for experience in experiences:
            unpacked_experiences['rewards'].append(experience.reward)
            unpacked_experiences['dones'].append(experience.done)
            unpacked_experiences['values'].append(experience.value)
            unpacked_experiences['states'].append(experience.state)
            unpacked_experiences['actions'].append(experience.action)
            unpacked_experiences['logprobs'].append(experience.logprob)

        return unpacked_experiences

    def update(self):
        """Sample the memory, compute returns and advantages, and run the learning passes."""
        experiences = self.memory.sample()
        rewards = torch.tensor(experiences['reward']).to(self.device)
        dones = torch.tensor(experiences['done']).type(torch.int).to(self.device)
        states = torch.tensor(experiences['state']).to(self.device)
        actions = torch.tensor(experiences['action']).to(self.device)
        values = torch.cat(experiences['value'])
        log_probs = torch.cat(experiences['logprob'])

        # Presumably discounted, normalised returns - see `revert_norm_returns`.
        returns = revert_norm_returns(rewards, dones, self.gamma, device=self.device).unsqueeze(1)
        advantages = returns - values

        for _ in range(self.number_updates):
            for samples in self.ppo_iter(self.batch_size, states, actions, log_probs, returns, advantages):
                self.learn(samples)

    def learn(self, samples):
        """Run one actor step (clipped ratio loss + entropy bonus) and one critic step.

        Parameters:
            samples: Tuple ``(state, action, old_log_probs, return_, advantage)``
                as yielded by ``ppo_iter``.
        """
        state, action, old_log_probs, return_, advantage = samples

        action_mu = self.actor(state.detach())
        dist = self.policy(action_mu)
        # The actor output is detached so the value loss does not reach the actor.
        value = self.critic(state.detach(), action_mu.detach())

        entropy = dist.entropy()
        new_log_probs = dist.log_prob(action.detach())

        r_theta = (new_log_probs - old_log_probs).exp()
        r_theta_clip = torch.clamp(r_theta, 1.0 - self.ppo_ratio_clip, 1.0 + self.ppo_ratio_clip)

        policy_loss = -torch.min(r_theta * advantage, r_theta_clip * advantage).mean()
        entropy_loss = -self.entropy_weight * entropy.mean()
        actor_loss = policy_loss + entropy_loss

        self.actor_opt.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor_params, self.max_grad_norm_actor)
        self.actor_opt.step()
        self.actor_loss = actor_loss.item()
        # loss = policy_loss + value_loss + entropy_loss

        value_loss = self.value_loss_weight * 0.5 * (return_ - value).pow(2).mean()

        self.critic_opt.zero_grad()
        value_loss.backward()
        nn.utils.clip_grad_norm_(self.critic_params, self.max_grad_norm_critic)
        self.critic_opt.step()
        self.critic_loss = value_loss.mean().item()

    def log_writer(self, writer, episode):
        """Write the latest actor and critic losses to ``writer``."""
        writer.add_scalar("loss/actor", self.actor_loss, episode)
        writer.add_scalar("loss/critic", self.critic_loss, episode)

    def save_state(self, path: str):
        """Save policy, actor and critic weights to ``path``."""
        agent_state = dict(
            policy=self.policy.state_dict(),
            # Without the actor and critic a restored agent would start from
            # freshly initialised networks.
            actor=self.actor.state_dict(),
            critic=self.critic.state_dict(),
        )
        torch.save(agent_state, path)

    def load_state(self, path: str):
        """Load a state written by ``save_state``.

        Files that contain only the ``policy`` entry are still accepted; the
        actor and critic then keep their current weights.
        """
        agent_state = torch.load(path)
        self.policy.load_state_dict(agent_state['policy'])
        if 'actor' in agent_state:
            self.actor.load_state_dict(agent_state['actor'])
        if 'critic' in agent_state:
            self.critic.load_state_dict(agent_state['critic'])