class MADDPGAgent():
    def __init__(self, seed, checkpoint_filename=None):

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed)
        self.t = 0

        self.agents = [
            DDPGAgent(index, NUM_AGENTS, seed, DEVICE)
            for index in range(NUM_AGENTS)
        ]

        if checkpoint_filename:
            for i, to_load in enumerate(self.agents):
                f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights"
                actor_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights",
                    map_location=DEVICE)
                critic_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_critic_{i}.weights",
                    map_location=DEVICE)
                to_load.actor_local.load_state_dict(actor_file)
                to_load.actor_target.load_state_dict(actor_file)
                to_load.critic_local.load_state_dict(critic_file)
                to_load.critic_target.load_state_dict(critic_file)
            print(f'Files loaded with prefix {checkpoint_filename}')

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)
        self.t = (self.t + 1) % UPDATE_FREQUENCY
        if self.t == 0 and (len(self.memory) > BATCH_SIZE):
            experiences = [self.memory.sample() for _ in range(NUM_AGENTS)]
            self.learn(experiences, GAMMA)

    def act(self, all_states, random):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, random=random)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_actions = []
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(DEVICE)
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            all_actions.append(agent.actor_local(state))
            all_next_actions.append(agent.actor_target(next_state))
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)
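The shared `ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed)` this first example constructs is not shown. Below is a minimal deque-based sketch of the interface it appears to assume (`add`, `sample`, `__len__`); the constructor signature and the tensor-stacking details are assumptions, not the original helper.

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer of experience tuples shared by all agents (sketch)."""

    def __init__(self, buffer_size, batch_size, device, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        def stack(field):
            # Stack one field across the sampled experiences into a float tensor.
            return torch.from_numpy(
                np.vstack([getattr(e, field) for e in experiences])
            ).float().to(self.device)

        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(
                np.uint8)).float().to(self.device)
        return (stack("state"), stack("action"), stack("reward"),
                stack("next_state"), dones)

    def __len__(self):
        return len(self.memory)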
Example #2
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """ Initialize multiple Agents each with a Actor-Critic network
            but they share the replay buffer to learn from experience
        """
        self.num_agents = num_agents
        self.agents = []
        for _ in range(num_agents):
            agent = Agent(state_size, action_size, random_seed)
            self.agents.append(agent)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, add_noise=True):
        clipped_actions = []
        for state, agent in zip(states, self.agents):
            clipped_actions.append(agent.act(state, add_noise))
        return clipped_actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def saveCheckPoints(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoints/actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoints/critic_agent_{i}.pth")

    def loadCheckPoints(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load(f"checkpoints/actor_agent_{i}.pth"))
            agent.critic_local.load_state_dict(
                torch.load(f"checkpoints/critic_agent_{i}.pth"))

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for agent in self.agents:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
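A hedged sketch of how a wrapper like the `MADDPG` class above is typically driven. It assumes a gym-style multi-agent environment whose `reset()` and `step()` return one row of observations, one reward, and one done flag per agent; the environment handle and episode count are placeholders, not part of the original example.

import numpy as np


def train_maddpg(env, num_agents, state_size, action_size, n_episodes=500):
    """Episode loop driving the MADDPG wrapper defined above."""
    multi_agent = MADDPG(num_agents, state_size, action_size, random_seed=0)
    episode_scores = []

    for _ in range(n_episodes):
        states = env.reset()              # one row of observations per agent
        multi_agent.reset()               # reset each agent's noise process
        scores = np.zeros(num_agents)

        while True:
            actions = multi_agent.act(states, add_noise=True)
            next_states, rewards, dones, _ = env.step(actions)
            multi_agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break

        episode_scores.append(np.max(scores))

    return episode_scores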
Example #3
class MADDPG:
    def __init__(self, num_agents=2, random_seed=1):  #np.random.randint(1000)
        super(MADDPG, self).__init__()

        self.maddpg_agent = [
            DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed),
            DDPGAgent(24, 16, 8, 2, 52, 42, 24, random_seed)
        ]

        self.num_agents = num_agents

        # Replay memory
        action_size = 2
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, obs_all_agents, noise_ampl=1):
        """get actions from all agents in the MADDPG object"""
        actions = [
            agent.act(obs, noise_ampl)
            for agent, obs in zip(self.maddpg_agent, obs_all_agents)
        ]
        return actions

    def add_memory(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.num_agents = self.num_agents
        self.memory.add(state, action, reward, next_state, done)

    def step(self):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:

            for n in range(0, self.num_agents):

                experiences = self.memory.sample()

                self.maddpg_agent[n].step(experiences)

    def reset(self):
        for n in range(0, self.num_agents):
            self.maddpg_agent[n].reset()
Example #4
class MultiAgent:
    """Interacts with and learns from the environment."""
    def __init__(self, agent_count, state_size, action_size, random_seed):
        """Initialize a MultiAgent object.

        Params
        ======
            agent_count (int): Number of agents
        """

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        self.agents = [
            Agent(
                memory=self.memory,
                state_size=state_size,
                action_size=action_size,
                random_seed=random_seed,
            ) for _ in range(agent_count)
        ]

    def step(self, states, actions, rewards, next_states, dones, timestep):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for agent in self.agents:
                agent.learn(self.memory.sample(), GAMMA)

    def act(self, all_states):
        """Get actions from all agents"""
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.agents, all_states)
        ]
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()
Example #5
class MultiAgent:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPGAgent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
        self.t_step = 0

    def step_all(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for agent in self.agents:
                    experiences = self.memory.sample()
                    agent.learn(experiences, GAMMA)

    def act_all(self, multi_states):
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.agents, multi_states)
        ]
        return actions

    def save_weights_all(self):
        for index, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       'agent{}_checkpoint_actor.pth'.format(index + 1))
            torch.save(agent.critic_local.state_dict(),
                       'agent{}_checkpoint_critic.pth'.format(index + 1))

    def reset_all(self):
        for agent in self.agents:
            agent.reset()
Example #6
class MultiAgent:
    def __init__(self, config):

        self.random_seeds = config['random_seeds']
        self.params = config['params']
        self.memory = ReplayBuffer(self.params['action_size'],
                                   self.params['buffer_size'],
                                   self.params['batch_size'], device,
                                   self.random_seeds[0])
        self.params['memory'] = self.memory

        self.ddpg_agents = [
            Agent(self.params, self.random_seeds[i]) for i in range(2)
        ]

        self.t_step = 0

    def act(self, states):
        actions = [
            agent.act(np.expand_dims(state, axis=0))
            for agent, state in zip(self.ddpg_agents, states)
        ]
        #actions = [agent.act(states) for agent in self.ddpg_agents]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        self.t_step += 1

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        if (len(self.memory) > self.params['batch_size']) and (
                self.t_step % self.params['num_steps_per_update'] == 0):
            for agent in self.ddpg_agents:
                experiences = self.memory.sample()
                agent.learn(experiences, self.params['gamma'])

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()
Example #7
class MADDPG(object):
    """
    The main class that defines and trains all the DDPG agents.
    """
    def __init__(
        self,
        num_agents,
        state_size,
        action_size,
        buffer_size=int(1e6),
        batch_size=128,
        writer=None,
        actor_hidden_sizes=(256, 128),
        actor_lr=1e-4,
        actor_weight_decay=0.,
        critic_hidden_sizes=(256, 128),
        critic_lr=1e-3,
        critic_weight_decay=0.,
        model_folder_path=None,
    ):
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size

        self.full_state_size = num_agents * state_size
        self.full_action_size = num_agents * action_size

        # Replay memory
        self.memory = ReplayBuffer(buffer_size)

        # TensorboardX Writer
        self.writer = writer

        # Actor Network Parameters
        self.actor_hidden_sizes = actor_hidden_sizes
        self.actor_lr = actor_lr
        self.actor_weight_decay = actor_weight_decay

        # Critic Network Parameters
        self.critic_hidden_sizes = critic_hidden_sizes
        self.critic_lr = critic_lr
        self.critic_weight_decay = critic_weight_decay

        # Model Folder
        self.folder_path = Path() if model_folder_path is None else Path(
            model_folder_path)

        # MADDPG Agents
        self.agents = []
        self._init_agents()

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, noise=0.):
        return [
            agent.act(obs, noise) for agent, obs in zip(self.agents, states)
        ]

    def step(self,
             i_episode,
             states,
             actions,
             rewards,
             next_states,
             dones,
             tau=0.01,
             num_learns=1):

        # save to replay buffer
        self.memory.add(states, actions, rewards, next_states, dones)

        # train the model
        if len(self.memory) >= self.batch_size and num_learns > 0:
            actor_loss_list, critic_loss_list = [], []

            for _ in range(num_learns):  # learn multiple times at every step
                states, actions, rewards, next_states, dones = self.memory.sample(
                    self.batch_size)

                for agent_id in range(self.num_agents):
                    # Learn once for this agent
                    actor_loss, critic_loss = self._learn(
                        agent_id, states, actions, next_states, rewards, dones)

                    actor_loss_list.append(actor_loss)
                    critic_loss_list.append(critic_loss)

            # Record Losses for actor & critic
            if self.writer:
                for agent_id in range(self.num_agents):
                    self.writer.add_scalars(
                        f'agent{agent_id}/losses', {
                            'critic loss': np.mean(critic_loss_list),
                            'actor_loss': np.mean(actor_loss_list)
                        }, i_episode)

            # Soft update
            self._update_all(tau)

    def save(self):
        for agent in self.agents:
            torch.save(
                agent.actor_local.state_dict(),
                self.folder_path / f'checkpoint_actor_local_{agent.id}.pth')
            torch.save(
                agent.critic_local.state_dict(),
                self.folder_path / f'checkpoint_critic_local_{agent.id}.pth')

    def load(self, agent_id=None):
        for agent in self.agents:
            agent_id_ = agent.id if agent_id is None else agent_id
            agent.actor_local.load_state_dict(
                torch.load(self.folder_path /
                           f'checkpoint_actor_local_{agent_id_}.pth'))
            agent.critic_local.load_state_dict(
                torch.load(self.folder_path /
                           f'checkpoint_critic_local_{agent_id_}.pth'))

    def _init_agents(self):
        for i in range(self.num_agents):
            agent = DDPG(i, self.state_size, self.full_state_size,
                         self.action_size, self.full_action_size,
                         self.actor_hidden_sizes, self.actor_lr,
                         self.actor_weight_decay, self.critic_hidden_sizes,
                         self.critic_lr, self.critic_weight_decay)
            self.agents.append(agent)

    def _learn(self, agent_id, states, actions, next_states, rewards, dones):

        critic_full_actions, critic_full_next_actions = [], []
        for agent in self.agents:
            # current actions
            actor_actions = agent.actor_local(states[:, agent.id, :])
            critic_full_actions.append(actor_actions)

            # next actions
            actor_next_actions = agent.actor_target.forward(
                next_states[:, agent.id, :])
            critic_full_next_actions.append(actor_next_actions)

        # learn for the agent
        current_agent = self.agents[agent_id]
        actor_loss, critic_loss = current_agent.learn(
            states, actions, rewards, next_states, dones, critic_full_actions,
            critic_full_next_actions)
        return actor_loss, critic_loss

    def _update_all(self, tau):
        for agent in self.agents:
            agent.update(agent.actor_local, agent.actor_target, tau)
            agent.update(agent.critic_local, agent.critic_target, tau)
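The `full_state_size`/`full_action_size` bookkeeping above reflects the centralized-critic idea in MADDPG: every critic conditions on all agents' observations and actions. Here is a tiny standalone sketch (toy dimensions, not the original networks) of how per-agent tensors are flattened into those "full" inputs.

import torch

# Toy dimensions: each centralized critic sees every agent's observation and
# action concatenated into a single "full" vector.
batch_size, num_agents, state_size, action_size = 4, 2, 24, 2

states = torch.randn(batch_size, num_agents, state_size)
actions = torch.randn(batch_size, num_agents, action_size)

full_states = states.reshape(batch_size, num_agents * state_size)
full_actions = actions.reshape(batch_size, num_agents * action_size)

print(full_states.shape)   # torch.Size([4, 48])
print(full_actions.shape)  # torch.Size([4, 4])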
Example #8
def main(env_name, num_actors, num_iters, logdir, cluster):

    logdir = pathlib.Path(logdir)
    if logdir.exists():
        shutil.rmtree(logdir)

    summary_writer = tf.summary.create_file_writer(str(logdir))

    if not cluster:
        ray.init()

    epsilons = np.linspace(0.01, 0.8, num_actors)

    print("==== ACTORS launch ====")
    actors = [
        Actor.remote(pid=i, env_name=env_name, epsilon=epsilons[i])
        for i in range(num_actors)
    ]

    replaybuffer = ReplayBuffer(buffer_size=2**15)

    print("==== LEARNER launch ====")
    learner = Learner.remote(env_name=env_name)

    current_weights = ray.put(ray.get(learner.get_weights.remote()))

    print("==== TESTER launch ====")
    tester = Actor.remote(pid=None, env_name=env_name, epsilon=0.0)

    wip_actors = [actor.rollout.remote(current_weights) for actor in actors]

    n = 0

    print("==== Initialize buffer ====")
    for _ in range(50):
        finished_actor, wip_actors = ray.wait(wip_actors, num_returns=1)
        td_errors, transitions, pid = ray.get(finished_actor[0])
        replaybuffer.add(td_errors, transitions)
        wip_actors.extend([actors[pid].rollout.remote(current_weights)])
        n += 1

    minibatchs = [
        replaybuffer.sample_minibatch(batch_size=512) for _ in range(16)
    ]

    wip_learner = learner.update_network.remote(minibatchs)

    minibatchs = [
        replaybuffer.sample_minibatch(batch_size=512) for _ in range(16)
    ]

    wip_tester = tester.test_play.remote(current_weights)

    t = time.time()
    lap_count = 0

    while n <= num_iters:

        finished_actor, wip_actors = ray.wait(wip_actors,
                                              num_returns=1,
                                              timeout=0)

        if finished_actor:
            td_errors, transitions, pid = ray.get(finished_actor[0])
            replaybuffer.add(td_errors, transitions)
            wip_actors.extend([actors[pid].rollout.remote(current_weights)])
            n += 1
            lap_count += 1

        finished_learner, _ = ray.wait([wip_learner], num_returns=1, timeout=0)

        if finished_learner:

            current_weights, indices, td_errors, loss_info = ray.get(
                finished_learner[0])

            wip_learner = learner.update_network.remote(minibatchs)

            current_weights = ray.put(current_weights)

            replaybuffer.update_priority(indices, td_errors)

            minibatchs = [
                replaybuffer.sample_minibatch(batch_size=512)
                for _ in range(16)
            ]

            with summary_writer.as_default():
                tf.summary.scalar("Buffer", len(replaybuffer), step=n)
                tf.summary.scalar("loss", loss_info, step=n)
                tf.summary.scalar("lap_count", lap_count, step=n)
                tf.summary.scalar("lap_time", time.time() - t, step=n)

            t = time.time()
            lap_count = 0

        if n % 200 == 0:
            test_score = ray.get(wip_tester)
            wip_tester = tester.test_play.remote(current_weights)
            with summary_writer.as_default():
                tf.summary.scalar("test_score", test_score, step=n)
Example #9
class Agent():
    """ Class implementation of a so-called "intelligent" agent.
        This agent interacts with and learns from the environment.
    """

    double_dqn = False
    """ True for the Double-DQN method.
    """

    dueling_network = False
    """ True for the Dueling Network (DN) method.
    """

    prioritized_replay = False
    """ True for the Prioritized Replay memory buffer.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 lr_decay=0.9999,
                 double_dqn=False,
                 dueling_network=False,
                 prioritized_replay=False):
        """ Initialize an Agent instance.
        
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            lr_decay (float): Multiplicative factor of learning rate decay
            double_dqn (bool): Toggle for using the Double-DQN method
            dueling_network (bool): Toggle for using the Dueling Network (DN) method
            prioritized_replay (bool): Toggle for using the Prioritized Replay method
        """

        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.dueling_network = dueling_network
        self.prioritized_replay = prioritized_replay

        # Q-Network hidden layers.
        hidden_layers = [128, 32]

        # Use the Dueling Network (DN) method.
        if self.dueling_network:

            # DN requires a hidden state value.
            hidden_state_value = [64, 32]

            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers,
                hidden_state_value).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers,
                hidden_state_value).to(device)
            self.qnetwork_target.eval()

        else:  # Use the Deep Q-Network (DQN) method.

            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            hidden_layers).to(device)
            self.qnetwork_target.eval()

        # Optimize using Adam.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=LEARNING_RATE)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Use the Prioritized Replay memory buffer if enabled.
        if self.prioritized_replay:

            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)

        else:  # Use the Replay memory buffer instead.
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize the time step (until the THRESHOLD is reached).
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """ Update the network on each step.

        Params
        ======
            state (array_like): Current state
        """

        # Save experience in replay memory.
        self.memory.add(state, action, reward, next_state, done)

        # Learn every THRESHOLD time steps.
        self.t_step = (self.t_step + 1) % THRESHOLD

        if self.t_step == 0:  # The counter has wrapped around; time to learn.

            # If enough samples are available in memory, get random subset and learn.
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """ Return the actions for a given state as per current policy.
        
        Params
        ======
            state (array_like): Current state
            eps (float): Epsilon (ε), for epsilon-greedy action selection
        """

        # Epsilon-greedy action selection.
        if random.random() > eps:

            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            self.qnetwork_local.eval()

            with torch.no_grad():
                action_values = self.qnetwork_local(state)

            # Set the network back to training mode.
            self.qnetwork_local.train()

            # Return the action.
            return np.argmax(action_values.cpu().data.numpy())

        else:  # Return a random action.
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done, w) tuples 
            gamma (float): Discount factor
        """

        # Set the parameters.
        states, actions, rewards, next_states, dones, w = experiences

        # Compute and minimize the loss.
        with torch.no_grad():

            if self.double_dqn:  # Use of Double-DQN method.

                # Select the greedy actions using the QNetwork Local.
                # Calculate the pair action/reward for each of the next_states.
                next_action_rewards_local = self.qnetwork_local(next_states)

                # Select the action with the maximum reward for each of the next actions.
                greedy_actions_local = next_action_rewards_local.max(
                    dim=1, keepdim=True)[1]

                ## Get the rewards for the greedy actions using the QNetwork Target.
                # Calculate the pair action/reward for each of the next_states.
                next_action_rewards_target = self.qnetwork_target(next_states)

                # Get the target reward for each of the greedy actions selected,
                # following the local network.
                target_rewards = next_action_rewards_target.gather(
                    1, greedy_actions_local)

            else:  # Use of the fixed Q-target method.

                # Calculate the pair action/reward for each of the next_states.
                next_action_rewards = self.qnetwork_target(next_states)

                # Select the maximum reward for each of the next actions.
                target_rewards = next_action_rewards.max(dim=1,
                                                         keepdim=True)[0]

            # Calculate the discounted target rewards.
            target_rewards = rewards + (gamma * target_rewards * (1 - dones))

        # Calculate the pair action/rewards for each of the states.
        # Here, shape: [batch_size, action_size].
        expected_action_rewards = self.qnetwork_local(states)

        # Get the reward for each of the actions.
        # Here, shape: [batch_size, 1].
        expected_rewards = expected_action_rewards.gather(1, actions)

        # If the Prioritized Replay memory buffer is enabled.
        if self.prioritized_replay:
            target_rewards.sub_(expected_rewards)
            target_rewards.squeeze_()
            target_rewards.pow_(2)

            with torch.no_grad():
                td_error = target_rewards.detach()
                td_error.pow_(0.5)
                self.memory.update_priorities(td_error)

            target_rewards.mul_(w)
            loss = target_rewards.mean()

        else:  # Calculate the loss.
            loss = F.mse_loss(expected_rewards, target_rewards)

        # Perform the back-propagation.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters:
            θ_target = τ * θ_local + (1 - τ) * θ_target.

        Params
        ======
            local_model (PyTorch model): Weights will be copied from
            target_model (PyTorch model): Weights will be copied to
            tau (float): Interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1. - tau) * target_param.data)
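The Double-DQN branch of `learn` above decouples action selection (local network) from action evaluation (target network). Below is a toy sketch of the two target computations, with small linear layers standing in for the real Q-networks.

import torch
import torch.nn as nn

state_size, action_size, batch_size = 8, 4, 5
q_local = nn.Linear(state_size, action_size)   # stands in for qnetwork_local
q_target = nn.Linear(state_size, action_size)  # stands in for qnetwork_target

next_states = torch.randn(batch_size, state_size)

with torch.no_grad():
    # Double-DQN: pick the greedy action with the local network ...
    greedy_actions = q_local(next_states).max(dim=1, keepdim=True)[1]
    # ... then evaluate that action with the target network.
    double_dqn_targets = q_target(next_states).gather(1, greedy_actions)

    # Plain fixed-target DQN instead takes the target network's own maximum.
    dqn_targets = q_target(next_states).max(dim=1, keepdim=True)[0]

print(double_dqn_targets.shape, dqn_targets.shape)  # both torch.Size([5, 1])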
Example #10
    return sum_reward / 10


obs = env.reset()
while setps < max_steps:

    p = agents.acting.predict(np.array([obs]))
    for i in range(n_ant):
        if setps < 10000:
            p[i] = 2 * np.random.rand(n_actions) - 1
        else:
            p[i] = np.clip(p[i][0] + 0.1 * np.random.randn(n_actions), -1, 1)
    next_obs, reward, terminated, info = env.step(np.hstack(p))
    setps += 1
    ep_len += 1
    buff.add(obs, p, reward, next_obs, terminated)
    obs = next_obs

    if (terminated) | (ep_len == max_ep_len):
        obs = env.reset()
        terminated = False
        ep_len = 0

    if setps % 10000 == 0:
        print(test_agent())

    if (setps < 1000) | (setps % 50 != 0):
        continue

    for e in range(50):
        batch = buff.getBatch(batch_size)
Example #11
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, memory=None, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        if memory is not None:
            self.memory = memory
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        if add_noise:
            action += self.noise.sample()
        self.actor_local.train()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
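The `OUNoise` process the agent above samples for exploration is not included in the snippet. Below is a minimal Ornstein-Uhlenbeck sketch with commonly used parameters (`mu=0.0`, `theta=0.15`, `sigma=0.2`); this is an assumption about the helper, not the original class.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Return the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Drift toward the mean, plus scaled random noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array(
            [random.random() - 0.5 for _ in range(len(x))])
        self.state = x + dx
        return self.state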
Example #12
def train_model(context, data, training_batch):
    # Create the synthetic data class
    context.synthetic_data = SyntheticData(context=context,
                                           data=data,
                                           window=10000,
                                           frequency=30)

    # Create the model configuration along with the neural networks
    create_model(context)

    # Set up the operation summaries
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter(
        "/home/enzo/PycharmProjects/DDPGPorfolioOptimization/summaries",
        context.sess.graph)

    if os.path.exists(context.model_path):
        context.saver.restore(context.sess, context.model_path)

    # Initialize the replay memory
    replay_buffer = ReplayBuffer(context.buffer_size)
    for episode in range(context.max_episodes):
        data, close_prices = context.synthetic_data.get_trayectory(
            t_intervals=context.max_ep_steps + context.n)

        # Reset the portfolio values at the start of each episode
        context.portfolio_value_memory = []
        context.portfolio_value_memory.append(context.init_train_portfolio)
        context.train_invested_quantity = 0.0
        context.assets_quantity_invested = []
        context.portfolio_w_memory = []
        context.init_portfolio_w = []
        for i in range(len(context.assets) + 1):
            context.init_portfolio_w.append(0.0)
        context.portfolio_w_memory.append(context.init_portfolio_w)
        for i in range(len(context.assets)):
            context.assets_quantity_invested.append(0.0)
        context.train_cash = context.init_train_portfolio
        context.last_train_operation = 2
        context.open_trade = False

        ep_reward = 0
        ep_ave_max_q = 0
        ep_loss = 0

        # Subtract one to account for fetching the next state
        for i in range(context.max_ep_steps - 1):
            # Get the current state
            s = data[:, i:i + context.n, :]

            # Perturb the action to balance the exploration/exploitation trade-off
            random = np.random.rand()
            if random > context.epsilon:
                if s.shape == (len(context.assets), context.n,
                               len(context.features)):
                    a = context.actor.predict([s])[0]
                else:
                    print("Episodio:", episode, "Paso:", i,
                          "La forma del estado actual es incorrecta")
                    continue
            else:
                rand_array = np.random.rand(len(context.assets) + 1)
                a = np.exp(rand_array) / np.sum(np.exp(rand_array))
            context.epsilon = context.epsilon * context.epsilon_decay

            # Next state
            s2 = data[:, i + 1:i + 1 + context.n, :]
            if not s2.shape == (len(
                    context.assets), context.n, len(context.features)):
                print("Episodio:", episode, "Paso:", i,
                      "La forma del siguiente estado es incorrecta")
                continue

            # Reward
            this_closes = close_prices[:, i + context.n]
            previous_closes = close_prices[:, i + context.n - 1]

            r = get_reward(context, this_closes, previous_closes, a)

            # Terminal point
            if i == (context.max_ep_steps - context.n - 2):
                t = True
            else:
                t = False

            replay_buffer.add(s, a, r, t, s2)

            if replay_buffer.size() > context.minibatch_size:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    context.minibatch_size)
                # Compute the targets
                target_q = context.critic.predict_target(
                    s2_batch, context.actor.predict_target(s2_batch))
                y_i = []

                for k in range(context.minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + context.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value_batch = np.reshape(
                    y_i, (context.minibatch_size, 1))
                predicted_q_value, losses, _ = context.critic.train(
                    s_batch, a_batch, predicted_q_value_batch)

                ep_loss += np.mean(losses)
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = context.actor.predict(s_batch)
                grads = context.critic.action_gradients(s_batch, a_outs)
                context.actor.train(s_batch, grads[0])

                # Update the target networks
                context.actor.update_target_network()
                context.critic.update_target_network()

            ep_reward += r

            if i == (context.max_ep_steps - 2):
                summary_str = context.sess.run(summary_ops,
                                               feed_dict={
                                                   summary_vars[0]:
                                                   ep_reward,
                                                   summary_vars[1]:
                                                   ep_ave_max_q / float(i),
                                                   summary_vars[2]:
                                                   ep_loss / float(i)
                                               })

                writer.add_summary(summary_str, episode)
                writer.flush()

                print(
                    '| Reward: {:.5f} | Episode: {:d} | Qmax: {:.4f} | Portfolio value: {:.4f} | Epsilon: {:.5f} '
                    .format(ep_reward, episode, (ep_ave_max_q / float(i)),
                            context.portfolio_value_memory[-1],
                            context.epsilon))

        _ = context.saver.save(context.sess, context.model_path)
Example #13
class MADDPG:
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 hidden_layers,
                 seed,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE):
        """Initialize MADDPG agent."""
        super(MADDPG, self).__init__()

        self.seed = random.seed(seed)

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma, \
                                 tau, lr_actor, lr_critic, weight_decay, seed) \
                                     for _ in range(num_agents)]

        self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)

    def act(self, states):
        actions = np.zeros([self.num_agents, self.action_size])
        for index, agent in enumerate(self.agents):
            actions[index, :] = agent.act(states[index])
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        """One step for MADDPG agent, include store the current transition and update parameters."""
        self.replay_buffer.add(states, actions, rewards, next_states, dones)

        if len(self.replay_buffer) > self.batch_size:
            '''
            experiences = self.replay_buffer.sample()
            states_list, _, _, _, _ = experiences
            next_actions_list = [self.agents[idx].target_actor(states).detach() \
                for idx, states in enumerate(states_list)]
            for i in range(self.num_agents):
                self.agents[i].step_learn(experiences, next_actions_list, i)
            '''
            for agent in self.agents:
                experiences = self.replay_buffer.sample()
                agent.step_learn(experiences)

    def save_weights(self):
        for index, agent in enumerate(self.agents):
            torch.save(
                agent.critic.state_dict(),
                'agent{}_critic_trained_with_DDPG.pth'.format(index + 1))
            torch.save(agent.actor.state_dict(),
                       'agent{}_actor_trained_with_DDPG.pth'.format(index + 1))

    def reset(self):
        for agent in self.agents:
            agent.reset()
Example #14
    loss = []

    for step in range(max_steps + 1):
        # transition
        action = actor.get_action(observation, episode, mainQNet)
        next_observation, reward, done, _ = env.step(action)
        next_observation = np.reshape(next_observation, (1, input_size))

        # if terminal
        if done:
            next_observation = np.zeros_like(observation)
            if step < 195:  # failure
                reward = -1
            else:  #success
                reward = 1
            memory.add((observation, action, reward, next_observation))
            break
        else:
            reward = 0

        score += 1

        memory.add((observation, action, reward, next_observation))
        observation = next_observation

        if memory.length() > batch_size:
            loss_value = mainQNet.train(batch_size, gamma, memory, targetQNet)
            loss.append(loss_value)

    # record
    score_record.append(score)
Example #15
class Agent():
    """ Class implementation of a so-called "intelligent" agent.
        This agent interacts with and learns from the environment.
        This agent employs the DDPG algorithm to solve this problem.
    """

    # actor_local = None
    # actor_target = None
    # actor_optimizer = None
    """ Class-level Actor properties.
    """

    # critic_local = None
    # critic_target = None
    # critic_optimizer = None
    """ Class-level Critic properties.
    """

    # memory = None
    """ Class-level memory variable.
    """
    def __init__(self, state_size, action_size, seed, add_noise=True):
        """ Initialize an Agent instance.
        
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            add_noise (bool): Toggle for using the stochastic process
        """

        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Setting the Actor network (with the Target Network).
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)

        # Optimize the Actor using Adam.
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Setting the Critic network (with the Target Network).
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)

        # Optimize the Critic using Adam.
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Set up noise processing.
        if add_noise:
            self.noise = Noise((20, action_size), seed)

        # Use the Replay memory buffer (once per class).
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   device)

        # Initialize the time step (until max NUM_TIME_STEPS is reached).
        # self.t_step = 0

    def step(self, time_step, states, actions, rewards, next_states, dones):
        """ Update the network on each step.
            In other words, save the experience in replay memory,
            and then use random sampling from the buffer to learn.
        """

        # Save experience in replay memory.
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every time step till NUM_TIME_STEPS is reached.
        # if time_step % NUM_TIME_STEPS != 0:
        #     return

        # Save the experience in replay memory, then use random sampling from the buffer to learn.
        self.sample_and_learn()

    def sample_and_learn(self):
        """ For a specified number of agents,
            use random sampling from the buffer to learn.
        """

        # If enough samples are available in memory, get random subset and learn.
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

            # for _ in range(NUM_LEARN_UPDATES):
            #     experiences = Agent.memory.sample()
            #     self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """ Return the actions for a given state as per current policy.
        
        Params
        ======
            state (array_like): Current state
            add_noise (bool): Toggle for using the stochastic process
        """

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # If the stochastic process is enabled.
        if add_noise:
            action += self.noise.sample()

        # Return the action.
        return np.clip(action, -1, 1)

    def reset(self):
        """ Reset the state.
        """

        # Reset the internal state (noise) to mean (mu).
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.
            i.e.,
            Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
            where
                actor_target(state) -> action, and
                critic_target(state, action) -> Q-value.
        
        Params
        ======
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done, w) tuples 
            gamma (float): Discount factor
        """

        # Set the parameters.
        states, actions, rewards, next_states, dones = experiences
        """ Update the Critic.
        """
        # Get the predicted next-state actions and Q-values from the target models.
        # Calculate the pair action/reward for each of the next_states.
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q-targets for the current states, (y_i).
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute the Critic loss.
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        """ Update the Actor.
        """
        # Compute the Actor loss.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss.
        self.actor_optimizer.zero_grad()
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        actor_loss.backward()
        self.actor_optimizer.step()
        """ Update the target networks.
        """
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters.
            i.e.,
            θ_target = τ * θ_local + (1 - τ) * θ_target.

        Params
        ======
            local_model (PyTorch model): Weights will be copied from
            target_model (PyTorch model): Weights will be copied to
            tau (float): Interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1. - tau) * target_param.data)
Example #16
class MPOAgent:
    def __init__(self, env_id: str, logdir: Path):

        self.env_id = env_id

        self.summary_writer = tf.summary.create_file_writer(
            str(logdir)) if logdir else None

        self.action_space = gym.make(self.env_id).action_space.shape[0]

        self.replay_buffer = ReplayBuffer(maxlen=10000)

        self.policy = GaussianPolicyNetwork(action_space=self.action_space)
        self.target_policy = GaussianPolicyNetwork(
            action_space=self.action_space)

        self.critic = QNetwork()
        self.target_critic = QNetwork()

        self.log_temperature = tf.Variable(1.)

        self.log_alpha_mu = tf.Variable(1.)
        self.log_alpha_sigma = tf.Variable(1.)

        self.eps = 0.1

        self.eps_mu = 0.01
        self.eps_sigma = 0.001

        self.policy_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.critic_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.temperature_optimizer = tf.keras.optimizers.Adam(lr=0.0005)
        self.alpha_optimizer = tf.keras.optimizers.Adam(lr=0.0005)

        self.batch_size = 128

        self.n_samples = 10

        self.update_period = 4

        self.gamma = 0.99

        self.target_policy_update_period = 400

        self.target_critic_update_period = 400

        self.global_steps = 0

        self.episode_count = 0

        self.setup()

    def setup(self):
        """ Initialize network weights """

        env = gym.make(self.env_id)

        dummy_state = env.reset()
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.action_space)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.policy(dummy_state)
        self.target_policy(dummy_state)

        self.critic(dummy_state, dummy_action)
        self.target_critic(dummy_state, dummy_action)

        self.target_policy.set_weights(self.policy.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

    def save(self, save_dir):
        save_dir = Path(save_dir)

        self.policy.save_weights(str(save_dir / "policy"))
        self.critic.save_weights(str(save_dir / "critic"))

    def load(self, load_dir=None):
        load_dir = Path(load_dir)

        self.policy.load_weights(str(load_dir / "policy"))
        self.target_policy.load_weights(str(load_dir / "policy"))

        self.critic.load_weights(str(load_dir / "critic"))
        self.target_critic.load_weights(str(load_dir / "critic"))

    def rollout(self):

        episode_rewards, episode_steps = 0, 0

        done = False

        env = gym.make(self.env_id)

        state = env.reset()

        while not done:

            action = self.policy.sample_action(np.atleast_2d(state))

            action = action.numpy()[0]

            try:
                next_state, reward, done, _ = env.step(action)
            except Exception as err:
                print(err)
                import pdb
                pdb.set_trace()

            #: Clip the reward because BipedalWalker's fall penalty of -100 is too large
            transition = Transition(state, action, np.clip(reward, -1., 1.),
                                    next_state, done)

            self.replay_buffer.add(transition)

            state = next_state

            episode_rewards += reward

            episode_steps += 1

            self.global_steps += 1

            if (len(self.replay_buffer) >= 5000
                    and self.global_steps % self.update_period == 0):
                self.update_networks()

            if self.global_steps % self.target_critic_update_period == 0:
                self.target_critic.set_weights(self.critic.get_weights())

            if self.global_steps % self.target_policy_update_period == 0:
                self.target_policy.set_weights(self.policy.get_weights())

        self.episode_count += 1
        with self.summary_writer.as_default():
            tf.summary.scalar("episode_reward_stp",
                              episode_rewards,
                              step=self.global_steps)
            tf.summary.scalar("episode_steps_stp",
                              episode_steps,
                              step=self.global_steps)
            tf.summary.scalar("episode_reward",
                              episode_rewards,
                              step=self.episode_count)
            tf.summary.scalar("episode_steps",
                              episode_steps,
                              step=self.episode_count)

        return episode_rewards, episode_steps

    def update_networks(self):

        (states, actions, rewards, next_states,
         dones) = self.replay_buffer.get_minibatch(batch_size=self.batch_size)

        B, M = self.batch_size, self.n_samples

        # [B, obs_dim] -> [B, obs_dim * M] -> [B * M, obs_dim]
        next_states_tiled = tf.reshape(tf.tile(next_states, multiples=(1, M)),
                                       shape=(B * M, -1))

        target_mu, target_sigma = self.target_policy(next_states_tiled)

        # For MultivariateGaussianPolicy
        #target_dist = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=target_sigma)

        # For IndependentGaussianPolicy
        target_dist = tfd.Independent(tfd.Normal(loc=target_mu,
                                                 scale=target_sigma),
                                      reinterpreted_batch_ndims=1)

        sampled_actions = target_dist.sample()  # [B * M,  action_dim]
        #sampled_actions = tf.clip_by_value(sampled_actions, -1.0, 1.0)

        # Update Q-network:
        sampled_qvalues = tf.reshape(self.target_critic(
            next_states_tiled, sampled_actions),
                                     shape=(B, M, -1))
        mean_qvalues = tf.reduce_mean(sampled_qvalues, axis=1)
        TQ = rewards + self.gamma * (1.0 - dones) * mean_qvalues

        with tf.GradientTape() as tape1:
            Q = self.critic(states, actions)
            loss_critic = tf.reduce_mean(tf.square(TQ - Q))

        variables = self.critic.trainable_variables
        grads = tape1.gradient(loss_critic, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.critic_optimizer.apply_gradients(zip(grads, variables))

        # E-step:
        # Obtain η* by minimising g(η)  (see the standalone sketch after this class)
        with tf.GradientTape() as tape2:
            temperature = tf.math.softplus(self.log_temperature)
            q_logsumexp = tf.math.reduce_logsumexp(sampled_qvalues /
                                                   temperature,
                                                   axis=1)
            loss_temperature = temperature * (
                self.eps + tf.reduce_mean(q_logsumexp, axis=0))

        grad = tape2.gradient(loss_temperature, self.log_temperature)
        if tf.math.is_nan(grad).numpy().sum() != 0:
            print("NAN GRAD in TEMPERATURE !!!!!!!!!")
            import pdb
            pdb.set_trace()
        else:
            self.temperature_optimizer.apply_gradients([
                (grad, self.log_temperature)
            ])

        # Obtain sample-based variational distribution q(a|s)
        temperature = tf.math.softplus(self.log_temperature)

        # M-step: Optimize the lower bound J with respect to θ
        weights = tf.squeeze(tf.math.softmax(sampled_qvalues / temperature,
                                             axis=1),
                             axis=2)  # [B, M, 1] -> [B, M]

        if tf.math.is_nan(weights).numpy().sum() != 0:
            print("NAN in weights !!!!!!!!!")
            import pdb
            pdb.set_trace()

        with tf.GradientTape(persistent=True) as tape3:

            online_mu, online_sigma = self.policy(next_states_tiled)

            # For MultivariateGaussianPolicy
            #online_dist = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=online_sigma)

            # For IndependentGaussianPolicy
            online_dist = tfd.Independent(tfd.Normal(loc=online_mu,
                                                     scale=online_sigma),
                                          reinterpreted_batch_ndims=1)

            log_probs = tf.reshape(online_dist.log_prob(sampled_actions) +
                                   1e-6,
                                   shape=(B, M))  # [B * M, ] -> [B, M]

            cross_entropy_qp = tf.reduce_sum(weights * log_probs,
                                             axis=1)  # [B, M] -> [B,]

            # For MultivariateGaussianPolicy
            # online_dist_fixedmu = tfd.MultivariateNormalFullCovariance(loc=target_mu, covariance_matrix=online_sigma)
            # online_dist_fixedsigma = tfd.MultivariateNormalFullCovariance(loc=online_mu, covariance_matrix=target_sigma)

            # For IndependentGaussianPolicy
            online_dist_fixedmu = tfd.Independent(tfd.Normal(
                loc=target_mu, scale=online_sigma),
                                                  reinterpreted_batch_ndims=1)
            online_dist_fixedsigma = tfd.Independent(
                tfd.Normal(loc=online_mu, scale=target_sigma),
                reinterpreted_batch_ndims=1)

            kl_mu = tf.reshape(
                target_dist.kl_divergence(online_dist_fixedsigma),
                shape=(B, M))  # [B * M, ] -> [B, M]

            kl_sigma = tf.reshape(
                target_dist.kl_divergence(online_dist_fixedmu),
                shape=(B, M))  # [B * M, ] -> [B, M]

            alpha_mu = tf.math.softplus(self.log_alpha_mu)
            alpha_sigma = tf.math.softplus(self.log_alpha_sigma)

            loss_policy = -cross_entropy_qp  # [B,]
            loss_policy += tf.stop_gradient(alpha_mu) * tf.reduce_mean(kl_mu,
                                                                       axis=1)
            loss_policy += tf.stop_gradient(alpha_sigma) * tf.reduce_mean(
                kl_sigma, axis=1)

            loss_policy = tf.reduce_mean(loss_policy)  # [B,] -> [1]

            loss_alpha_mu = tf.reduce_mean(
                alpha_mu *
                tf.stop_gradient(self.eps_mu - tf.reduce_mean(kl_mu, axis=1)))

            loss_alpha_sigma = tf.reduce_mean(
                alpha_sigma *
                tf.stop_gradient(self.eps_sigma -
                                 tf.reduce_mean(kl_sigma, axis=1)))

            loss_alpha = loss_alpha_mu + loss_alpha_sigma

        variables = self.policy.trainable_variables
        grads = tape3.gradient(loss_policy, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.policy_optimizer.apply_gradients(zip(grads, variables))

        variables = [self.log_alpha_mu, self.log_alpha_sigma]
        grads = tape3.gradient(loss_alpha, variables)
        grads, _ = tf.clip_by_global_norm(grads, 40.)
        self.alpha_optimizer.apply_gradients(zip(grads, variables))

        del tape3

        with self.summary_writer.as_default():
            tf.summary.scalar("loss_policy",
                              loss_policy,
                              step=self.global_steps)
            tf.summary.scalar("loss_critic",
                              loss_critic,
                              step=self.global_steps)
            tf.summary.scalar("sigma",
                              tf.reduce_mean(online_sigma),
                              step=self.global_steps)
            tf.summary.scalar("kl_mu",
                              tf.reduce_mean(kl_mu),
                              step=self.global_steps)
            tf.summary.scalar("kl_sigma",
                              tf.reduce_mean(kl_sigma),
                              step=self.global_steps)
            tf.summary.scalar("temperature",
                              temperature,
                              step=self.global_steps)
            tf.summary.scalar("alpha_mu", alpha_mu, step=self.global_steps)
            tf.summary.scalar("alpha_sigma",
                              alpha_sigma,
                              step=self.global_steps)
            tf.summary.scalar("replay_buffer",
                              len(self.replay_buffer),
                              step=self.global_steps)

    def testplay(self, name, monitor_dir):

        total_rewards = []

        env = wrappers.RecordVideo(gym.make(self.env_id),
                                   video_folder=monitor_dir,
                                   step_trigger=lambda i: True,
                                   name_prefix=name)

        state = env.reset()

        done = False

        total_reward = 0

        while not done:

            action = self.policy.sample_action(np.atleast_2d(state))

            action = action.numpy()[0]

            next_state, reward, done, _ = env.step(action)

            total_reward += reward

            state = next_state

        total_rewards.append(total_reward)

        print(f"{name}", total_reward)
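
# A rough standalone sketch of the MPO E-step referred to in update_networks()
# above, written in plain NumPy/SciPy rather than TensorFlow: the temperature
# dual g(eta) is minimised to obtain eta*, and softmax(Q / eta*) gives the
# sample-based variational weights that the M-step uses to re-weight the policy
# log-probabilities. All names below (q_values, eps_kl, ...) are illustrative
# assumptions, not part of the original code.
import numpy as np
from scipy.optimize import minimize
from scipy.special import logsumexp


def mpo_e_step_sketch(q_values, eps_kl=0.1):
    """q_values: [B, M] array of Q-values for M sampled actions per state."""
    M = q_values.shape[1]

    def dual(eta):
        eta = max(float(np.atleast_1d(eta)[0]), 1e-6)
        # g(eta) = eta * eps + eta * mean_s log( (1/M) * sum_a exp(Q(s,a) / eta) )
        return eta * eps_kl + eta * np.mean(
            logsumexp(q_values / eta, axis=1) - np.log(M))

    eta_star = max(
        float(minimize(dual, x0=np.array([1.0]), method="Nelder-Mead").x[0]), 1e-6)
    # Non-parametric variational distribution q(a|s) ∝ exp(Q(s,a) / eta*)
    weights = np.exp(q_values / eta_star
                     - logsumexp(q_values / eta_star, axis=1, keepdims=True))
    return eta_star, weights  # weights: [B, M], each row sums to 1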
Ejemplo n.º 17
class Agent(object):
    def __init__(self, state_size, action_size, seed, config):
        self.state_size = state_size
        self.action_size = action_size
        self.config = config
        self.seed = random.seed(seed)

        self.local_q_net = QNetwork(state_size, action_size, seed).to(device)
        self.target_q_net = QNetwork(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.local_q_net.parameters(),
                                    lr=config["LR"])

        self.memory = ReplayBuffer(action_size, config["BUFFER_SIZE"],
                                   config["BATCH_SIZE"], seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config["UPDATE_EVERY"]

        if self.t_step == 0:
            # if agent experienced enough
            if len(self.memory) > self.config["BATCH_SIZE"]:
                experiences = self.memory.sample()
                # Learn from previous experiences
                self.learn(experiences, self.config["GAMMA"])

    def act(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_q_net.eval()
        with torch.no_grad():
            action_values = self.local_q_net(state)
        self.local_q_net.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        # Double Q-Learning: select greedy actions with the local net, evaluate them
        # with the target net (see the standalone sketch after this class)

        states, actions, rewards, next_states, dones = experiences

        # Get next action estimation with local q network
        q_targets_next_expected = self.local_q_net(next_states).detach()
        q_targets_next_expected_actions = q_targets_next_expected.max(
            1)[1].unsqueeze(1)

        # Calculate Next Targets
        q_targets_next = self.target_q_net(next_states).gather(
            1, q_targets_next_expected_actions)

        # Non over-estimated targets
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Expected value
        q_expected = self.local_q_net(states).gather(1, actions)

        loss = torch.nn.functional.mse_loss(q_expected, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.local_q_net, self.target_q_net,
                         self.config["TAU"])

    def soft_update(self, local_net, target_net, tau):
        for target_param, local_param in zip(target_net.parameters(),
                                             local_net.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
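
# For reference, a minimal sketch of the Double DQN target built in learn()
# above, written against plain PyTorch tensors; the function and tensor names
# are illustrative, not part of the original code.
import torch


def double_dqn_targets_sketch(local_q_net, target_q_net, rewards, next_states,
                              dones, gamma):
    with torch.no_grad():
        # 1) Select greedy next actions with the online (local) network ...
        best_actions = local_q_net(next_states).argmax(dim=1, keepdim=True)
        # 2) ... but evaluate them with the target network.
        next_q = target_q_net(next_states).gather(1, best_actions)
    # Bootstrapped target, zeroed out at terminal transitions.
    return rewards + gamma * next_q * (1 - dones)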
Ejemplo n.º 18
import gym
import torch

from buffer import ReplayBuffer
from model import Actor
gym.logger.set_level(40)

num_episode = 5

env = gym.make('Pendulum-v0')
buffer = ReplayBuffer(max_size=100)
actor = Actor(env.observation_space.shape[0], env.action_space.shape[0])

for e in range(num_episode):
    cumulative_reward = 0
    state = env.reset()
    for i in range(env.spec.max_episode_steps):
        action = actor(torch.FloatTensor(state)).detach().numpy()

        next_state, reward, done, info = env.step(action * env.action_space.high[0])
        buffer.add([state, next_state, reward, done])

        state = next_state

        cumulative_reward += reward
        
    print(f'Episode: {e:>3}, Reward: {cumulative_reward:>8.2f}')

print(len(buffer))
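
# The script above imports ReplayBuffer and Actor from local modules that are
# not shown. A minimal buffer sketch consistent with how it is used here
# (add() one transition, len(), a max_size cap) might look like the following;
# the sample() method is an added assumption for completeness.
import random
from collections import deque


class ReplayBufferSketch:
    def __init__(self, max_size=100):
        # A deque with maxlen evicts the oldest transitions automatically.
        self.storage = deque(maxlen=max_size)

    def add(self, transition):
        self.storage.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self.storage), min(batch_size, len(self.storage)))

    def __len__(self):
        return len(self.storage)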

Ejemplo n.º 19
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_shape, action_size, seed, cnn=False):
        """Initialize an Agent object.

        Params
        ======
            state_shape (int or tuple): shape of each state
            action_size (int): dimension of each action
            seed (int): random seed
            cnn (bool): whether to use convolutional NN
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.cnn = cnn

        if cnn:
            self.qnetwork_local = QNetworkFullyConvolutional(
                state_shape, action_size, seed).to(device)
            self.qnetwork_target = QNetworkFullyConvolutional(
                state_shape, action_size, seed).to(device)
        else:
            self.qnetwork_local = QNetworkFullyConnected(
                state_shape, action_size, seed).to(device)
            self.qnetwork_target = QNetworkFullyConnected(
                state_shape, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        if self.cnn:
            state = torch.from_numpy(state).float().to(device)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        if self.cnn:
            # Convert NHWC batches to NCHW for the convolutional network;
            # permute reorders the axes (a plain reshape would scramble the pixels).
            states = states.permute(0, 3, 1, 2)
            next_states = next_states.permute(0, 3, 1, 2)

        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Ejemplo n.º 20
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        super(MADDPG, self).__init__()

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed

        self.maddpg_agent = [
            Agent(self.state_size, self.action_size,
                  self.num_agents * self.state_size,
                  self.num_agents * self.action_size, self.random_seed)
            for i in range(self.num_agents)
        ]

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        self.noise_amplitud = 1
        self.noise_reduction = 0.9995
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        self.t_step += 1
        if len(self.memory) > BATCH_SIZE and self.t_step % UPDATE_EVERY == 0:
            # Learn, if enough samples are available in memory
            for _ in range(round(UPDATE_AMOUNT)):
                for agent in range(self.num_agents):
                    experiences = self.memory.sample()
                    self.learn(experiences, agent, GAMMA)
                self.update_targets()

    def act(self, states):
        """get actions from all agents in the MADDPG object"""
        if self.t_step < NOISE_START:
            noise_amplitud = 0
        else:
            noise_amplitud = self.noise_amplitud
            self.noise_amplitud = max(
                self.noise_amplitud * self.noise_reduction, 0.1)

        actions = np.array([
            agent.act(state, noise_amplitud)
            for agent, state in zip(self.maddpg_agent, states)
        ])

        return actions

    def target_actors(self, states):
        target_actions = torch.cat([
            agent.actor_target(states[:, i, :])
            for i, agent in enumerate(self.maddpg_agent)
        ],
                                   dim=1)
        return target_actions

    def actors(self, states):
        actions = torch.cat([
            agent.actor(states[:, i, :])
            for i, agent in enumerate(self.maddpg_agent)
        ],
                            dim=1)
        return actions

    def learn(self, experiences, agent_number, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        agent = self.maddpg_agent[agent_number]

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        target_actions_full = self.target_actors(next_states)
        next_states_full = next_states.view(-1,
                                            self.num_agents * self.state_size)
        #         target_critic_input = torch.cat((next_states_full,target_actions_full), dim = 1)

        Q_targets_next = agent.critic_target(next_states_full,
                                             target_actions_full)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards[:, agent_number].view(
            -1, 1) + (gamma * Q_targets_next *
                      (1 - dones[:, agent_number].view(-1, 1)))

        # Compute critic loss
        actions_full = actions.view(-1, self.action_size * self.num_agents)
        states_full = states.view(-1, self.num_agents * self.state_size)
        #         critic_input = torch.cat((states_full,actions_full), dim = 1)

        Q_expected = agent.critic(states_full, actions_full)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        #         critic_loss = huber_loss(Q_expected, Q_targets.detach())

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1)
        agent.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_full_pred = self.actors(states)
        #         critic_input_loss = torch.cat((states_batch, actions_full), dim = 1)
        actor_loss = -agent.critic(states_full, actions_full_pred).mean()

        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1)
        agent.actor_optimizer.step()

    def update_targets(self):
        """soft update target networks"""
        for agent in self.maddpg_agent:
            self.soft_update(agent.actor, agent.actor_target, TAU)
            self.soft_update(agent.critic, agent.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        for ddpg_agent in self.maddpg_agent:
            ddpg_agent.noise.reset()
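
# A rough sketch of how the MADDPG class above is typically driven, assuming a
# two-agent environment (Unity Tennis style) whose reset()/step() return one
# state, reward and done flag per agent; the env object and the episode loop
# below are illustrative assumptions, not part of the original code.
import numpy as np


def run_episode_sketch(env, maddpg, max_steps=1000):
    states = env.reset()                          # shape: (num_agents, state_size)
    scores = np.zeros(maddpg.num_agents)
    for _ in range(max_steps):
        actions = maddpg.act(states)              # one action per agent
        next_states, rewards, dones, _ = env.step(actions)
        maddpg.step(states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    return np.max(scores)                         # episode score = best agent's return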
Ejemplo n.º 21
    # Start collecting data
    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)
    reset = True
    duration = []
    episode_start = 0
    episode_end = 0
    for t in range(total_timesteps):
        env.render()
        update_eps = tf.constant(exploration.value(t))
        action = agent.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()  # convert the tensor to numpy for the env step
        reset = False
        new_obs, rew, done, _ = env.step(action)
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, rew, new_obs[0], float(done))

        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            episode_end = t
            duration.append(episode_end - episode_start)
            episode_start = t
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
Ejemplo n.º 22
def train(sess, env, args, actor_critic):
    sess.run(tf.global_variables_initializer())
    global_summary = tf.summary.FileWriter(
        'summaries/' + 'feeding_sac_all' +
        datetime.datetime.now().strftime('%d-%m-%y%H%M'), sess.graph)
    actor_critic.update_target_network()

    replay_buffer = ReplayBuffer(int(args['buffer_size']))
    pbar = tqdm(total=int(args['max_steps']), dynamic_ncols=True)
    tfirststart = time.perf_counter()
    total_step = 0

    while total_step < int(args['max_steps']):
        state = env.reset()
        episode_reward = 0
        end_step = 0
        while True:
            action, greedy_action = actor_critic.actor_predict([state])
            action = action[0]
            greedy_action = greedy_action[0]
            state2, reward, done, info = env.step(action)
            episode_reward += reward
            end_step += 1
            total_step += 1

            replay_buffer.add(state, action, reward, state2, done)

            state = state2

            if total_step > 100 * int(args['minibatch_size']):
                batch_state, batch_actions, batch_rewards, batch_state2, batch_dones = replay_buffer.sample(
                    int(args['minibatch_size']))
                actor_loss, critic_loss, value_loss, all_loss, _ = actor_critic.all_train(
                    batch_state, batch_state2, batch_actions, batch_rewards,
                    batch_dones)
                actor_critic.update_target_network()

                summary = tf.Summary()
                summary.value.add(tag='loss/value_loss',
                                  simple_value=value_loss)
                summary.value.add(tag='loss/critic_loss',
                                  simple_value=critic_loss)
                summary.value.add(tag='loss/actor_loss',
                                  simple_value=actor_loss)
                summary.value.add(tag='loss/total_loss', simple_value=all_loss)
                global_summary.add_summary(summary, total_step)
                global_summary.flush()

            if total_step % 1000000 == 0 and total_step != 0:
                tnow = time.perf_counter()
                print('consume time', tnow - tfirststart)
                savepath = osp.join("my_model_sac/", '%.5i' % total_step)
                os.makedirs(savepath, exist_ok=True)
                savepath = osp.join(savepath, 'sacmodel')
                print('Saving to', savepath)
                save_state(savepath)

            if done:
                success_time = env.success_time()
                fall_time = env.fall_times()
                msg = 'step: {},episode reward: {},episode len: {},success_time: {},fall_time: {}'
                pbar.update(total_step)
                pbar.set_description(
                    msg.format(total_step, episode_reward, end_step,
                               success_time, fall_time))
                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward',
                                  simple_value=episode_reward)
                summary.value.add(tag='Perf/episode_len',
                                  simple_value=end_step)
                summary.value.add(tag='Perf/success_time',
                                  simple_value=success_time)
                summary.value.add(tag='Perf/fall_time', simple_value=fall_time)
                global_summary.add_summary(summary, total_step)
                global_summary.flush()
                break
Ejemplo n.º 23
class MADDPG():
    """Agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self, action_size=2, n_agents=2, seed=0):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of agents
        """

        self.n_agents = n_agents
        self.t_step = 0
        self.noise_on = True

        # create two agents, each with their own actor and critic
        models = [
            model.Actor_Critic_Models(n_agents=n_agents)
            for _ in range(n_agents)
        ]
        self.agents = [DDPG(i, models[i]) for i in range(n_agents)]

        # create shared replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        self.t_step = self.t_step + 1
        if self.t_step % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, GAMMA)

    def act(self, all_states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, add_noise=self.noise_on)
            #self.noise_weight *= noise_decay
            all_actions.append(action)
        return np.array(all_actions).reshape(
            1, -1)  # reshape 2x2 into 1x4 dim vector

    def learn(self, experiences, gamma):
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            # extract agent i's state and get action via actor network
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            # extract agent i's next state and get action via target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)

    def save_agents(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoint_actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoint_critic_agent_{i}.pth")
Ejemplo n.º 24
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 lr=1e-3,
                 update_every=4,
                 batch_size=4,
                 buffer_size=64,
                 gamma=0.0994,
                 tau=1e-3,
                 model_path="model.pth"):
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        print("=== AGENT ===")
        print(f"Created agent on device: {self.device}")

        self.model_path = model_path
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # network variables
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.load()

        # Control variables
        self.memory = ReplayBuffer(action_size, buffer_size, self.batch_size,
                                   seed, self.device)
        self.t_step = 0

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss and backpropagate
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def save(self):
        torch.save(self.qnetwork_local.state_dict(), self.model_path)
        torch.save(self.qnetwork_target.state_dict(),
                   self.model_path.replace('.pth', '_target.pth'))
        print("Saved agent model.")

    def load(self):
        if (os.path.isfile(self.model_path)):
            self.qnetwork_local.load_state_dict(torch.load(self.model_path))
            self.qnetwork_target.load_state_dict(
                torch.load(self.model_path.replace('.pth', '_target.pth')))
            print(f"Loaded agent model: {self.model_path}")
Ejemplo n.º 25
class MADDPG_Trainer:
    def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
        self.args = args
        self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_agents = n_agents
        self.act_spcs = act_spcs
        self.ob_spcs = ob_spcs
        self.agents = [
            DDPG_agent(self.act_spcs[i], self.ob_spcs[i], np.sum(self.ob_spcs),
                       np.sum(self.act_spcs)) for i in range(n_agents)
        ]

        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
        self.criterion = nn.MSELoss()

    def get_actions(self, states):
        return [
            agent.select_action(state)[0]
            for agent, state in zip(self.agents, states)
        ]

    def store_transitions(self, states, actions, rewards, next_states, dones):
        self.memory.add(states, actions, rewards, next_states, dones)

    def reset(self):
        pass

    def transform_states(self, states, N):
        obses = []
        for i in range(N):
            states_ = []
            for j in range(self.n_agents):
                states_.append(states[j][i])
            obses.append(torch.cat([f.float().to(device) for f in states_]))
        return torch.stack(obses)

    def transform_actions(self, actions, N):
        acts = []
        for i in range(N):
            actions_ = []
            for j in range(self.n_agents):
                actions_.append(actions[j][i])
            acts.append(torch.cat([f.float().to(device) for f in actions_]))
        return torch.stack(acts)

    def update_all_targets(self):
        for agent in self.agents:
            soft_update(agent.policy_targ, agent.policy, TAU)
            soft_update(agent.qnet_targ, agent.qnet, TAU)

    def prep_training(self):
        for agent in self.agents:
            agent.qnet.train()
            agent.policy.train()
            agent.qnet_targ.train()
            agent.policy_targ.train()

    def eval(self):
        for agent in self.agents:
            agent.qnet.eval()
            agent.policy.eval()
            agent.qnet_targ.eval()
            agent.policy_targ.eval()

    def sample_and_train(self, batch_size):
        # TODO ADD Model saving, optimize code
        batch = self.memory.sample(min(batch_size, len(self.memory)))

        states_i, actions_i, rewards_i, next_states_i, dones_i = batch

        states_all = torch.cat(states_i, 1)
        next_states_all = torch.cat(next_states_i, 1)
        actions_all = torch.cat(actions_i, 1)

        for i, agent in enumerate(self.agents):
            next_actions_all = [
                onehot_from_logits(ag.policy_targ(next_state))
                for ag, next_state in zip(self.agents, next_states_i)
            ]
            # computing target
            total_obs = torch.cat(
                [next_states_all,
                 torch.cat(next_actions_all, 1)], 1)
            target_q = self.agents[i].qnet_targ(total_obs).detach()
            rewards = rewards_i[i].view(-1, 1)
            dones = dones_i[i].view(-1, 1)
            target_q = rewards + (1 - dones) * GAMMA * target_q

            # computing the inputs
            input_q = self.agents[i].qnet(
                torch.cat([states_all, actions_all], 1))
            self.agents[i].q_optimizer.zero_grad()
            loss = self.criterion(input_q, target_q.detach())
            # print("LOSS", loss)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.agents[i].qnet.parameters(),
                                           0.5)
            self.agents[i].q_optimizer.step()
            actor_loss = 0
            # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø

            # use the straight-through Gumbel-Softmax trick (hard one-hot forward,
            # soft gradients backward; see the standalone sketch after this class)
            policy_out = self.agents[i].policy(states_i[i])
            gumbel_sample = gumbel_softmax(policy_out, hard=True)

            actions_curr_pols = [
                onehot_from_logits(agent_.policy(state))
                for agent_, state in zip(self.agents, states_i)
            ]

            for action_batch in actions_curr_pols:
                action_batch.detach_()
            actions_curr_pols[i] = gumbel_sample

            actor_loss = -self.agents[i].qnet(
                torch.cat(
                    [states_all.detach(),
                     torch.cat(actions_curr_pols, 1)], 1)).mean()
            actor_loss += (policy_out**2).mean() * 1e-3

            self.agents[i].p_optimizer.zero_grad()
            actor_loss.backward()
            # nn.utils.clip_grad_norm_(self.policy.parameters(), 5)
            torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(),
                                           0.5)
            self.agents[i].p_optimizer.step()
            # detach the forward propagated action samples
            actions_i[i].detach_()

            if self.args.use_writer:
                self.writer.add_scalars("Agent_%i" % i, {
                    "vf_loss": loss,
                    "actor_loss": actor_loss
                }, self.n_updates)

        self.update_all_targets()
        self.n_updates += 1
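
# The trainer above relies on onehot_from_logits() and gumbel_softmax() helpers
# that are not included in this snippet. A minimal sketch of the
# straight-through Gumbel-Softmax it refers to (hard one-hot on the forward
# pass, gradients of the soft relaxation on the backward pass) is given below;
# the names and signatures are assumptions, not the original helpers.
import torch
import torch.nn.functional as F


def sample_gumbel(shape, eps=1e-20):
    # Gumbel(0, 1) noise via the inverse-CDF transform of uniform samples.
    u = torch.rand(shape)
    return -torch.log(-torch.log(u + eps) + eps)


def gumbel_softmax_sketch(logits, temperature=1.0, hard=True):
    # Differentiable relaxation of sampling from a categorical distribution.
    y_soft = F.softmax((logits + sample_gumbel(logits.shape)) / temperature, dim=-1)
    if not hard:
        return y_soft
    # Straight-through estimator: discrete one-hot forward, soft gradient backward.
    index = y_soft.argmax(dim=-1, keepdim=True)
    y_hard = torch.zeros_like(y_soft).scatter_(-1, index, 1.0)
    return (y_hard - y_soft).detach() + y_soft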
Ejemplo n.º 26
class MADDPG_Trainer:

    def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
        self.args = args
        self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
        # self.memory = ReplayMemory(args.buffer_length, n_agents, device)
        self.use_maddpg = args.algo == "maddpg"
        self.use_sac = args.use_sac
        self.use_td3 = args.use_td3
        self.use_single_q = args.single_q
        self.all_obs = args.all_obs
        self.n_agents = n_agents
        self.act_spcs = act_spcs
        self.ob_spcs = ob_spcs
        qnet_actspcs = [np.sum(self.act_spcs) if self.use_maddpg else self.act_spcs[i]
                        for i in range(n_agents)]
        qnet_obspcs = [np.sum(self.ob_spcs) if self.use_maddpg else self.ob_spcs[i]
                        for i in range(n_agents)]
        if self.use_sac and not self.use_td3:
            self.agents = [SAC_agent(self.act_spcs[i], qnet_obspcs[i] if self.all_obs
                                     else self.ob_spcs[i], qnet_obspcs[i],
                           qnet_actspcs[i]) for i in range(n_agents)]
        elif self.use_td3:
            self.agents = [TD3_agent(self.act_spcs[i], qnet_obspcs[i] if self.all_obs
                                      else self.ob_spcs[i], qnet_obspcs[i],
                           qnet_actspcs[i]) for i in range(n_agents)]
        else:
            self.agents = [DDPG_agent(self.act_spcs[i], qnet_obspcs[i] if self.all_obs
                                      else self.ob_spcs[i], qnet_obspcs[i],
                           qnet_actspcs[i]) for i in range(n_agents)]
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
        self.criterion = nn.MSELoss()
        self.sac_alpha = args.sac_alpha
        self.agent_actions = [[] for i in range(self.n_agents)]

    def plot_actions(self):
        for i in range(self.n_agents):
            sns.distplot(self.agent_actions[i], bins=self.agents[i].act_sp, kde=False)
            # __import__('ipdb').set_trace()
            plt.show()

    def get_actions(self, states):
        result = []
        # with torch.no_grad():
        for i, (agent, state) in enumerate(zip(self.agents, states)):
            action = agent.select_action(state)[0]
            result.append(action)
            # if self.args.use_writer: self.agent_actions[i].append(np.argmax(action.cpu()).item())
        self.n_steps += 1
        return result


    def store_transitions(self, states, actions, rewards, next_states, dones):
        # print(sys.getsizeof(states) + sys.getsizeof(actions) + sys.getsizeof(rewards)
        #       + sys.getsizeof(next_states) + sys.getsizeof(dones))
        self.memory.add(states, actions, rewards, next_states, dones)

    def reset(self):
        pass

    def transform_states(self, states, N):
        obses = []
        for i in range(N):
            states_ = []
            for j in range(self.n_agents):
                states_.append(states[j][i])
            obses.append(torch.cat([f.float().to(device) for f in states_]))
        return torch.stack(obses)

    def transform_actions(self, actions, N):
        acts = []
        for i in range(N):
            actions_ = []
            for j in range(self.n_agents):
                actions_.append(actions[j][i])
            acts.append(torch.cat([f.float().to(device) for f in actions_]))
        return torch.stack(acts)

    def update_all_targets(self):
        for agent in self.agents:
            agent.update_targets(TAU)

    def prep_training(self):
        for agent in self.agents:
            agent.set_train() 

    def eval(self):
        for agent in self.agents:
            agent.set_eval()

    def sample_and_train_td3(self, batch_size):
        t = self.n_steps
        # print(self.n_steps) 
        update_every = self.agents[0].update_every
        update_after = self.agents[0].update_after
        if (t + 1) > update_after and (t + 1) % update_every == 0:
            for _ in range(update_every):
                self.train_td3(batch_size)

    def batch_add_random_acts(self, tensor, ag_i):
        # __import__('ipdb').set_trace()
        n_clip = self.agents[ag_i].target_noise_clip
        noise = (self.agents[ag_i].target_noise**0.5)*torch.randn(tensor.shape)
        noise = torch.clamp(noise, -n_clip, n_clip)
        tensor[:] = tensor[:] + noise
        # __import__('ipdb').set_trace()

    def train_td3(self, batch_size):
        self.n_updates += 1
        batch = self.memory.sample(min(batch_size, len(self.memory)))
        states_i, actions_i, rewards_i, next_states_i, dones_i = batch
        # __import__('ipdb').set_trace()
        if self.use_maddpg:
            states_all = torch.cat(states_i, 1)
            next_states_all = torch.cat(next_states_i, 1)
            actions_all = torch.cat(actions_i, 1)
        for i, agent in enumerate(self.agents):
            # print("training_qnet")
            if not self.use_maddpg:
                states_all = states_i[i]
                next_states_all = next_states_i[i]
                actions_all = actions_i[i]
            if self.use_maddpg:  
                next_actions_all = [ag.policy(next_state)
                                    for ag, next_state in zip(self.agents, next_states_i)]

                [self.batch_add_random_acts(e, i) for i, e in enumerate(next_actions_all)]
                next_actions_all = [onehot_from_logits(e) for e in next_actions_all]
            else:
                actions_and_logits = [onehot_from_logits(agent.policy(next_states_i[i]))]
                next_actions_all = [e[0] for e in actions_and_logits]
            total_obs = torch.cat([next_states_all, torch.cat(next_actions_all, 1)], 1)
            qnet_targs = []
            for qnet in self.agents[i].qnet_targs:
                qnet_targs.append(qnet(total_obs).detach())
            rewards = rewards_i[i].view(-1, 1)
            dones = dones_i[i].view(-1, 1)
            qnet_mins = torch.min(qnet_targs[0], qnet_targs[1])
            target_q = rewards + (1 - dones) * GAMMA * (qnet_mins)
            losses = []
            for j, qnet in enumerate(self.agents[i].qnets):
                input_q = qnet(torch.cat([states_all, actions_all], 1))
                self.agents[i].q_optimizers[j].zero_grad()
                loss = self.criterion(input_q, target_q.detach())
                losses.append(loss.item())
                loss.backward()
                # torch.nn.utils.clip_grad_norm_(qnet.parameters(), 0.5)
                self.agents[i].q_optimizers[j].step()


            if self.args.use_writer:
                self.writer.add_scalar(f"Agent_{i}: q_net_loss: ", np.mean(losses), self.n_updates)
        if self.n_updates % 2 == 0:
            for i in range(self.n_agents):
                # print("training policy")
                actor_loss = 0
                # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø
                # use gumbel softmax max temp trick
                policy_out = self.agents[i].policy(states_i[i])
                gumbel_sample = gumbel_softmax(policy_out, hard=True)
                if self.use_maddpg:
                    actions_curr_pols = [onehot_from_logits(agent_.policy(state))
                                         for agent_, state in zip(self.agents, states_i)]

                    for action_batch in actions_curr_pols:
                        action_batch.detach_()
                    actions_curr_pols[i] = gumbel_sample
                    actor_loss = - self.agents[i].qnets[0](torch.cat([states_all.detach(),
                                                       torch.cat(actions_curr_pols, 1)], 1)).mean()
                else:
                    actor_loss = - self.agents[i].qnets[0](torch.cat([states_all.detach(),
                                                       gumbel_sample], 1)).mean()
                self.agents[i].p_optimizer.zero_grad()
                actor_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5)
                self.agents[i].p_optimizer.step()
                actions_i[i].detach_()
                if self.args.use_writer:
                    self.writer.add_scalar(f"Agent_{i}: policy_objective: ", actor_loss.item(), self.n_updates)
                self.update_all_targets()
        # self.n_updates += 1

    def sample_and_train_sac(self, batch_size):
        # TODO ADD Model saving, optimize code
        batch = self.memory.sample(min(batch_size, len(self.memory)))
        states_i, actions_i, rewards_i, next_states_i, dones_i = batch
        # __import__('ipdb').set_trace()        
        if self.use_maddpg:
            states_all = torch.cat(states_i, 1)
            next_states_all = torch.cat(next_states_i, 1)
            actions_all = torch.cat(actions_i, 1)
        for i, agent in enumerate(self.agents):
            if not self.use_maddpg:
                states_all = states_i[i]
                next_states_all = next_states_i[i]
                actions_all = actions_i[i]
            if self.use_maddpg:  
                actions_and_logits = [onehot_from_logits(ag.policy(next_state), logprobs=True)
                                    for ag, next_state in zip(self.agents, next_states_i)]

                next_actions_all = [e[0] for e in actions_and_logits]
                next_logits_all = [self.sac_alpha*e[1] for e in actions_and_logits]
                # __import__('ipdb').set_trace()
            else:
                actions_and_logits = [onehot_from_logits(agent.policy(next_states_i[i]),
                                                       logprobs=True)]
                next_actions_all = [e[0] for e in actions_and_logits]
                next_logits_all = [self.sac_alpha*e[1] for e in actions_and_logits]
                
            # computing target
            total_obs = torch.cat([next_states_all, torch.cat(next_actions_all, 1)], 1)
            
            # target_q = self.agents[i].qnet_targ(total_obs).detach()
            qnet_targs = []
            for qnet in self.agents[i].qnet_targs:
                qnet_targs.append(qnet(total_obs).detach())
            rewards = rewards_i[i].view(-1, 1)
            dones = dones_i[i].view(-1, 1)
            qnet_mins = torch.min(qnet_targs[0], qnet_targs[1])
            # __import__('ipdb').set_trace()
            logits_idx = i if self.use_maddpg else 0
            logits_agent = next_logits_all[logits_idx]
            # if len(qnet_mins.squeeze(-1)) != len(logits_agent.squeeze(-1)):
            #     __import__('ipdb').set_trace()
            target_q = rewards + (1 - dones) * GAMMA * (qnet_mins -
                                     logits_agent.reshape(qnet_mins.shape))
            # __import__('ipdb').set_trace()
            # computing the inputs
            for j, qnet in enumerate(self.agents[i].qnets):
                input_q = qnet(torch.cat([states_all, actions_all], 1))
                self.agents[i].q_optimizers[j].zero_grad()
                # print("----")
                # __import__('ipdb').set_trace() 
                loss = self.criterion(input_q, target_q.detach())
                # print('after')
                loss.backward()
                torch.nn.utils.clip_grad_norm_(qnet.parameters(), 0.5)
                self.agents[i].q_optimizers[j].step()

            # __import__('ipdb').set_trace()
            actor_loss = 0
            # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø
            # use gumbel softmax max temp trick
            policy_out = self.agents[i].policy(states_i[i])
            gumbel_sample, act_logprobs = gumbel_softmax(policy_out, hard=True, logprobs=True)
            act_logprobs = self.sac_alpha*act_logprobs
            # __import__('ipdb').set_trace() 
            if self.use_maddpg:
                with torch.no_grad():
                    actions_curr_pols = [onehot_from_logits(agent_.policy(state))
                                         for agent_, state in zip(self.agents, states_i)]
                actions_curr_pols[i] = gumbel_sample
                total_obs = torch.cat([states_all, torch.cat(actions_curr_pols, 1)], 1)
                qnet_outs = []
                for qnet in self.agents[i].qnets:
                    qnet_outs.append(qnet(total_obs))
                qnet_mins = torch.min(qnet_outs[0], qnet_outs[1])
                actor_loss = - qnet_mins.mean()
                # __import__('ipdb').set_trace()
            else:
                # actor_loss = - self.agents[i].qnet(torch.cat([states_all.detach(),
                #                                    gumbel_sample], 1)).mean()
                # actions_curr_pols[i] = gumbel_sample
                # __import__('ipdb').set_trace()
                total_obs = torch.cat([states_all, gumbel_sample], 1)
                qnet_outs = []
                for qnet in self.agents[i].qnets:
                    qnet_outs.append(qnet(total_obs))
                qnet_mins = torch.min(qnet_outs[0], qnet_outs[1])
                actor_loss = - qnet_mins.mean()
            # actor_loss += (policy_out**2).mean() * 1e-3

            self.agents[i].p_optimizer.zero_grad()
            actor_loss.backward()
            # nn.utils.clip_grad_norm_(self.policy.parameters(), 5)
            # torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5)
            self.agents[i].p_optimizer.step()
            # detach the forward propagated action samples
            actions_i[i].detach_()
            # __import__('ipdb').set_trace()
            if self.args.use_writer:
                self.writer.add_scalars("Agent_%i" % i, {
                    "vf_loss": loss,
                    "actor_loss": actor_loss
                }, self.n_updates)
        
        self.update_all_targets()
        self.n_updates += 1

    def sample_and_train(self, batch_size):
        return  # early return: the vanilla DDPG/MADDPG update below is disabled in this variant
        # TODO ADD Model saving, optimize code
        batch = self.memory.sample(min(batch_size, len(self.memory)))
        states_i, actions_i, rewards_i, next_states_i, dones_i = batch
        # __import__('ipdb').set_trace()        
        if self.use_maddpg:
            states_all = torch.cat(states_i, 1)
            next_states_all = torch.cat(next_states_i, 1)
            actions_all = torch.cat(actions_i, 1)
        for i, agent in enumerate(self.agents):
            if not self.use_maddpg:
                states_all = states_i[i]
                next_states_all = next_states_i[i]
                actions_all = actions_i[i]
            if self.use_maddpg:  
                next_actions_all = [onehot_from_logits(ag.policy_targ(next_state))
                                    for ag, next_state in zip(self.agents, next_states_i)]
            else:
                next_actions_all = [onehot_from_logits(agent.policy_targ(next_states_i[i]))]
            # computing target
            total_obs = torch.cat([next_states_all, torch.cat(next_actions_all, 1)], 1)
            target_q = self.agents[i].qnet_targ(total_obs).detach()
            rewards = rewards_i[i].view(-1, 1)
            dones = dones_i[i].view(-1, 1)
            target_q = rewards + (1 - dones) * GAMMA * target_q

            # computing the inputs
            input_q = self.agents[i].qnet(torch.cat([states_all, actions_all], 1))
            self.agents[i].q_optimizer.zero_grad()
            loss = self.criterion(input_q, target_q.detach())
            # print("LOSS", loss)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.agents[i].qnet.parameters(), 0.5)
            self.agents[i].q_optimizer.step()
            actor_loss = 0
            # ACTOR gradient ascent of Q(s, π(s | ø)) with respect to ø
            # use gumbel softmax max temp trick
            policy_out = self.agents[i].policy(states_i[i])
            gumbel_sample = gumbel_softmax(policy_out, hard=True)
            if self.use_maddpg:
                actions_curr_pols = [onehot_from_logits(agent_.policy(state))
                                     for agent_, state in zip(self.agents, states_i)]

                for action_batch in actions_curr_pols:
                    action_batch.detach_()
                actions_curr_pols[i] = gumbel_sample

                actor_loss = - self.agents[i].qnet(torch.cat([states_all.detach(),
                                                   torch.cat(actions_curr_pols, 1)], 1)).mean()
            else:
                actor_loss = - self.agents[i].qnet(torch.cat([states_all.detach(),
                                                   gumbel_sample], 1)).mean()
            actor_loss += (policy_out**2).mean() * 1e-3

            self.agents[i].p_optimizer.zero_grad()
            actor_loss.backward()
            # nn.utils.clip_grad_norm_(self.policy.parameters(), 5)
            torch.nn.utils.clip_grad_norm_(self.agents[i].policy.parameters(), 0.5)
            self.agents[i].p_optimizer.step()
            # detach the forward propagated action samples
            actions_i[i].detach_()
            # __import__('ipdb').set_trace()
            if self.args.use_writer:
                self.writer.add_scalars("Agent_%i" % i, {
                    "vf_loss": loss,
                    "actor_loss": actor_loss
                }, self.n_updates)
        
        self.update_all_targets()
        self.n_updates += 1
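
# For reference, a compact sketch of the clipped double-Q (TD3-style) target
# that train_td3() above assembles across its maddpg / single-agent branches,
# written in the standard continuous-action form for one agent with two target
# critics (the trainer above adapts it to one-hot actions via
# onehot_from_logits); every name here is illustrative.
import torch


def td3_target_sketch(rewards, dones, next_states, policy_targ,
                      qnet_targ1, qnet_targ2,
                      gamma=0.95, noise_std=0.2, noise_clip=0.5):
    with torch.no_grad():
        next_actions = policy_targ(next_states)
        # Target policy smoothing: add clipped Gaussian noise to the target action.
        noise = torch.clamp(noise_std * torch.randn_like(next_actions),
                            -noise_clip, noise_clip)
        next_actions = next_actions + noise
        # Clipped double-Q: bootstrap from the smaller of the two target critics.
        q_in = torch.cat([next_states, next_actions], dim=1)
        q_min = torch.min(qnet_targ1(q_in), qnet_targ2(q_in))
        return rewards + gamma * (1 - dones) * q_min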
Ejemplo n.º 27
class MADDPGAgent:
    """Interacts and learns from the environment using multiple DDPG agents"""
    def __init__(self):
        """Initialize a MADDPG Agent object."""
        super(MADDPGAgent, self).__init__()
        self.config = Config.getInstance()
        self.action_num = self.config.action_size * self.config.num_agents
        self.t_step = 0

        self.maddpg_agent = [
            DDPGAgent() for _ in range(self.config.num_agents)
        ]

        self.memory = ReplayBuffer()

    def get_actors(self):
        """get actors of all the agents in the MADDPG object"""
        actors = [ddpg_agent.actor for ddpg_agent in self.maddpg_agent]
        return actors

    # def get_target_actors(self):
    #     """get target_actors of all the agents in the MADDPG object"""
    #     target_actors = [
    #         ddpg_agent.target_actor for ddpg_agent in self.maddpg_agent]
    #     return target_actors

    def act(self, obs_all_agents, noise=0.0):
        """get actions from all agents in the MADDPG object"""
        actions = [
            agent.act(obs, noise)
            for agent, obs in zip(self.maddpg_agent, obs_all_agents)
        ]
        return np.concatenate(actions)

    def update_act(self, obs_all_agents, agent_num, noise_decay_parameter=0.0):
        """
        get target network actions from all the agents in the MADDPG object
        """
        actions_ = []
        for a_i, ddpg_agent in enumerate(self.maddpg_agent):
            obs = obs_all_agents[:, a_i, :].to(self.config.device)
            acn = ddpg_agent.actor(
                obs) + noise_decay_parameter * ddpg_agent.noise.sample()
            if a_i != agent_num:
                acn = acn.detach()
            actions_.append(acn)
        return actions_

    def target_act(self, obs_all_agents, noise=0.0):
        """
        get target network actions from all the agents in the MADDPG object
        """
        target_actions = [
            ddpg_agent.target_act(obs_all_agents[:, a_i, :], noise)
            for a_i, ddpg_agent in enumerate(self.maddpg_agent)
        ]
        return target_actions

    def step(self, _states, _actions, _rewards, _next_states, _dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        states_full = np.reshape(_states, newshape=(-1))
        next_states_full = np.reshape(_next_states, newshape=(-1))
        self.memory.add(_states, states_full, _actions, _rewards, _next_states,
                        next_states_full, _dones)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.config.update_every

        if self.t_step == 0:
            if len(self.memory) > self.config.batch_size:
                for a_i in range(self.config.num_agents):
                    samples = self.memory.sample()
                    self.update(samples, a_i)
                self.update_targets()

    def update_critic(self, samples, agent_number):
        """Update critic weights"""
        states, states_full, actions, rewards, next_states, next_states_full, dones = samples
        agent = self.maddpg_agent[agent_number]
        agent.critic_optimizer.zero_grad()
        # ---------------------------- update critic ---------------------- #
        actions_next = self.target_act(next_states)
        actions_next = torch.cat(actions_next, dim=1)

        Q_target_next = agent.target_critic(next_states_full, actions_next)
        Q_targets = rewards[:, agent_number].view(-1, 1) + self.config.gamma * \
            Q_target_next * (1 - dones[:, agent_number].view(-1, 1))
        Q_expected = agent.critic(states_full,
                                  actions.reshape(-1, self.action_num))
        # detach the TD target so no gradient flows back into the target networks
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        critic_loss.backward()
        agent.critic_optimizer.step()

    def update_actor(self, samples, agent_number):
        """Update actor weights"""
        states, states_full, actions, rewards, next_states, next_states_full, dones = samples
        agent = self.maddpg_agent[agent_number]

        agent.actor_optimizer.zero_grad()
        actions_pred = self.update_act(states, agent_number)
        actions_pred = torch.cat(actions_pred, dim=1)
        actor_loss = -agent.critic(states_full, actions_pred).mean()
        actor_loss.backward()
        agent.actor_optimizer.step()

    def update(self, samples, agent_number):
        """update the critics and actors of all the agents """
        # ---------------------------- update critic ---------------------- #
        self.update_critic(samples, agent_number)

        # ---------------------------- update actor ------------------------- #
        self.update_actor(samples, agent_number)

    def update_targets(self):
        """soft update targets"""
        for ddpg_agent in self.maddpg_agent:
            soft_update(ddpg_agent.target_actor, ddpg_agent.actor,
                        self.config.tau)
            soft_update(ddpg_agent.target_critic, ddpg_agent.critic,
                        self.config.tau)

    def reset(self):
        """Resets weight of all agents"""
        for ddpg_agent in self.maddpg_agent:
            ddpg_agent.reset()
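
# The ReplayBuffer above is constructed with no arguments and reads its sizes from the
# Config singleton; its implementation is not part of this example. Below is a minimal
# sketch (an assumption, not the original code) that exposes the interface MADDPGAgent
# relies on, with explicit buffer_size / batch_size / device arguments.
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience",
                        ["states", "states_full", "actions", "rewards",
                         "next_states", "next_states_full", "dones"])


class ReplayBuffer:
    def __init__(self, buffer_size=int(1e5), batch_size=128, device="cpu"):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device

    def add(self, states, states_full, actions, rewards, next_states,
            next_states_full, dones):
        self.memory.append(Experience(states, states_full, actions, rewards,
                                      next_states, next_states_full, dones))

    def sample(self):
        # Uniformly sample a batch and stack each field into a float tensor.
        batch = random.sample(self.memory, k=self.batch_size)

        def to_tensor(field):
            return torch.from_numpy(
                np.array([getattr(e, field) for e in batch])).float().to(self.device)

        return tuple(to_tensor(field) for field in Experience._fields)

    def __len__(self):
        return len(self.memory)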
Example No. 28
class DDPGAgent:
    def __init__(self, state_size, action_size, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed

        # ------------------ actor ------------------ #
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)

        # ------------------ critic ----------------- #
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)

        # ------------------ optimizers ------------- #
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        # Replay Buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
		θ_target = τ*θ_local + (1 - τ)*θ_target

		Params
		======
			local_model: PyTorch model (weights will be copied from)
			target_model: PyTorch model (weights will be copied to)
			tau (float): interpolation parameter 
		"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.actor_local.state_dict(), 'actor_checkpoint_actor.pth')
        torch.save(self.critic_local.state_dict(),
                   'critic_checkpoint_critic.pth')
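
# Hypothetical counterpart to save_weights() above; it is not part of the original
# example. It restores the two checkpoint files save_weights() writes, with
# map_location keeping the load CPU-safe when no GPU is available.
import torch


def load_weights(agent, map_location='cpu'):
    agent.actor_local.load_state_dict(
        torch.load('actor_checkpoint_actor.pth', map_location=map_location))
    agent.critic_local.load_state_dict(
        torch.load('critic_checkpoint_critic.pth', map_location=map_location))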
Example No. 29
class MADDPG(MultiAgentAlgorithm):
    def __init__(self, action_size, n_agents, seed, state_size):
        super().__init__(action_size, n_agents, seed)

        # critic input = all agents' observations and actions:
        # (state_size + action_size) * n_agents
        self.agents = [
            DDPGAgent(state_size, ACTOR_FC1_UNITS, ACTOR_FC2_UNITS,
                      action_size, (state_size + action_size) * n_agents,
                      CRITIC_FC1_UNITS, CRITIC_FC2_UNITS, LR_ACTOR, LR_CRITIC,
                      WEIGHT_DECAY_ACTOR, WEIGHT_DECAY_CRITIC)
            for i in range(n_agents)
        ]
        self.n_agents = n_agents
        self.epsilon = 0
        self.iter = 0
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

    def save_model(self, model_file):
        """Save networks and all other model parameters
        
        Params
        ======
            model_file (string): name of the file that will store the model
        """
        checkpoint = {
            'actor_local1': self.agents[0].actor.state_dict(),
            'critic_local1': self.agents[0].critic.state_dict(),
            'actor_target1': self.agents[0].target_actor.state_dict(),
            'critic_target1': self.agents[0].target_critic.state_dict(),
            'actor_local2': self.agents[1].actor.state_dict(),
            'critic_local2': self.agents[1].critic.state_dict(),
            'actor_target2': self.agents[1].target_actor.state_dict(),
            'critic_target2': self.agents[1].target_critic.state_dict()
        }

        torch.save(checkpoint, model_file)

    def load_model(self, model_file):
        """Load networks and all other model parameters
        
        Params
        ======
            model_file (string): name of the file that stores the model
        """
        checkpoint = torch.load(model_file)
        self.agents[0].actor.load_state_dict(checkpoint['actor_local1'])
        self.agents[0].critic.load_state_dict(checkpoint['critic_local1'])
        self.agents[0].target_actor.load_state_dict(
            checkpoint['actor_target1'])
        self.agents[0].target_critic.load_state_dict(
            checkpoint['critic_target1'])
        self.agents[1].actor.load_state_dict(checkpoint['actor_local2'])
        self.agents[1].critic.load_state_dict(checkpoint['critic_local2'])
        self.agents[1].target_actor.load_state_dict(
            checkpoint['actor_target2'])
        self.agents[1].target_critic.load_state_dict(
            checkpoint['critic_target2'])

    def act(self, states):
        """get actions from all agents in the MADDPG object"""

        actions = []
        for agent, state in zip(self.agents, states):
            if np.random.rand() < self.epsilon:
                actions_agent = np.random.randn(2)
                actions_agent = np.clip(actions_agent, -1, 1)
                actions.append(actions_agent)
            else:
                actions.append(agent.act(state))
        return actions

    def target_act(self, states):
        """get target network actions from all the agents in the MADDPG object """
        target_actions = [
            agent.target_act(obs) for agent, obs in zip(self.agents, states)
        ]
        return target_actions

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn.

        Params
        ======
            states (array_like): current state (for each agent)
            actions (array_like): action taken at the current state (for each agent) 
            rewards (array_like): reward from an action (for each agent)
            next_states (array_like): next state of environment (for each agent)
            dones (array_like): true if the next state is the final one, false otherwise (for each agent)
        """

        # Save experience / reward
        self.buffer.add(states, actions, rewards, next_states, dones)

        self.iter = (self.iter + 1) % UPDATE_EVERY
        if self.iter == 0:

            # Learn, if enough samples are available in buffer
            if len(self.buffer) > BATCH_SIZE:
                for i in range(N_UPDATES):
                    experiences = self.buffer.sample()
                    for agent in range(self.n_agents):
                        self.learn(experiences, agent)
                        self.update_targets(agent)

    def learn(self, experiences, agent_number):
        """update the critics and actors of all the agents """

        # need to transpose each element of the samples
        # to flip obs[parallel_agent][agent_number] to
        # obs[agent_number][parallel_agent]

        states, actions, rewards, next_states, dones = experiences

        agent = self.agents[agent_number]
        agent.critic_optimizer.zero_grad()

        #critic loss = batch mean of (y- Q(s,a) from target network)^2
        #y = reward of this timestep + discount * Q(st+1,at+1) from target network

        target_actions = self.target_act(next_states)
        target_actions = torch.cat(target_actions, dim=1)
        t = torch.tensor(transpose_list(next_states.cpu().data.numpy()))
        next_states_all = t.view(t.shape[0], -1).to('cpu')
        target_critic_input = torch.cat(
            (next_states_all, target_actions.to('cpu')), dim=1).to(device)

        with torch.no_grad():
            q_next = agent.target_critic(target_critic_input)

        y = rewards[agent_number].view(
            -1, 1) + GAMMA * q_next * (1 - dones[agent_number].view(-1, 1))
        actions_all = torch.cat(torch.unbind(actions), dim=1)
        t = torch.tensor(transpose_list(states.cpu().data.numpy()))
        states_all = t.view(t.shape[0], -1).to('cpu')
        critic_input = torch.cat((states_all, actions_all.to('cpu')),
                                 dim=1).to(device)
        q = agent.critic(critic_input)

        critic_loss = F.mse_loss(q, y.detach())
        critic_loss.backward(retain_graph=True)
        agent.critic_optimizer.step()

        # update actor network using policy gradient
        agent.actor_optimizer.zero_grad()

        # make input to agent
        # detach the other agents to save computation
        # saves some time for computing derivative
        q_input = [self.agents[i].actor(state) if i == agent_number \
                   else self.agents[i].actor(state).detach()
                   for i, state in enumerate(states)]
        q_input = torch.cat(q_input, dim=1)

        # combine all the actions and observations for input to critic
        # many of the obs are redundant, and obs[1] contains all useful information already
        q_input2 = torch.cat((states_all.to('cpu'), q_input.to('cpu')), dim=1)

        # get the policy gradient
        actor_loss = -agent.critic(q_input2).mean()
        actor_loss.backward(retain_graph=True)
        agent.actor_optimizer.step()

    def update_targets(self, i):
        """soft update targets"""
        soft_update(self.agents[i].target_actor, self.agents[i].actor, TAU)
        soft_update(self.agents[i].target_critic, self.agents[i].critic, TAU)
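
# transpose_list(), used in learn() above, is not defined in this example. A common
# minimal implementation (assumed here) flips a nested sequence from
# [parallel_agent][agent_number] to [agent_number][parallel_agent] ordering:
def transpose_list(mylist):
    # zip(*...) transposes the two outer levels of nesting.
    return list(map(list, zip(*mylist)))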
Example No. 30
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 1024
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory; otherwise return None
        # (previously `rewards` was undefined on early steps and raised an error)
        rewards = None
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            rewards = self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        return rewards

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.noise())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
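        # get_action_gradients and train_fn are custom Keras backend functions
        # (presumably built with K.gradients / K.function over the critic and actor
        # models); the trailing 0 / 1 arguments are the Keras learning-phase flag.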

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        return rewards

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
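
# The OUNoise class used by the DDPG agents in these examples is not shown. Below is
# a common Ornstein-Uhlenbeck process sketch (an assumption, not the original code).
# Its constructor mirrors the Keras-based DDPG above (size, mu, theta, sigma); the
# PyTorch DDPGAgent's OUNoise(action_size, random_seed) presumably also takes a seed.
import numpy as np


class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the long-running mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # Mean-reverting Gaussian noise: dx = theta * (mu - x) + sigma * N(0, 1).
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state

    # The Keras-based DDPG calls .noise() instead of .sample(); alias it.
    noise = sample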