Example #1
class MAgent():
    def __init__(self, state_size, action_size, num_agents, random_seed,
                 shared_replay_buffer):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.shared_replay_buffer = shared_replay_buffer

        self.t_step = 0

        if shared_replay_buffer:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)
            shared_memory = self.memory
        else:
            shared_memory = None
            self.memory = None

        print("ma shared_memory -> ", shared_memory)

        self.ddpg_agents = [
            Agent(state_size, action_size, random_seed, shared_memory)
            for _ in range(num_agents)
        ]
#         print("MAgent: number of agents: ->", num_agents)
#         print("Enter into ddpg Agent")

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def act(self, all_states):
        """get actions from all agents in the MADDPG object"""
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.ddpg_agents, all_states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory: the shared buffer if one exists,
        # otherwise each agent's own buffer (self.memory is None in that case).
        if self.shared_replay_buffer:
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                self.memory.add(state, action, reward, next_state, done)
        else:
            for agent, state, action, reward, next_state, done in zip(
                    self.ddpg_agents, states, actions, rewards, next_states,
                    dones):
                agent.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            memory = (self.memory if self.shared_replay_buffer
                      else self.ddpg_agents[0].memory)
            if len(memory) > BATCH_SIZE:
                for agent in self.ddpg_agents:
                    if self.shared_replay_buffer:
                        experiences = self.memory.sample()
                    else:
                        experiences = agent.memory.sample()

                    agent.learn(experiences, GAMMA)
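A minimal usage sketch for this wrapper is shown below. The environment interaction is faked with zero arrays (a two-agent, Tennis-style setup is assumed), and Agent, ReplayBuffer and the hyperparameter constants are taken to be defined as in the surrounding examples; only MAgent and its methods come from the snippet above.

import numpy as np

# Hypothetical dimensions for a two-agent Tennis-style environment.
NUM_AGENTS, STATE_SIZE, ACTION_SIZE = 2, 24, 2

magent = MAgent(state_size=STATE_SIZE, action_size=ACTION_SIZE,
                num_agents=NUM_AGENTS, random_seed=0,
                shared_replay_buffer=True)

for episode in range(3):                                   # tiny demo loop
    states = np.zeros((NUM_AGENTS, STATE_SIZE))            # stand-in for env.reset()
    magent.reset()
    for t in range(5):
        actions = magent.act(states)                       # one action per agent
        next_states = np.zeros((NUM_AGENTS, STATE_SIZE))   # stand-in for env.step()
        rewards = [0.0] * NUM_AGENTS
        dones = [False] * NUM_AGENTS
        magent.step(states, actions, rewards, next_states, dones)
        states = next_states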
Example #2
 def __init__(self, config):
     self.config = config
     self.n_agents = config.env.n_agents
     self.ddpg_agents = [
         Agent(i, config) for i in range(self.config.env.n_agents)
     ]
     # the shared replay buffer
     self.memory = ReplayBuffer(config)
     self.t_step = 0
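The config object this constructor expects is not shown on this page. A plausible stand-in built with types.SimpleNamespace might look like the following; all field names are inferred from how config is used here and in Example #17, which contains the full class (MultiAgent).

from types import SimpleNamespace

# Hypothetical nested config mirroring the attributes these examples access
# (config.env.n_agents, config.hp.batch_size, config.hp.update_every, ...).
config = SimpleNamespace(
    env=SimpleNamespace(n_agents=2, state_size=24, action_size=2),
    hp=SimpleNamespace(buffer_size=int(1e5), batch_size=128, update_every=2,
                       num_updates=1, gamma=0.99),
    general=SimpleNamespace(device="cpu"),
)

multi_agent = MultiAgent(config)   # wrapper class shown in full in Example #17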
Example #3
    def __init__(self, state_size, action_size, num_agents, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random.seed(random_seed)

        # Build distinct Agent instances; multiplying a one-element list
        # would make every entry reference the same agent object.
        self.agents = [Agent(state_size, action_size, random_seed)
                       for _ in range(num_agents)]
        self.shared_memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                          random_seed)
Example #4
 def __init__(self, config):
     self.config = config
     # Replay memory
     self.memory = ReplayBuffer(self.config.action_size,
                                self.config.buffer_size,
                                self.config.batch_size, self.config.seed)
     self.agents = [
         Agent(self.config) for _ in range(self.config.num_agents)
     ]
     # 'action_size', 'num_agents', and 'random_seed'
     #self.agents = [Agent(self.config, self.config.action_size, self.config.num_agents, self.config.random_seed) for _ in range(self.config.num_agents)]
     self.t_step = 0
     self.loss = (0.0, 0.0)
Example #5
class MADDPGAgent:
    def __init__(self, state_size, action_size, num_agents, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random.seed(random_seed)

        # Build distinct Agent instances; multiplying a one-element list
        # would make every entry reference the same agent object.
        self.agents = [Agent(state_size, action_size, random_seed)
                       for _ in range(num_agents)]
        self.shared_memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                          random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.shared_memory.add(state, action, reward, next_state, done)

        if len(self.shared_memory) > BATCH_SIZE and step % LEARN_EVERY == 0:
            for _ in range(LEARN_N_TIMES):
                for agent in self.agents:
                    experiences = self.shared_memory.sample()
                    agent.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        actions = []
        for state, agent in zip(states, self.agents):
            state = np.expand_dims(state, axis=0)
            action = agent.act(state)
            action = np.reshape(action, newshape=(-1))
            actions.append(action)
        actions = np.stack(actions)
        return actions

    def save_weights(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_' + str(i) + '.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_' + str(i) + '.pth')

    def load_weights(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load('checkpoint_actor_' + str(i) + '.pth'))
            agent.critic_local.load_state_dict(
                torch.load('checkpoint_critic_' + str(i) + '.pth'))

    def reset(self):
        for agent in self.agents:
            agent.reset()
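The reset() calls above (and the "Resets OU Noise for each agent" docstring in Example #14) re-centre each DDPG agent's exploration noise process, which is not itself included on this page. A minimal Ornstein-Uhlenbeck noise sketch along the usual DDPG lines, with illustrative parameter values, might be:

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (typically called once per episode)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.array([random.random() for _ in range(len(self.state))])
        self.state = self.state + dx
        return self.state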
Example #6
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(ddpg.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(ddpg.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ddpg.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(ddpg.device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(ddpg.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=ddpg.LR_CRITIC,
                                           weight_decay=ddpg.WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, ddpg.BUFFER_SIZE,
                                   ddpg.BATCH_SIZE, random_seed)

        # Create agents
        self.agents = []
        for i in range(num_agents):
            agent = Agent(self, state_size, action_size, random_seed)
            self.agents.append(agent)
Example #7
 def __init__(self, num_agents=2, state_size=24, action_size=2):
     """Initialize a maddpg_agent wrapper.
     Params
     ======
         num_agents (int): the number of agents in the environment
         state_size (int): dimension of each state
         action_size (int): dimension of each action
     """
     self.num_agents = num_agents
     self.state_size = state_size
     self.action_size = action_size
     
     self.agents = [ddpg_agent(state_size, action_size, i+1, random_seed=0) for i in range(num_agents)]
     
     # Replay memory
     self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)
Example #8
    def __init__(self, action_size=2, seed=42, n_agents=2):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of distinct agents
        """

        self.n_agents = n_agents
        self.timestep = 0

        self.agents = [DDPG(i) for i in range(n_agents)]

        # common buffer for both the agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
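Nearly every snippet on this page constructs ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) without showing the buffer itself. A minimal sketch that is consistent with that call signature and with the len()/sample() usage above follows; the namedtuple layout and tensor conversion are assumptions, not code from these projects.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer storing experience tuples and sampling random minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single experience tuple to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Sample a random minibatch and convert it to torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently stored (used for the > BATCH_SIZE checks)."""
        return len(self.memory)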
Example #9
class MADDPG:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.memory = ReplayBuffer(self.config.action_size,
                                   self.config.buffer_size,
                                   self.config.batch_size, self.config.seed)
        self.agents = [
            Agent(self.config) for _ in range(self.config.num_agents)
        ]
        # 'action_size', 'num_agents', and 'random_seed'
        #self.agents = [Agent(self.config, self.config.action_size, self.config.num_agents, self.config.random_seed) for _ in range(self.config.num_agents)]
        self.t_step = 0
        self.loss = (0.0, 0.0)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, self.t_step, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % self.config.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                closs = []
                aloss = []
                for agent in self.agents:
                    experiences = self.memory.sample()
                    critic_loss, actor_loss = agent.learn(
                        experiences, self.config.discount)
                    closs.append(critic_loss)
                    aloss.append(actor_loss)
                self.loss = (np.mean(closs), np.mean(aloss))
Example #10
 def __init__(self, num_agents, state_size, action_size, random_seed):
     self.num_agents = num_agents
     self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                random_seed)
     self.agents = [
         Agent(state_size, action_size, self.memory, BATCH_SIZE,
               random_seed) for agent_posit in range(num_agents)
     ]
Example #11
    def __init__(self,
                 state_size=24,
                 action_size=2,
                 n_agents=2,
                 buffer_size=100000,
                 batch_size=256,
                 gamma=0.999,
                 update_every=4,
                 noise_start=1.0,
                 noise_decay=1.0,
                 t_stop_noise=30000,
                 seed=0):
        """
        Params
        ======
            action_size (int): dimension of each action
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            t_stop_noise (int): max number of timesteps with noise applied in training
            seed (int): Random seed
        """

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.noise_on = True
        self.t_stop_noise = t_stop_noise

        #         models = [model.Actor_Critic_Models(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [
            DDPG(i, state_size, action_size, n_agents) for i in range(n_agents)
        ]
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)
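The noise_start / noise_decay / t_stop_noise parameters documented above control exploration: the noise weight is multiplied by noise_decay every time an action is taken, and noise is switched off entirely once t_stop_noise environment steps have elapsed (the act() and step() methods that apply this schedule appear in Example #15 below). A tiny standalone illustration with made-up numbers:

# Illustrative only: how the exploration-noise schedule behaves over time.
noise_weight, noise_decay, t_stop_noise = 1.0, 0.999, 30000

for t_step in range(1, 40001):
    noise_on = t_step <= t_stop_noise      # hard cut-off after t_stop_noise steps
    noise_weight *= noise_decay            # geometric decay per action taken
    if t_step in (1, 1000, 30000, 40000):
        print(t_step, noise_on, round(noise_weight, 4))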
Example #12
def ddpg_train(n_episodes, seed, buffer_size, batch_size, gamma, tau, lr_actor,
               lr_critic, weight_decay):
    memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
    agents = [
        Agent(state_size, action_size, seed, buffer_size, batch_size, gamma,
              tau, lr_actor, lr_critic, weight_decay, memory)
        for _ in range(num_agents)
    ]
    load(agents)
    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        episode_scores = np.zeros(num_agents)
        while True:
            for agent in agents:
                agent.reset()
            actions = list()
            for agent, state in zip(agents, states):
                actions.append(agent.act(state))
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for agent, state, action, reward, next_state, done in zip(
                    agents, states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)
            states = next_states
            episode_scores += np.array(rewards)
            if np.any(dones):
                break
        score = episode_scores.max()
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode: \t{} \tScore: \t{:.2f} \tAverage Score: \t{:.2f}'.
              format(i_episode, score, np.mean(scores_deque)),
              end="")
        if i_episode % 10 == 0:
            save(agents)
        if np.mean(scores_deque) >= 0.5:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(i_episode, np.mean(scores_deque)))
            break
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.plot(np.arange(len(scores)), scores)
    ax.set(xlabel="Episode #", ylabel="Score", title="DDPG Network")
    fig.savefig("ddpg_network.pdf")
Example #13
    def __init__(self, state_size, action_size, num_agents, random_seed,
                 shared_replay_buffer):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.shared_replay_buffer = shared_replay_buffer

        self.t_step = 0

        if shared_replay_buffer:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)
            shared_memory = self.memory
        else:
            shared_memory = None
            self.memory = None

        print("ma shared_memory -> ", shared_memory)

        self.ddpg_agents = [
            Agent(state_size, action_size, random_seed, shared_memory)
            for _ in range(num_agents)
        ]
Example #14
class maddpg_agent:
    """Wrapper class managing different agents in the environment."""

    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.
        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        
        self.agents = [ddpg_agent(state_size, action_size, i+1, random_seed=0) for i in range(num_agents)]
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)
        
    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()
            
    def act(self, observations, add_noise=False):
        """Picks an action for each agent given."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)
    
    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory."""
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        
        self.memory.add(states, actions, rewards, next_states, dones)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep%LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)
            
    def learn(self, experiences, agent_number):
        """ The critic takes as its input the combined observations and 
        actions from all agents. Collect actions from each agent for the 'experiences'. """
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences
        
        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)
        
        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)
            
            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)
            
            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))
            
        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        
        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)
                
    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)
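The learn() docstring above spells out the central MADDPG idea: each agent's critic takes the combined observations and actions of all agents. The Critic network itself is not reproduced on this page; a minimal sketch of such a centralized critic (layer sizes and names are illustrative) could look like this:

import torch
import torch.nn as nn
import torch.nn.functional as F

class CentralizedCritic(nn.Module):
    """Q(s_all, a_all): scores the joint state and joint action of all agents."""

    def __init__(self, state_size=24, action_size=2, num_agents=2,
                 fc1_units=256, fc2_units=128):
        super().__init__()
        full_state = state_size * num_agents      # e.g. 48 for 2 x 24
        full_action = action_size * num_agents    # e.g. 4 for 2 x 2
        self.fc1 = nn.Linear(full_state, fc1_units)
        self.fc2 = nn.Linear(fc1_units + full_action, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, all_states, all_actions):
        # Joint observations first, then concatenate the joint action.
        x = F.relu(self.fc1(all_states))
        x = torch.cat((x, all_actions), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Shapes for a batch of 128 samples and two Tennis agents:
# all_states: (128, 48), all_actions: (128, 4) -> Q-values: (128, 1)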
Example #15
class MADDPG():
    """Agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self,
                 state_size=24,
                 action_size=2,
                 n_agents=2,
                 buffer_size=100000,
                 batch_size=256,
                 gamma=0.999,
                 update_every=4,
                 noise_start=1.0,
                 noise_decay=1.0,
                 t_stop_noise=30000,
                 seed=0):
        """
        Params
        ======
            action_size (int): dimension of each action
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            t_stop_noise (int): max number of timesteps with noise applied in training
            seed (int): Random seed
        """

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.noise_on = True
        self.t_stop_noise = t_stop_noise

        #         models = [model.Actor_Critic_Models(n_agents=n_agents) for _ in range(n_agents)]
        self.agents = [
            DDPG(i, state_size, action_size, n_agents) for i in range(n_agents)
        ]
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        if self.t_step > self.t_stop_noise:
            self.noise_on = False

        self.t_step += 1
        if self.t_step % self.update_every == 0 and len(
                self.memory) > self.batch_size:
            experiences = [self.memory.sample() for _ in range(self.n_agents)]
            self.learn(experiences, self.gamma)

    def act(self, all_states, add_noise=True):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state,
                               noise_weight=self.noise_weight,
                               add_noise=self.noise_on)
            self.noise_weight *= self.noise_decay
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)

    def save_agents(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(), f"actor_agent{i}.pth")
            torch.save(agent.critic_local.state_dict(), f"critic_agent{i}.pth")
Example #16
            torch.save(agent.actor_local.state_dict(), model_path.format('actor', i_episode))
            torch.save(agent.critic_local.state_dict(), model_path.format('critic', i_episode))
            np.save('scores_{0}.npy'.format(i_episode), scores)

        if np.mean(scores_window) >= success_score:
            tag = 'success'
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100,
                                                                                         np.mean(scores_window)))
            torch.save(agent.actor_local.state_dict(), model_path.format('actor', tag))
            torch.save(agent.critic_local.state_dict(), model_path.format('critic', tag))
            np.save('scores_{0}.npy'.format(tag), score)
            break
    return list_scores


if __name__ == '__main__':

    print(opt)
    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    state_size, action_size = get_settings(env_info, brain)

    memory = ReplayBuffer(action_size, opt.buffer_size, opt.batch_size, opt.seed)

    agent = Agent(state_size, action_size, opt.seed, opt.buffer_size, opt.batch_size, opt.gamma, opt.tau, opt.lr_actor,
                   opt.lr_critic, opt.weight_decay)
    scores = train_agent(env, agent, brain_name, opt.model_path, opt.n_episodes, opt.success_score)
    env.close()
Example #17
class MultiAgent:
    """Meta agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self, config):
        self.config = config
        self.n_agents = config.env.n_agents
        self.ddpg_agents = [
            Agent(i, config) for i in range(self.config.env.n_agents)
        ]
        # the shared replay buffer
        self.memory = ReplayBuffer(config)
        self.t_step = 0

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def step(self, states, actions, rewards, next_states, dones):
        states = states.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        self.t_step = (self.t_step + 1) % self.config.hp.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.hp.batch_size:
                for _ in range(self.config.hp.num_updates):
                    # each agent does its own sampling from the replay buffer
                    experiences = [
                        self.memory.sample()
                        for _ in range(self.config.env.n_agents)
                    ]
                    self.learn(experiences, self.config.hp.gamma)

    def act(self, states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.ddpg_agents, states):
            action = agent.act(state, add_noise=add_noise)
            all_actions.append(action)
        return np.array(all_actions).reshape(
            1, -1)  # reshape 2x2 into 1x4 dim vector

    def learn(self, experiences, gamma):
        # each agent uses its own actor to calculate next_actions
        all_next_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            _, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            # reshape to (batch, n_agents, state_size) and select this agent's slice
            next_state = next_states.reshape(-1, self.config.env.n_agents, self.config.env.state_size) \
                        .index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent uses its own actor to calculate actions
        all_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            states, _, _, _, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            state = states.reshape(-1, self.config.env.n_agents, self.config.env.state_size)\
                    .index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)

        # each agent learns from its experience sample
        for i, agent in enumerate(self.ddpg_agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)
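The per-agent learn(i, experiences[i], gamma, all_next_actions, all_actions) call that this wrapper (and the one in Example #18) delegates to is not shown on this page. A sketch of how such an update is commonly structured in MADDPG follows: a centralized-critic TD update, then an actor update through that critic, then soft target updates. The function name, the attribute names on agent, and the per-agent reward/done columns are assumptions about the unshown DDPG class, not code from these projects.

import torch
import torch.nn.functional as F

def maddpg_learn(agent, agent_id, experiences, gamma,
                 all_next_actions, all_actions, tau=1e-3):
    """One MADDPG update for a single agent with a centralized critic."""
    states, actions, rewards, next_states, dones = experiences

    # ---- critic update: move Q(s_all, a_all) towards the TD target ----
    next_actions = torch.cat(all_next_actions, dim=1)           # joint next action
    q_targets_next = agent.critic_target(next_states, next_actions)
    # Use only this agent's reward/done column for its own TD target.
    q_targets = rewards[:, agent_id].unsqueeze(1) + \
        gamma * q_targets_next * (1 - dones[:, agent_id].unsqueeze(1))
    q_expected = agent.critic_local(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets.detach())
    agent.critic_optimizer.zero_grad()
    critic_loss.backward()
    agent.critic_optimizer.step()

    # ---- actor update: maximize Q for this agent's predicted action ----
    # Detach the other agents' predicted actions so only this actor gets gradients.
    actions_pred = [a if i == agent_id else a.detach()
                    for i, a in enumerate(all_actions)]
    actions_pred = torch.cat(actions_pred, dim=1)
    actor_loss = -agent.critic_local(states, actions_pred).mean()
    agent.actor_optimizer.zero_grad()
    actor_loss.backward()
    agent.actor_optimizer.step()

    # ---- soft-update the target networks towards the local networks ----
    for target, local in ((agent.critic_target, agent.critic_local),
                          (agent.actor_target, agent.actor_local)):
        for t_param, l_param in zip(target.parameters(), local.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)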
Example #18
class MADDPG():
    def __init__(self, action_size=2, seed=42, n_agents=2):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of distinct agents
        """

        self.n_agents = n_agents
        self.timestep = 0

        self.agents = [DDPG(i) for i in range(n_agents)]

        # common buffer for both the agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(
            1, -1)  # reshape 2x24 into 1x48 dim vector
        all_next_states = all_next_states.reshape(
            1, -1)  # reshape 2x24 into 1x48 dim vector
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        self.timestep += 1
        if self.timestep % 2 == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # sample from the replay buffer for each agent
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, GAMMA)

    def act(self, all_states, add_noise=True):
        # calculate each agents action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, noise_weight=0.5, add_noise=add_noise)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        # each agent uses its own actor to calculate next_actions
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            # extract agent i's state and get action via actor network
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            # extract agent i's next state and get action via target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent learns from its experience sample
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)

    def save_agents(self):
        # save models
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoint_actor_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoint_critic_{i}.pth")
Example #19
def train(
        env_location,
        curve_path,
        n_episodes=1000,
        batch_size=512,
        buffer_size=int(1e6),
):

    env = UnityEnvironment(file_name=env_location)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info(
        'There are {} agents. Each observes a state with length: {}'.format(
            states.shape[0], state_size))
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # reset the environment

    # Replay memory
    random_seed = 2
    memory0 = ReplayBuffer(action_size, buffer_size, batch_size, random_seed)
    memory1 = memory0

    def create_agent(memory):
        return Agent(state_size=states.shape[1],
                     action_size=brain.vector_action_space_size,
                     random_seed=random_seed,
                     memory=memory,
                     batch_size=batch_size)

    agent0 = create_agent(memory0)
    agent1 = create_agent(memory1)

    def ddpg(n_episodes, average_window=100, plot_every=4):
        scores_deque = deque(maxlen=average_window)
        scores_all = []
        average_scores_all = []

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            states = np.array(
                env_info.vector_observations,
                copy=True)  # get the current state (for each agent)
            agent0.reset()
            agent1.reset()
            scores = np.zeros(
                num_agents)  # initialize the score (for each agent)

            while True:
                action0 = agent0.act(states[0])
                action1 = agent1.act(states[1])
                actions = np.concatenate((action0, action1))

                env_info = env.step(actions)[
                    brain_name]  # send all actions to the environment
                next_states = env_info.vector_observations  # get next state (for each agent)
                rewards = env_info.rewards  # get reward (for each agent)
                dones = env_info.local_done  # see if episode finished

                memory0.add(states[0], action0, rewards[0], next_states[0],
                            dones[0])
                memory1.add(states[1], action1, rewards[1], next_states[1],
                            dones[1])

                agent0.step()
                agent1.step()

                scores += env_info.rewards  # update the score (for each agent)
                states = next_states  # roll over states to next time step
                any_done = np.any(dones)
                assert any_done == np.all(dones)
                if any_done:  # exit loop if episode finished
                    break

            score_episode = np.max(scores)
            best_agent = np.argmax(scores)
            scores_deque.append(score_episode)
            scores_all.append(score_episode)
            average_score_queue = np.mean(scores_deque)
            average_scores_all.append(average_score_queue)

            logger.info(
                '\rEpisode {}\tScore: {:.4f}\tBest Agent: {}\tAverage Score: {:.4f}'
                .format(i_episode, score_episode, best_agent,
                        average_score_queue))
            torch.save(agent0.actor_local.state_dict(),
                       'checkpoint_actor0.pth')
            torch.save(agent0.critic_local.state_dict(),
                       'checkpoint_critic0.pth')
            torch.save(agent1.actor_local.state_dict(),
                       'checkpoint_actor1.pth')
            torch.save(agent1.critic_local.state_dict(),
                       'checkpoint_critic1.pth')
            if i_episode > average_window and average_score_queue > 1.0:
                break

            if i_episode % plot_every == 0:
                plot_curve(scores_all, average_scores_all)

        return scores_all, average_scores_all

    scores, average_scores = ddpg(n_episodes=n_episodes)
    plot_curve(scores, average_scores)

    env.close()

    return np.max(average_scores)
Example #20
def train(env_location, curve_path, n_episodes=1000):
    env = UnityEnvironment(file_name=env_location)

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    logger.info(f'Number of agents: {num_agents}')

    # size of each action
    action_size = brain.vector_action_space_size
    logger.info(f'Size of each action: {action_size}')

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    logger.info('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
    logger.info(f'The state for the first agent looks like: {states[0]}')

    # reset the environment

    # Replay memory
    BUFFER_SIZE = int(1e6)  # replay buffer size
    BATCH_SIZE = 1024        # minibatch size
    random_seed = 2
    memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def create_agent():
        return Agent(state_size=states.shape[1], action_size=brain.vector_action_space_size, random_seed=random_seed, memory=memory, batch_size=BATCH_SIZE)

    agent = create_agent()

    def ddpg(n_episodes, average_window=100, plot_every=4):
        scores_deque = deque(maxlen=100)
        scores_all = []

        for i_episode in range(1, n_episodes+1):
            env_info = env.reset(train_mode=True)[brain_name]
            states = np.array(env_info.vector_observations, copy=True)                  # get the current state (for each agent)
            agent.reset()
            scores = np.zeros(num_agents)                          # initialize the score (for each agent)

            while True:
                actions = agent.act(states)

                actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
                env_info = env.step(actions)[brain_name]           # send all actions to the environment
                next_states = env_info.vector_observations         # get next state (for each agent)
                rewards = env_info.rewards                         # get reward (for each agent)
                dones = env_info.local_done                        # see if episode finished

                # Add each agent's experience to the shared replay buffer
                for i in range(num_agents):
                    memory.add(states[i], actions[i], rewards[i],
                               next_states[i], dones[i])

                agent.step()

                scores += env_info.rewards                         # update the score (for each agent)
                states = next_states                               # roll over states to next time step
                any_done = np.any(dones)                           # check all agents, not just the last one
                assert any_done == np.all(dones)
                if any_done:                                  # exit loop if episode finished
                    break

            average_score_episode = np.mean(scores)
            scores_deque.append(average_score_episode)
            scores_all.append(average_score_episode)
            average_score_queue = np.mean(scores_deque)

            logger.info('\rEpisode {}\tScore: {:.2f}\tAverage Score: {:.2f}'.format(i_episode, average_score_episode, average_score_queue))
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor2.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic2.pth')
            if i_episode > 100 and average_score_queue > 30:
                break

            if i_episode % plot_every == 0:
                plot_curve(scores_all)

        return scores_all

    scores = ddpg(n_episodes=n_episodes)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(scores)+1), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.savefig('learning.curve.png')

    env.close()