Example #1
    def learn(self, gamma):
        """Learn from experiences"""
        actor_losses = []
        critic_losses = []
        self.learn_step += 1
        for i in range(self.num_agents):
            experiences = self.memory.sample()
            actor_loss, critic_loss = self._learn(
                experiences, gamma, self.actor_local[i], self.actor_target[i],
                self.critic_local[i], self.critic_target[i],
                self.actor_optimizer[i], self.critic_optimizer[i])
            actor_losses.append(actor_loss)
            critic_losses.append(critic_loss)

        if self.learn_step % self.print_every == 0:
            self.writer.text('critic loss: {}'.format(np.mean(critic_losses)),
                             "Critic Multi Agent")
            save_to_txt(np.mean(critic_losses),
                        '{}/critic_losses_multi.txt'.format(self.dirname))
            self.writer.push(np.mean(critic_losses), "Loss(critic)")
            self.writer.text('actor loss: {}'.format(np.mean(actor_losses)),
                             "Actor Multi Agent")
            save_to_txt(np.mean(actor_losses),
                        '{}/actor_losses_multi.txt'.format(self.dirname))
            self.writer.push(np.mean(actor_losses), "Loss(actor)")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    def str2bool(value):
        # argparse's type=bool treats any non-empty string (e.g. "False") as True,
        # so parse boolean flags explicitly
        return str(value).lower() in ('yes', 'true', 't', '1')

    parser.add_argument("--num_episodes", type=int, default=1000, help="Total number of episodes to train")
    parser.add_argument("--max_t", type=int, default=1000, help="Max timestep in a single episode")
    parser.add_argument("--vis", type=str2bool, default=True, help="Whether to use visdom to visualise training")
    parser.add_argument("--model", type=str, default=None, help="Model checkpoint path, use if you wish to continue training from a checkpoint")
    parser.add_argument("--info", type=str, default="", help="Use this to attach notes to your runs")
    parser.add_argument("--stop_on_solve", type=str2bool, default=True, help="Stop as soon as the environment is solved")

    args = parser.parse_args()

    # visualiser
    writer = VisWriter(vis=args.vis)
    # save info/comments about the experiment
    save_to_txt(args.info, '{}/info.txt'.format(dirname))

    # Unity Env
    env = UnityEnvironment(file_name='env/Tennis_Linux_NoVis/Tennis.x86_64')    
    # brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    state = env_info.vector_observations
    state_shape = state.shape[1]
    action_size = brain.vector_action_space_size    

    agent = DDPGMultiAgent(state_shape, action_size, num_agents, writer=writer, random_seed=10, dirname=dirname, print_every=100, model_path=args.model)
    scores = ddpg(env, brain_name, num_agents, agent, writer, n_episodes=args.num_episodes, max_t=args.max_t, stop_on_solve=args.stop_on_solve)
    # save all scores
    save_to_txt('\n'.join(str(score) for score in scores), '{}/scores_multi_full.txt'.format(dirname))
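The learn method in Example #1 delegates the per-agent update to a _learn helper that is not part of the excerpt. The sketch below shows what such a DDPG update commonly looks like, assuming PyTorch networks with the same argument order as the call above; the TAU constant and the soft_update helper are illustrative assumptions, not taken from the source.

import torch
import torch.nn.functional as F

TAU = 1e-3  # assumed soft-update rate, not taken from the excerpt


def _learn(experiences, gamma, actor_local, actor_target,
           critic_local, critic_target, actor_optimizer, critic_optimizer):
    """One DDPG update for a single agent (sketch)."""
    states, actions, rewards, next_states, dones = experiences

    # Critic update: minimise the TD error against the target networks
    with torch.no_grad():
        next_actions = actor_target(next_states)
        q_targets_next = critic_target(next_states, next_actions)
        q_targets = rewards + gamma * q_targets_next * (1 - dones)
    q_expected = critic_local(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor update: maximise the local critic's estimate of the policy
    actions_pred = actor_local(states)
    actor_loss = -critic_local(states, actions_pred).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Slowly blend the local weights into the target networks
    soft_update(critic_local, critic_target, TAU)
    soft_update(actor_local, actor_target, TAU)
    return actor_loss.item(), critic_loss.item()


def soft_update(local_model, target_model, tau):
    """target = tau * local + (1 - tau) * target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)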
Example #3
    def step(self, states, actions, rewards, next_states, dones):
        """Performs the learning step.

        Save experience in replay memory, and sample uniformly at random
        from buffer to learn.
        """
        self.learn_step += 1
        # store a single entry for each step, i.e. the experiences of
        # all agents at that step get stored as a single entry.
        states = np.expand_dims(states, 0)
        actions = np.expand_dims(
            np.array(actions).reshape(self.num_agents, self.action_size), 0)
        rewards = np.expand_dims(
            np.array(rewards).reshape(self.num_agents, -1), 0)
        dones = np.expand_dims(np.array(dones).reshape(self.num_agents, -1), 0)
        next_states = np.expand_dims(
            np.array(next_states).reshape(self.num_agents, -1), 0)
        # Use debugger to explore the shape
        # import pdb; pdb.set_trace()
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn from experience only once we have enough samples in memory
        if len(self.memory) < self.config['BATCH_SIZE']:
            return
        if self.learn_step % self.config['LEARN_STEP'] != 0:
            return
        experiences = self.memory.sample()
        actor_losses = []
        critic_losses = []
        for agent in self.agents:
            actor_loss, critic_loss = agent.learn(self.agents, experiences,
                                                  self.config['GAMMA'])
            actor_losses.append(actor_loss)
            critic_losses.append(critic_loss)

        # Plot real-time graphs and store losses
        if self.learn_step % self.print_every == 0:
            # Save Critic loss
            save_to_txt(critic_losses,
                        '{}/critic_losses.txt'.format(self.dirname))
            self.writer.text('critic loss: {}'.format(critic_losses), "Critic")
            self.writer.push(critic_losses, "Loss(critic)")
            # Save Actor loss
            save_to_txt(actor_losses,
                        '{}/actor_losses.txt'.format(self.dirname))
            self.writer.text('actor loss: {}'.format(actor_losses), "Actor")
            self.writer.push(actor_losses, "Loss(actor)")
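The step methods in Examples #3 and #4 assume a memory object that exposes add, sample, and len(). A minimal uniform-sampling replay buffer matching that interface could look like the sketch below; the buffer size, batch size, and seed handling are assumptions for illustration.

import random
from collections import deque


class ReplayBuffer:
    """Fixed-size buffer sampled uniformly at random (sketch)."""

    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, states, actions, rewards, next_states, dones):
        # each entry holds the joint experience of all agents for one step
        self.memory.append((states, actions, rewards, next_states, dones))

    def sample(self):
        # uniform random minibatch; callers typically convert these to tensors
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)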
Example #4
    def step(self, states, actions, rewards, next_states, dones):
        """Performs the learning step.
        """
        # store a single entry for results from all agents by adding axis=0
        states, actions, rewards, next_states, dones = self.reshape(
            states, actions, rewards, next_states, dones)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn from experience only once we have enough samples in memory
        if (len(self.memory) > self.batch_size
                and self.learn_step % self.update_every == 0):

            experiences = self.memory.sample()
            actor_losses = []
            critic_losses = []

            for agent in self.agents:
                actor_loss, critic_loss = agent.learn(self.agents, experiences,
                                                      self.gamma)
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            # Plot real-time graphs and store losses
            if self.learn_step % self.print_every == 0:
                # Save Critic loss
                utils.save_to_txt(
                    critic_losses,
                    '{}/critic_losses.txt'.format(self.result_dir))
                self.writer.text('critic loss: {}'.format(critic_losses),
                                 'Critic')
                self.writer.push(critic_losses, 'Loss(critic)')
                # Save Actor loss
                utils.save_to_txt(
                    actor_losses,
                    '{}/actor_losses.txt'.format(self.result_dir))
                self.writer.text('actor loss: {}'.format(actor_losses),
                                 'Actor')
                self.writer.push(actor_losses, 'Loss(actor)')

            self.critic_loss = np.array(critic_losses).mean()
            self.actor_loss = np.array(actor_losses).mean()

        return self.critic_loss, self.actor_loss
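Example #4 folds the per-agent arrays into a single replay entry through a self.reshape helper that is not shown. Judging from the explicit reshaping in Example #3, it plausibly looks like the sketch below, written as a free function here for clarity; num_agents and action_size would come from the agent's attributes.

import numpy as np


def reshape(num_agents, action_size, states, actions, rewards, next_states,
            dones):
    """Stack the joint experience of all agents into one batch entry (sketch)."""
    states = np.expand_dims(states, 0)
    actions = np.expand_dims(
        np.array(actions).reshape(num_agents, action_size), 0)
    rewards = np.expand_dims(np.array(rewards).reshape(num_agents, -1), 0)
    dones = np.expand_dims(np.array(dones).reshape(num_agents, -1), 0)
    next_states = np.expand_dims(
        np.array(next_states).reshape(num_agents, -1), 0)
    return states, actions, rewards, next_states, dones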
Example #5
def maddpg(env,
           brain_name,
           num_agents,
           agent,
           writer,
           n_episodes=300,
           max_t=1000,
           print_every=50,
           stop_on_solve=True):
    """Train DDPG Agent

    Params    
    ======
        env (object): Unity environment instance
        brain_name (string): name of brain
        num_agents (int): number of agents
        agent (DDPGMultiAgent): agent instance
        writer (VisWriter): Visdom visualiser for realtime plots
        n_episodes (int): number of episodes to train the network
        max_t (int): number of timesteps in each episode
        print_every (int): how often to print the progress
        stop_on_solve (bool): whether to stop training as soon as environment is solved
    """
    best_score = -np.inf
    scores_deque = deque(maxlen=100)
    maxt_deque = deque(maxlen=20)
    best_maxt = 0
    scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros(num_agents)
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[
                brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            score += env_info.rewards  # update the score (for each agent)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states  # roll over states to next time step
            if np.any(dones):
                logger.debug('Episode {} done at t = {}'.format(i_episode, t))
                maxt_deque.append(t)
                if t >= best_maxt:
                    best_maxt = t

                break  # exit loop if episode finished

        scores_deque.append(np.max(score))
        scores.append(np.max(score))
        current_score = np.mean(scores_deque)
        # keep storing the current score (in case we terminate early, we'll still have data for plotting/comparison)
        save_to_txt(current_score, '{}/scores_multi.txt'.format(dirname))
        # Publish and save
        writer.text(
            'Episode {}/{}: Average score(100): {}'.format(
                i_episode, n_episodes, current_score), "Average 100 episodes")
        writer.push(np.mean(scores_deque), "Average Score")
        logger.info(
            'Episode {}\tAverage Score: {:.2f}, Average max_t: {:.2f}, Best max_t: {}'
            .format(i_episode, current_score, np.mean(maxt_deque), best_maxt))

        if len(scores) > 0:
            writer.push(scores[-1], "Score")

        # if current_score >= best_score:
        if current_score > best_score:
            logger.info('Best score found, old: {}, new: {}'.format(
                best_score, current_score))
            best_score = current_score
            agent.checkpoint()

        if i_episode % print_every == 0:
            logger.info('Episode {}\tAverage Score: {:.2f}'.format(
                i_episode, current_score))

        # check environment solved
        if current_score >= 0.5:
            logger.info('Environment solved in {} episodes'.format(i_episode))
            if stop_on_solve:
                logger.info('Terminating agent training')
                break

    logger.info('Final Average Score: {:.2f}'.format(current_score))
    return scores
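Several of these examples log scalars through a save_to_txt helper whose definition is not part of the excerpts. A minimal version consistent with how it is called (one value appended per call, creating the file if needed) could be:

def save_to_txt(value, path):
    """Append one line to a text file (sketch)."""
    with open(path, 'a') as f:
        f.write('{}\n'.format(value))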
Example #6
def train(
    agent_group: mag.MADDPGAgentGroup,
    env: UnityEnvironment,
    brain_name: str,
    num_agents: int,
    writer: utils.VisWriter,
    result_dir: str,
    logger,
    num_episodes: int = 10000,
    max_t: int = 5000,
    print_every: int = 100,
    passing_score: float = 0.5,
):
    scores_deque = deque(maxlen=print_every)
    max_t_deque = deque(maxlen=print_every)

    i_episode = 0
    scores = []
    current_t = 0
    best_max_t = 0

    for i_episode in range(1, num_episodes + 1):
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        score = np.zeros(num_agents)
        agent_group.reset()

        critic_loss, actor_loss = 0, 0
        t = 0
        for t in range(max_t):
            # agent acts
            actions = agent_group.act(states)

            if current_t % print_every == 0:
                for i in range(actions[0].shape[0]):
                    action_from_dim = [a[i] for a in actions]
                    writer.push(action_from_dim, f'Actions(dim-{i})')

            # receives feedback from env
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            score += rewards

            # agent explores or learns
            critic_loss, actor_loss = agent_group.step(states, actions,
                                                       rewards, next_states,
                                                       dones)

            states = next_states

            if np.any(dones):
                logger.debug('Episode {} done at t = {}'.format(i_episode, t))
                if t >= best_max_t:
                    best_max_t = t
                    max_t_deque.append(best_max_t)
                break

            current_t += 1

        max_score = np.max(score)
        scores_deque.append(max_score)
        scores.append(max_score)
        current_score = np.mean(scores_deque)

        # keep track of scores
        utils.save_to_txt(current_score, '{}/scores.txt'.format(result_dir))

        logger.info(
            f'Episode {i_episode}, score : {max_score:.3f}. Average score: {current_score:.3f}. (critic_loss: {critic_loss:.7f}, actor_loss:{actor_loss:.7f})'
        )

        # Publish and save
        writer.text(
            'Episode {}/{}: Average score(100): {}'.format(
                i_episode, num_episodes, current_score),
            'Average 100 episodes')
        writer.push(current_score, 'Average Score')
        logger.info(
            'Episode {}\tAverage Score: {:.2f}, Average max_t: {:.2f}, Best max_t: {}'
            .format(i_episode, current_score, np.mean(max_t_deque),
                    best_max_t))

        if len(scores) > 0:
            writer.push(scores[-1], 'Score')

        if current_score >= passing_score:
            logger.info(
                f'\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score: {np.mean(scores_deque):.4f}, passing score: {passing_score}. Saving models.'
            )
            agent_group.save()
            break

        # save models after every episode
        agent_group.save()

    return scores
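For completeness, Example #6's train function is wired up the same way as the main function in Example #1. The sketch below mirrors that setup; the MADDPGAgentGroup constructor arguments and the logger setup are assumptions, not taken from these excerpts.

import logging

# environment setup follows Example #1
env = UnityEnvironment(file_name='env/Tennis_Linux_NoVis/Tennis.x86_64')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

num_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

writer = utils.VisWriter(vis=True)
logger = logging.getLogger(__name__)

# constructor signature assumed for illustration
agent_group = mag.MADDPGAgentGroup(state_size, action_size, num_agents,
                                   writer, result_dir='results')
scores = train(agent_group, env, brain_name, num_agents, writer,
               result_dir='results', logger=logger)
env.close()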