Code Example #1
# imports for these examples; envs, buffer, maddpg, and utilities are repo-local
# modules of the MADDPG lab (module names inferred from usage, so treat as assumptions)
import os

import imageio
import numpy as np
import torch
from tensorboardX import SummaryWriter  # or torch.utils.tensorboard in newer setups

import envs
from buffer import ReplayBuffer
from maddpg import MADDPG
from utilities import transpose_list, transpose_to_tensor


def main():
    seeding()
    # number of parallel environments
    parallel_envs = 4
    # number of training episodes; increase this (say, to 30000) to experiment
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # save the policy and a gif every `save_interval` episodes
    save_interval = 5000
    t = 0  # global environment-step counter, advanced by parallel_envs each step

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # vectorized wrapper that runs `parallel_envs` copies of the environment
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models; each element corresponds to one agent in the simulation:
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg holds that agent's actor and critic models, e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # one reward per agent per parallel environment: shape (parallel_envs, 3)
        reward_this_episode = np.zeros((parallel_envs, 3))
        # obs is the observation of all three agents in each of the 4 parallel envs;
        # for the Physical Deception environment with three agents it has dimension 4x3x14.
        # obs_full is the world state irrespective of the agents; its dimension is 4x14.
        # all_obs = array(4 environments, 2 elements):
        #   element 0: a list of 3 arrays, one 14-dim state per agent
        #   element 1: the global state for that environment from the target/green agent's perspective (14 values)
        all_obs = env.reset()
        # obs: a list with one element per environment; each element is a list of 3 arrays,
        # one per agent, holding that agent's state in that environment.
        # obs_full: the god's-eye view of each environment; a list with one element per
        # environment, each an array of 14 values giving that environment's global state.
        obs, obs_full = transpose_list(all_obs)

        #for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward by the number of parallel environments
            t += parallel_envs

            # explore: only explore for a certain number of episodes
            # the action input needs to be transposed:
            # transpose_to_tensor(obs) reorganizes the data to each agent's point of view.
            # with 4 environments there are 4 copies of agent 1, agent 2, and agent 3,
            # so agent 1's combined state across the 4 environments is a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # each element is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act loops over this list and passes each element through the corresponding
            # agent's actor model to generate that agent's action.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction
            # there are 4 actions per agent and 3 agents, 12 in total; each action has 2 elements (force in x and y directions)
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 action components)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, since the
            # input to step requires the first index to be the parallel environments
            # the shape of actions_for_env is (4 env, 3 agent, 2 action)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # next_obs holds the observations of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it has dimension 4x3x14.
            # next_obs_full is the world state irrespective of the agents; its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample contains all 3 agents' data,
                # and we know which rewards and actions belong to which agent
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done
                # each element, say samples[0], is a list of 3 elements, one per agent;
                # each agent's entry holds its corresponding value (for obs, a vector of 14 values)
                # e.g., requesting 2 samples returns 2 samples, each containing all 3 agents' states, rewards, etc.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            # save once per interval, after collecting all agents' parameters
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
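To make the shape bookkeeping in the comments above concrete, here is a minimal, self-contained sketch (pure NumPy/PyTorch, no environment required) of the agent-major to environment-major flip that np.rollaxis performs before env.step:

import numpy as np
import torch

# three agents, four parallel environments, two action components (force in x, y)
actions = [torch.rand(4, 2) for _ in range(3)]  # one (env, action) tensor per agent, as maddpg.act returns

actions_array = torch.stack(actions).detach().numpy()
print(actions_array.shape)    # (3, 4, 2): agent-major

actions_for_env = np.rollaxis(actions_array, 1)
print(actions_for_env.shape)  # (4, 3, 2): env-major, as env.step expects

# actions_for_env[e][a] is agent a's action in environment e
assert np.allclose(actions_for_env[2][1], actions_array[1][2])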
Code Example #2
    def update(self, samples, agent_number, log):
        """update the critics and actors of all the agents """

        obs, obs_full, action, reward, next_obs, next_obs_full, done = zip(
            *samples)

        #input needs to be transposed from [samples of (agent 1 obs, agent 2 obs)] to [agent 1 (sample obs), agent 2 (sample obs)]
        agent1_next_obs, agent2_next_obs = zip(*next_obs)
        agent1_obs, agent2_obs = zip(*obs)
        obs = [
            torch.from_numpy(np.vstack(agent1_obs)).float().to(device),
            torch.from_numpy(np.vstack(agent2_obs)).float().to(device)
        ]
        done = transpose_to_nested_list(done)
        reward = transpose_to_nested_list(reward)

        # always update agent 0, then copy the networks to the other agent(s)
        agent = self.maddpg_agent[0]
        agent.critic_optimizer.zero_grad()

        #critic loss = batch mean of (y- Q(s,a) from target network)^2
        #y = reward of this timestep + discount * Q(st+1,at+1) from target network
        target_actions = self.target_act([agent1_next_obs, agent2_next_obs])

        #stack the various agents actions and observations to input to the critic
        target_actions = torch.cat(target_actions, 1)
        next_obs_full = torch.from_numpy(
            np.vstack(next_obs_full)).float().to(device)
        target_critic_input = torch.cat((next_obs_full, target_actions),
                                        1).to(device)

        with torch.no_grad():
            q_next = self.maddpg_agent[0].target_critic(target_critic_input)

        reward_i = torch.from_numpy(reward[agent_number]).float().to(device)
        done_i = torch.from_numpy(done[agent_number].astype(np.uint8)).float().to(device)
        y = reward_i + self.discount_factor * q_next * (1 - done_i)

        action = transpose_to_tensor(action)
        action = torch.cat(list(action), dim=1)
        critic_input = torch.cat(
            (torch.from_numpy(np.vstack(obs_full)).float().to(device), action),
            1).to(device)
        q = agent.critic(critic_input)

        critic_loss = torch.nn.functional.mse_loss(q, y.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
        agent.critic_optimizer.step()

        #update actor network using policy gradient
        agent.actor_optimizer.zero_grad()
        # make input to agent
        # detach the other agents to save computation
        # saves some time for computing derivative
        q_input = [ self.maddpg_agent[i].actor(ob) if i == agent_number \
                   else self.maddpg_agent[i].actor(ob).detach()
                   for i, ob in enumerate(obs) ]

        q_input = torch.cat(q_input, dim=1)
        obs_full = torch.from_numpy(np.vstack(obs_full)).float().to(device)
        # combine all the actions and observations for input to critic
        # many of the obs are redundant, and obs[1] contains all useful information already
        q_input2 = torch.cat((obs_full, q_input), dim=1)

        # get the policy gradient
        actor_loss = -agent.critic(q_input2).mean()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
        agent.actor_optimizer.step()

        self.copy_actor(0)  # copy updated actor network to the other agents

        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        #log.info('agent{}: critic loss\t{}\tactor loss\t{}'.format(agent_number,cl,al))
        return [al, cl]
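update_targets is called by the training loops but not shown in these excerpts; in DDPG-style code it is conventionally a Polyak soft update of each target network toward its online network. A minimal sketch of that convention (an assumption about this repo, not its verbatim code):

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

# hypothetical usage inside MADDPG.update_targets():
#   for agent in self.maddpg_agent:
#       soft_update(agent.target_actor, agent.actor, self.tau)
#       soft_update(agent.target_critic, agent.critic, self.tau)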
Code Example #3
def main():
    seeding()
    # number of parallel environments
    parallel_envs = 4
    # number of training episodes; increase this (say, to 30000) to experiment
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # save the policy and a gif every `save_interval` episodes
    save_interval = 1000
    t = 0  # global environment-step counter

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # (optionally wrap the range below in keep_awake to keep the workspace from disconnecting)
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, since the
            # input to step requires the first index to be the parallel environments
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            # save once per interval, after collecting all agents' parameters
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
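The ReplayBuffer used above is not shown; its interface (push, sample, len) suggests a fixed-size FIFO store. A sketch consistent with that usage (an assumption; the repo's version may unpack parallel-env transitions before appending):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, size):
        self.memory = deque(maxlen=size)  # oldest transitions are evicted first

    def push(self, transition):
        # transition = (obs, obs_full, actions, rewards, next_obs, next_obs_full, dones)
        self.memory.append(transition)

    def sample(self, batchsize):
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)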
Code Example #4
    def update(self, samples, agent_number, logger):
        """update the critics and actors of all the agents"""

        # `samples`: a list of length batchsize; each item is a 5-tuple
        # `states` & `next_states`: lists of batchsize items, each Array(2,24)
        # `actions`: a list of batchsize items, each Array(2,2)
        # `rewards` & `dones`: lists of batchsize items, each List(2,)
        states, actions, rewards, next_states, dones = zip(*samples)

        # -------------------------- preprocessing -------------------------- #
        # `states` & `next_states`: a list of size 2, Tensor(batchsize,24)
        states = transpose_to_tensor(states)
        next_states = transpose_to_tensor(next_states)

        # `states_full` & `next_states_full`: Tensor(batchsize,48)
        states_full = torch.cat(states, dim=1)
        next_states_full = torch.cat(next_states, dim=1)

        # `actions`: Tensor(batchsize,4)
        actions = transpose_to_tensor(actions)
        actions = torch.cat(actions, dim=1)

        # `dones` & `rewards`: a list of 2, Tensor(batchsize,)
        dones = transpose_to_tensor(transpose_list(zip(*dones)))
        rewards = transpose_to_tensor(transpose_list(zip(*rewards)))

        # -------------------------- update critic -------------------------- #
        agent = self.maddpg_agent[agent_number]
        agent.critic_optimizer.zero_grad()

        # critic loss = batch mean of (y - Q(s,a) from target network)^2
        # y = current reward + discount * Q(st+1,at+1) from target network
        target_actions = self.target_act(next_states)
        target_actions = torch.cat(target_actions, dim=-1)
        target_critic_input = torch.cat((next_states_full, target_actions),
                                        dim=1).to(device)

        with torch.no_grad():
            q_next = agent.target_critic(target_critic_input)

        y = rewards[agent_number].view(-1, 1) + \
            self.discount_factor * q_next * \
            (1 - dones[agent_number].view(-1, 1))
        critic_input = torch.cat((states_full, actions), dim=1).to(device)
        q = agent.critic(critic_input)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(q, y.detach())
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 0.5)
        agent.critic_optimizer.step()

        # -------------------------- update actor -------------------------- #
        agent.actor_optimizer.zero_grad()
        # make input to agent
        # detach the other agents to save computation
        # saves some time for computing derivative
        q_input = [ self.maddpg_agent[i].actor(state) if i == agent_number \
                   else self.maddpg_agent[i].actor(state).detach()
                   for i, state in enumerate(states) ]

        q_input = torch.cat(q_input, dim=1)
        # combine all the actions and observations for input to critic
        q_input2 = torch.cat((states_full, q_input), dim=1)

        # get the policy gradient
        actor_loss = -agent.critic(q_input2).mean()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(agent.actor.parameters(),0.5)
        agent.actor_optimizer.step()

        # for TensorBoard
        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        logger.add_scalars('agent%i/losses' % agent_number, {
            'critic_loss': cl,
            'actor_loss': al
        }, self.iter)
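transpose_list and transpose_to_tensor, used throughout these examples, convert batch-major data into agent-major groups. A sketch consistent with how they are called here (assumed, since the utilities module is not included):

import torch

def transpose_list(mylist):
    # [[a1, a2], [b1, b2], ...] -> [[a1, b1, ...], [a2, b2, ...]]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # same transpose, but each agent-major group becomes a float tensor
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))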
Code Example #5
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space: [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2+3*2+3*2=14
    (2) location coordinates of the target landmark
    (3*2) the three agents' positions w.r.t. the target landmark
    (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # keep_awake (from the Udacity workspace_utils helper) keeps the workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`,
        each item in the list is another list of size two,
        first is env.observation_space: [Box(14,), Box(14,), Box(14,)],
        second is [Box(14,)], which is added to facilitate training
        https://goo.gl/Xtr6sF
        `obs` and `obs_full` are both lists of size `parallel_envs`,
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)]
        `obs_full` has the compounded observation space [Box(14,)]
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()
            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update the networks once after every `episode_per_update` (=2*parallel_envs) episodes;
        # `episode` advances by `parallel_envs` per iteration, so the modulo test fires once per interval
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for all agents, `a_i` refers to agent no.
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target network towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            # save once per interval, after collecting all agents' parameters
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
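The noise amplitude above scales Ornstein-Uhlenbeck exploration noise applied inside the actors. The OU process itself is not part of this excerpt; a common implementation (an assumption about the repo's noise module) is:

import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state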
Code Example #6
# extra imports for this Unity Tennis example (in addition to the shared ones above;
# unityagents is the Udacity ML-Agents wrapper package)
from collections import deque

import matplotlib.pyplot as plt
from unityagents import UnityEnvironment


def main():
    seeding()
    # number of training episodes.
    number_of_episodes = 5000
    episode_length = 1000
    batchsize = 2000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    # env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe')
    env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe',
                           no_graphics=True)

    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=True)[brain_name]

    num_agents = len(env_info.agents)

    replay_episodes = 1000

    buffer = ReplayBuffer(int(replay_episodes * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    # logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []

    # training loop
    scores_deque = deque(maxlen=100)
    scores = []

    for episode in range(0, number_of_episodes):

        reward_this_episode = np.zeros(num_agents)
        env_info = env.reset(True)[brain_name]
        state = env_info.vector_observations

        obs = [[state[0], state[1]]]
        obs_full = np.concatenate((state[0], state[1]))

        #for calculating rewards for this particular episode - addition of all time steps

        frames = []
        tmax = 0

        for episode_t in range(episode_length):

            t += 1

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # the parallel-env version transposed agent-major to env-major here:
            # actions_for_env = np.rollaxis(actions_array, 1)
            # with a single Unity env we instead flatten and clip the joint action
            actions_for_env = np.clip(actions_array.flatten(), -1, 1)

            # print(actions_for_env)

            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            env_info = env.step(actions_for_env)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            next_obs = [[next_state[0], next_state[1]]]
            next_obs_full = np.concatenate((next_state[0], next_state[1]))

            # print(obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones)

            # add data to buffer
            transition = ([obs], [obs_full], [actions_for_env], [rewards],
                          [next_obs], [next_obs_full], [dones])

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            if any(dones):
                break

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        # reward_this_episode holds each agent's total reward for this episode;
        # the Tennis episode score is the maximum over the two agents
        episode_reward = np.max(reward_this_episode)
        scores_deque.append(episode_reward)
        scores.append(episode_reward)

        print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.
              format(episode, np.mean(scores_deque), episode_reward),
              end="")

        if (episode > 0
                and episode % 100 == 0) or episode == number_of_episodes - 1:
            print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.
                  format(episode, np.mean(scores_deque), episode_reward))

        if np.mean(scores_deque) >= 0.5:
            print('\nSuccess!')
            break

    #saving model
    save_dict_list = []
    for i in range(num_agents):

        save_dict = {
            'actor_params':
            maddpg.maddpg_agent[i].actor.state_dict(),
            'actor_optim_params':
            maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
            'critic_params':
            maddpg.maddpg_agent[i].critic.state_dict(),
            'critic_optim_params':
            maddpg.maddpg_agent[i].critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)

    # save once, after collecting all agents' parameters
    torch.save(save_dict_list,
               os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

    env.close()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.savefig('tennis_score_history.png')

    return scores
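Every example calls seeding() before anything else. The helper is not shown; a sketch of what it presumably does (seed all RNGs in play so runs are reproducible):

import random
import numpy as np
import torch

def seeding(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)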
Code Example #7
def main():
    seeding(seed=SEED)
    # number of parallel environments
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5

    # initialize environment
    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs,
                                 seed=SEED,
                                 num_agents=num_agents,
                                 benchmark=BENCHMARK)

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])

    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_163018\model_dir\episode-59994.pt' #test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_211315\model_dir\episode-59994.pt' #test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032621_054252\model_dir\episode-36000.pt' #test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032821_102717\model_dir\episode-99000.pt' #test1 6 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032921_160324\model_dir\episode-99000.pt' #test2 6 agents pretrained
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\033021_203450\model_dir\episode-73002.pt'  # test2 6 agents pretrained
    # trained_checkpoint = "c4-a4-n01-a01-old-two-noskip/model_dir/episode-59999.pt"
    trained_checkpoint = "gat-huge1k/model_dir/episode-99000.pt"
    aux = torch.load(trained_checkpoint, map_location=torch.device('cpu'))

    for i in range(num_agents):
        # load the weights from file
        maddpg.maddpg_agent[i].actor.load_state_dict(aux[i]['actor_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(aux[i]['critic_params'])

    # run a handful of demo episodes (5 here) so env.close() below is reached
    for _episode in range(5):
        all_obs = env.reset()
        # flip the first two indices
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)
        scores = 0
        t = 0
        for _ in range(25):
            env.render('rgb_array')
            time.sleep(0.1)
            t += 1
            # select an action
            actions = maddpg.act(transpose_to_tensor(obs), noise=0.)
            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array, 1)
            # send all actions to the environment
            next_obs, rewards, dones, info = env.step(actions_for_env)
            # update the score (for each agent)
            scores += np.sum(rewards)
            print('\rCumulative reward at step %i = %.3f' % (t, scores))
            # for displaying learned policies

            # time.sleep(0.1)
            # env.render()

            # roll over states to next time step
            obs = next_obs
            # print("Score: {}".format(scores))
            if np.any(dones):
                print('done')
                print('Next:')
                break
    env.close()
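The loader above indexes aux[i]['actor_params'], which matches the checkpoint layout the training scripts write: a list with one state-dict bundle per agent. A quick sanity-check sketch (same path as in the source):

import torch

aux = torch.load('gat-huge1k/model_dir/episode-99000.pt',
                 map_location=torch.device('cpu'))
print(type(aux), len(aux))    # list, one entry per agent
print(sorted(aux[0].keys()))  # ['actor_optim_params', 'actor_params',
                              #  'critic_optim_params', 'critic_params']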
Code Example #8
def main():
    seeding(seed=SEED)
    # number of parallel environments
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes; adjust to experiment
    number_of_episodes = 60000
    episode_length = 35
    # save the policy (and optionally a gif) every `save_interval` episodes
    save_interval = 1000
    t = 0  # global environment-step counter
    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 0.5  # was 2, try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999
    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs
    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()

        # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)

        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        # frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)

            # exponential decay; note that episode_t resets every episode (max episode_length),
            # so this schedule keeps noise close to initial_noise throughout training
            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise*noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, since the
            # input to step requires the first index to be the parallel environments
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step
            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # ADD FOR WITHOUT PARALLEL ENV
            # next_obs, rewards, dones, info = env.step(actions_for_env)
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every episode_per_update
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])
            # agent0_reward.append(reward_this_episode[i,0])
            # agent1_reward.append(reward_this_episode[i,1])
            # agent2_reward.append(reward_this_episode[i,2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
                # agent0_reward = []
            # agent1_reward = []
            # agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            # save once per interval, after collecting all agents' parameters
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
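Example #8 replaces the multiplicative noise schedule of the earlier examples with an exponential one. A standalone sketch comparing the two, using the constants from the source:

import numpy as np

steps = np.arange(0, 100001, 20000)

# multiplicative schedule from the earlier examples: noise *= noise_reduction
mult = 2.0 * 0.9999 ** steps

# exponential schedule from this example, floored at 0.001
initial_noise, decay = 0.1, 0.01
expo = np.maximum(initial_noise * decay ** (steps / 20000.0), 0.001)

for s, m, e in zip(steps, mult, expo):
    print('step %6d: multiplicative=%.4f  exponential=%.4f' % (s, m, e))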