    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(
            np.array(
                transpose_list([e.state for e in experiences
                                if e is not None]))).float().to(device)
        actions = torch.from_numpy(
            np.array(
                transpose_list([
                    e.action for e in experiences if e is not None
                ]))).float().to(device)
        rewards = torch.from_numpy(
            np.array(
                transpose_list([
                    e.reward for e in experiences if e is not None
                ]))).float().to(device)
        next_states = torch.from_numpy(
            np.array(
                transpose_list([
                    e.next_state for e in experiences if e is not None
                ]))).float().to(device)
        dones = torch.from_numpy(
            np.array(
                transpose_list([e.done for e in experiences if e is not None
                                ])).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)
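The `sample()` above assumes a buffer whose `memory` holds named experience tuples and whose batch size is fixed at construction. A minimal sketch consistent with that usage (the field names and `batch_size` come from the code above; the deque capacity, `add()` helper, and `device` line are assumptions) might be:

import random
from collections import deque, namedtuple

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Experience = namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # what sample() draws from
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        """Store one transition as a named tuple."""
        self.memory.append(Experience(state, action, reward, next_state, done))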
Example #2
    def sample(self, batchsize):
        """sample from the buffer"""
        samples = random.sample(self.memory, batchsize)

        print(len(transpose_list(samples)))

        # transpose list of list
        return transpose_list(samples)
Example #3
    def learn(self, experiences, agent_number):
        """update the critics and actors of all the agents """

        # need to transpose each element of the samples
        # to flip obs[parallel_agent][agent_number] to
        # obs[agent_number][parallel_agent]

        states, actions, rewards, next_states, dones = experiences

        agent = self.agents[agent_number]
        agent.critic_optimizer.zero_grad()

        # critic loss = batch mean of (y - Q(s,a))^2
        # y = reward of this timestep + discount * Q(st+1, at+1) from target network

        target_actions = self.target_act(next_states)
        target_actions = torch.cat(target_actions, dim=1)
        t = torch.tensor(transpose_list(next_states.cpu().data.numpy()))
        next_states_all = t.view(t.shape[0], -1).to('cpu')
        target_critic_input = torch.cat(
            (next_states_all, target_actions.to('cpu')), dim=1).to(device)

        with torch.no_grad():
            q_next = agent.target_critic(target_critic_input)

        y = rewards[agent_number].view(
            -1, 1) + GAMMA * q_next * (1 - dones[agent_number].view(-1, 1))
        actions_all = torch.cat(torch.unbind(actions), dim=1)
        t = torch.tensor(transpose_list(states.cpu().data.numpy()))
        states_all = t.view(t.shape[0], -1).to('cpu')
        critic_input = torch.cat((states_all, actions_all.to('cpu')),
                                 dim=1).to(device)
        q = agent.critic(critic_input)

        critic_loss = F.mse_loss(q, y.detach())
        critic_loss.backward(retain_graph=True)
        agent.critic_optimizer.step()

        # update actor network using policy gradient
        agent.actor_optimizer.zero_grad()

        # make input to agent
        # detach the other agents to save computation
        # saves some time for computing derivative
        q_input = [self.agents[i].actor(state) if i == agent_number \
                   else self.agents[i].actor(state).detach()
                   for i, state in enumerate(states)]
        q_input = torch.cat(q_input, dim=1)

        # combine all the actions and observations for input to critic
        # many of the obs are redundant, and obs[1] contains all useful information already
        q_input2 = torch.cat((states_all.to('cpu'), q_input.to('cpu')), dim=1)

        # get the policy gradient
        actor_loss = -agent.critic(q_input2).mean()
        actor_loss.backward(retain_graph=True)
        agent.actor_optimizer.step()
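For reference, the TD target assembled in the critic update above is y = reward + GAMMA * Q_target(s', a') * (1 - done), with the bootstrap term zeroed on terminal steps. A tiny self-contained sketch of just that computation (not from the original repo; the GAMMA value and shapes are assumed):

import torch

GAMMA = 0.95                                # assumed discount factor
batch = 4
rewards = torch.rand(batch, 1)              # rewards for this agent
dones = torch.randint(0, 2, (batch, 1)).float()
q_next = torch.rand(batch, 1)               # Q(s_{t+1}, a_{t+1}) from the target critic

y = rewards + GAMMA * q_next * (1 - dones)  # no bootstrap past episode end
print(y.shape)                              # torch.Size([4, 1])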
    def push(self, transition):
        """push into the buffer"""

        input_to_buffer = transpose_list(transition)

        for item in input_to_buffer:
            self.deque.append(item)
Example #5
    def push(self, transition, error):
        """push into the buffer"""
        input_to_buffer = transpose_list(transition)
        for i, item in enumerate(input_to_buffer):
            p = self._getPriority(error[i])
            self.tree.add(p, item)
Example #6
    def sample(self, n):
        """sample from the buffer"""
        experiences = []
        indexes = []
        segment = self.tree.total() / n
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            experiences.append(data)
            indexes.append(idx)

        # transpose list of list
        return transpose_list(experiences), indexes
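The prioritized examples above lean on a sum tree (`self.tree.add`, `self.tree.total`, `self.tree.get`) and a `self._getPriority` helper (commonly something like `(abs(error) + eps) ** alpha`), none of which are shown. A minimal sum-tree sketch consistent with those calls, offered only as an assumption about the missing class:

import numpy as np


class SumTree:
    """Binary tree whose leaves hold priorities and whose internal nodes hold sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # priorities + partial sums
        self.data = np.zeros(capacity, dtype=object)  # stored transitions
        self.write = 0                                # next leaf to overwrite

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left, right = 2 * idx + 1, 2 * idx + 2
        if left >= len(self.tree):                    # reached a leaf
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]                           # sum of all priorities

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write = (self.write + 1) % self.capacity

    def update(self, idx, p):
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]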
    def sample(self, batch_size):
        """sample from the buffer"""
        samples = random.sample(self.deque, batch_size)

        # transpose list of list
        return transpose_list(samples)
Example #8
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + '/log'
    model_dir = os.getcwd() + '/model_dir'

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))
Example #9
import torch


def transpose_list(mylist):
    return list(map(list, zip(*mylist)))


def transpose_to_tensorAsitis(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, input_list))
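

# NOTE (not part of the original snippet): several examples on this page also
# call `transpose_to_tensor`, which is never defined here. A minimal sketch
# consistent with that usage (a list of per-env items in, a list of per-agent
# float tensors out) might be:
def transpose_to_tensor(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))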


env_info = env.step(actions)[brain_name]  # send all actions to the environment
next_states = env_info.vector_observations
print(next_states)
s = transpose_list(next_states)
print(s)
p = np.concatenate((next_states[0], next_states[1]))
print(np.shape(p)[0])
s = transpose_to_tensorAsitis(next_states)
print("tensor", s)


# main function that sets up environments
# perform training loop

#import envs
from buffer import ReplayBuffer
from maddpg import MADDPG
import torch
Example #10
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5

    # initialize environment
    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs,
                                 seed=SEED,
                                 num_agents=num_agents,
                                 benchmark=BENCHMARK)

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])

    # trained_checkpoint = r'E:\Ivan\UPC\UDACI TY\DRL_Nanodegree\Part4\MADDPG\032521_163018\model_dir\episode-59994.pt' #test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_211315\model_dir\episode-59994.pt' #test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032621_054252\model_dir\episode-36000.pt' #test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032821_102717\model_dir\episode-99000.pt' #test1 6 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032921_160324\model_dir\episode-99000.pt' #test2 6 agents pretrined
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\033021_203450\model_dir\episode-73002.pt'  # test2 6 agents pretrined
    # trained_checkpoint = "c4-a4-n01-a01-old-two-noskip/model_dir/episode-59999.pt"
    trained_checkpoint = "gat-huge1k/model_dir/episode-99000.pt"
    aux = torch.load(trained_checkpoint, map_location=torch.device('cpu'))

    for i in range(num_agents):
        # load the weights from file
        maddpg.maddpg_agent[i].actor.load_state_dict(aux[i]['actor_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(aux[i]['critic_params'])

    # Reset the environment
    all_obs = env.reset()
    # flip the first two indices
    obs_roll = np.rollaxis(all_obs, 1)
    obs = transpose_list(obs_roll)

    scores = 0
    t = 0
    while True:
        all_obs = env.reset()
        # flip the first two indices
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)
        scores = 0
        t = 0
        for _ in range(25):
            env.render('rgb_array')
            time.sleep(0.1)
            t += 1
            # select an action
            actions = maddpg.act(transpose_to_tensor(obs), noise=0.)
            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array, 1)
            # send all actions to the environment
            next_obs, rewards, dones, info = env.step(actions_for_env)
            # update the score (for each agent)
            scores += np.sum(rewards)
            print('\r\n Rewards at step %i = %.3f' % (t, scores))
            # for displaying learned policies

            # time.sleep(0.1)
            # env.render()

            # roll over states to next time step
            obs = next_obs
            # print("Score: {}".format(scores))
            if np.any(dones):
                print('done')
                print('Next:')
    env.close()
    def update(self, samples, agent_number, logger):
        """update the critics and actors of all the agents"""

        # `samples`: a list of batchsize, List(5,)
        # `states` & next_states: a list of batchsize, Array(2,24)
        # `actions`: a list of batchsize, Array(2,2)
        # `rewards` & `dones`: a list of batch size, List(2,)
        states, actions, rewards, next_states, dones = zip(*samples)

        # -------------------------- preprocessing -------------------------- #
        # `states` & `next_states`: a list of size 2, Tensor(batchsize,24)
        states = transpose_to_tensor(states)
        next_states = transpose_to_tensor(next_states)

        # `states_full` & `next_states_full`: Tensor(batchsize,48)
        states_full = torch.cat(states, dim=1)
        next_states_full = torch.cat(next_states, dim=1)

        # `actions`: Tensor(batchsize,4)
        actions = transpose_to_tensor(actions)
        actions = torch.cat(actions, dim=1)

        # `dones` & `rewards`: a list of 2, Tensor(batchsize,)
        dones = transpose_to_tensor(transpose_list(zip(*dones)))
        rewards = transpose_to_tensor(transpose_list(zip(*rewards)))

        # -------------------------- update critic -------------------------- #
        agent = self.maddpg_agent[agent_number]
        agent.critic_optimizer.zero_grad()

        # critic loss = batch mean of (y - Q(s,a) from target network)^2
        # y = current reward + discount * Q(st+1,at+1) from target network
        target_actions = self.target_act(next_states)
        target_actions = torch.cat(target_actions, dim=-1)
        target_critic_input = torch.cat((next_states_full, target_actions),
                                        dim=1).to(device)

        with torch.no_grad():
            q_next = agent.target_critic(target_critic_input)

        y = rewards[agent_number].view(-1, 1) + \
            self.discount_factor * q_next * \
            (1 - dones[agent_number].view(-1, 1))
        critic_input = torch.cat((states_full, actions), dim=1).to(device)
        q = agent.critic(critic_input)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(q, y.detach())
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 0.5)
        agent.critic_optimizer.step()

        # -------------------------- update actor -------------------------- #
        agent.actor_optimizer.zero_grad()
        # make input to agent
        # detach the other agents to save computation
        # saves some time for computing derivative
        q_input = [ self.maddpg_agent[i].actor(state) if i == agent_number \
                   else self.maddpg_agent[i].actor(state).detach()
                   for i, state in enumerate(states) ]

        q_input = torch.cat(q_input, dim=1)
        # combine all the actions and observations for input to critic
        q_input2 = torch.cat((states_full, q_input), dim=1)

        # get the policy gradient
        actor_loss = -agent.critic(q_input2).mean()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(agent.actor.parameters(),0.5)
        agent.actor_optimizer.step()

        # for TensorBoard
        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        logger.add_scalars('agent%i/losses' % agent_number, {
            'critic loss': cl,
            'actor_loss': al
        }, self.iter)
    def update(self, samples, agent_number, logger):
        """update the critics and actors of all the agents """

        # --- Experiences ---
        states = torch.from_numpy(
            np.stack(
                transpose_list([e.state for e in samples
                                if e is not None]))).float().to(self.device)
        actions = torch.from_numpy(
            np.stack(
                transpose_list([e.action for e in samples
                                if e is not None]))).float().to(self.device)
        # rewards = torch.from_numpy(np.vstack([max(e.reward) for e in samples if e is not None])).float().to(self.device).t()[0]  # Use Max
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in samples
                       if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(
            np.stack(
                transpose_list([
                    e.next_state for e in samples if e is not None
                ]))).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in samples if e is not None
                       ]).astype(np.uint8)).float().to(self.device)

        # --- Agent --------------------#
        agent = self.maddpg_agent[agent_number]

        # ------- Update Critic ------------------------------------------#
        agent.critic_optimizer.zero_grad()

        states_flat = tensor_flatten(states,
                                     self.state_size * self.num_agents)[0]
        actions_flat = tensor_flatten(actions,
                                      self.action_size * self.num_agents)[0]

        next_states_flat = tensor_flatten(next_states,
                                          self.state_size * self.num_agents)[0]
        target_next_actions_flat = torch.cat(self.target_actor(next_states),
                                             dim=1)

        with torch.no_grad():
            target_next_q = agent.target_critic(
                next_states_flat, target_next_actions_flat).t()[0]

        # target_q = rewards + self.discount_factor * target_next_q * (1 - dones[:, agent_number])  # Use MAX
        target_q = rewards[:, agent_number] + \
            self.discount_factor * target_next_q * (1 - dones[:, agent_number])
        local_q = agent.critic(states_flat, actions_flat).t()[0]

        critic_loss = F.mse_loss(local_q, target_q)
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(), 1)  #
        agent.critic_optimizer.step()

        # ------- Update Actor ------------------------------------------#
        agent.actor_optimizer.zero_grad()

        local_actions_flat = torch.cat(self.actor(states), dim=1)

        actor_loss = -agent.critic(states_flat, local_actions_flat).mean()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1)  #
        agent.actor_optimizer.step()

        # ---- Save Loss -------------------------------#
        aloss = actor_loss.cpu().detach().item()
        closs = critic_loss.cpu().detach().item()
        logger.add_scalars('agent%i/losses' % agent_number, {
            'critic loss': closs,
            'actor_loss': aloss
        }, self.update_count)
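`tensor_flatten` is used above but not defined in this example. A plausible minimal version, assuming it collapses the per-agent dimension of a (batch, num_agents, size) tensor into (batch, num_agents * size) and returns it in a tuple, is:

import torch


def tensor_flatten(x, flat_size):
    """Flatten the per-agent dimension so the critic sees all agents at once."""
    return (x.contiguous().view(-1, flat_size),)


obs = torch.rand(8, 2, 24)                   # batch of 8, 2 agents, 24-dim states
print(tensor_flatten(obs, 2 * 24)[0].shape)  # torch.Size([8, 48])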
Example #13
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space: [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2+3*2+3*2=14
    (2) location coordinates of the target landmark
    (3*2) the three agents' positions w.r.t. the target landmark
    (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`,
        each item in the list is another list of size two,
        first is env.observation_space: [Box(14,), Box(14,), Box(14,)],
        second is [Box(14,)], which is added to facilitate training
        https://goo.gl/Xtr6sF
        `obs` and `obs_full` are both lists of size `parallel_envs`,
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)]
        `obs_full` has the compounded observation space [Box(14,)]
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()
            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update the target network `parallel_envs`=4 times
        # after every `episode_per_update`=2*4
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for all agents, `a_i` refers to agent no.
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target network towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
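Because `episode` advances in steps of `parallel_envs`, the gate `episode % episode_per_update < parallel_envs` used in the loop above is true exactly once per `episode_per_update` window. A quick standalone check with the values from this example (parallel_envs=4, episode_per_update=8):

parallel_envs, episode_per_update = 4, 8
updates = [e for e in range(0, 40, parallel_envs)
           if e % episode_per_update < parallel_envs]
print(updates)  # [0, 8, 16, 24, 32] -> one update trigger per window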
Example #14
    def sample(self, batchsize):
        """sample from the buffer"""
        samples = random.sample(self.deque, batchsize)
        #print("\n samples before tran in buffer size= {} * {} * {} ".format(len(samples),len(samples[0]),len(samples[0][0])))
        # transpose list of list
        return transpose_list(samples)
Example #15
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target network towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #16
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 5000
    # global step counter (advances by parallel_envs every environment step)
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # this may be a list of all environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models, each element in the list refers to an agent in the simulation
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains the agent actor and critic models, e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # there is one reward per agent per parallel environment (parallel_envs x 3 here)
        reward_this_episode = np.zeros((parallel_envs, 3))
        # obs is the observation state space of all the three agents in the 4 parallel env.
        # for the Physical Deception environment with three agents it is of dimension 4x3x14.
        # obs_full is world state irrespective of the agents and its dimension is 4x14.
        # all_observation = array(number of environments 4, 2 elements)
        # element 0: a list of 3 arrays, one per agent, each holding that agent's state of size 14
        # element 1: the global state from the perspective of the target/green agent for that environment; 14 elements
        all_obs = env.reset()
        # obs: a list with 1 element per environment; each element is a list of 3 arrays,
        # each array being the state of one agent in that environment.
        # obs_full: the god's-eye view of each environment: a list with 1 element per environment,
        # each element an array of 14 values giving the global state of that environment
        obs, obs_full = transpose_list(all_obs)

        #for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward in a multiple of environment
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            # transpose_to_tensor(obs) changes the data to each agent's point of view
            # since we have 4 environments, there are 4 copies of agent 1, 4 of agent 2, and 4 of agent 3
            # each agent has a state in each environment, so agent 1's states across the 4 environments form a 4x14 tensor
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # element 1, for example, is a 4x14 array of that agent's observations across the 4 environments
            # maddpg.act loops over obs and passes each element to the corresponding agent's actor model
            # to generate an action from each agent's actor
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction
            # there are 4 actions per agent and 3 agents, 12 in total; each action has 2 elements: force in the x and y directions
            # actions_array is a tensor of shape (3 agent, 4 env, 2 action)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            # the shape of actions_for_env is (4 env, 3 agent, 2 action)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # obs is the observation state space of all the three agents in the 4 parallel env.
            # for the Physical Deception environment with three agents it is of dimension 4x3x14.
            # obs_full is world state irrespective of the agents and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample contains all 3 agents' data, and we know which
                # rewards and actions belong to which agent
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done
                # each element, say samples[0], is a list of 3 elements, one per agent
                # each agent's element holds the corresponding value; for obs, for example, it is a
                # vector with 14 values
                # so asking for 2 samples returns 2 samples, each containing all 3 agents' states and rewards
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target network towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #17
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 60000
    episode_length = 35
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0
    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 0.5  # was 2, try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999
    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs
    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)

        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()  #

        # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)

        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        # frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)

            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise*noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step
            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # ADD FOR WITHOUT PARALLEL ENV
            # next_obs, rewards, dones, info = env.step(actions_for_env)
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every episode_per_update
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                # soft update the target network towards the actual networks
                maddpg.update_targets()

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])
            # agent0_reward.append(reward_this_episode[i,0])
            # agent1_reward.append(reward_this_episode[i,1])
            # agent2_reward.append(reward_this_episode[i,2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
                # agent0_reward = []
            # agent1_reward = []
            # agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(5):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
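`maddpg.update_targets()` is called throughout these examples but never shown. A common DDPG-style soft update, sketched here as an assumption with a mixing factor `tau`, moves each target network a small step towards its local network:

def soft_update(target, source, tau):
    """Blend target network parameters towards the source network parameters."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)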