def maddpg(n_episodes=5000):
    # parameters
    noise = 2
    batch_size = 256
    update_every = 1

    agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=batch_size)
    buff = ReplayBuffer(10000)
    scores_window = deque(maxlen=100)  # rolling window of the last 100 episode scores
    scores = []                        # full score history

    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        state = torch.from_numpy(np.array(state)).float().unsqueeze(0)
        score = np.zeros(num_agents)
        t = 0
        while True:
            actions = agent.act(state, noise)
            noise *= 0.9999
            actions_array = torch.stack(actions).detach().numpy()
            env_info = env.step(actions_array)[brain_name]
            next_state = env_info.vector_observations
            next_state = torch.from_numpy(np.array(next_state)).float().unsqueeze(0)
            reward = np.array(env_info.rewards).reshape(1, -1)
            dones = np.array(env_info.local_done).reshape(1, -1)
            actions_array = actions_array.reshape(1, -1)
            buff.push((state, actions_array, reward, next_state, dones))

            if len(buff) > batch_size and t % update_every == 0:
                for i in range(2):
                    samples = buff.sample(batch_size)
                    agent.update(samples, i, noise)
                agent.update_targets()

            t += 1
            score += reward[0]
            state = next_state
            if np.any(dones):
                break

        scores_window.append(np.max(score))
        scores.append(np.max(score))
        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_window)), end="")

        if i_episode % 100 == 0:  # checkpoint every 100 episodes
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/checkpoint_actor{}.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/checkpoint_critic{}.pth'.format(i))

        if np.mean(scores_window) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(
                i_episode - 100, np.mean(scores_window)))
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/actor{}_finished.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/critic{}_finished.pth'.format(i))
            break
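# The update_targets() call above performs a Polyak/soft update with tau=0.02.
# A minimal sketch of that operation (an assumption about the helper's body,
# not this repo's actual code): target <- tau*local + (1-tau)*target.
import torch

def soft_update(target_net: torch.nn.Module, local_net: torch.nn.Module, tau: float) -> None:
    """Blend the local network's weights into the target network."""
    for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)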
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how often (in episodes) to save the policy and a gif
    save_interval = 5000
    t = 0  # global step counter

    # amplitude of OU noise; this slowly decays toward 0
    noise = 2
    noise_reduction = 0.9999
    # how many episodes between updates
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # a vectorized wrapper around all parallel environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic.
    # this creates a list of models, one element per agent in the simulation:
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains that agent's actor and critic models,
    # e.g. agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting:
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by the number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)

        # one reward row per parallel environment, one column per agent
        reward_this_episode = np.zeros((parallel_envs, 3))

        # obs is the observation space of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it has dimension 4x3x14.
        # obs_full is the world state irrespective of the agents; its dimension is 4x14.
        # all_obs = array(4 environments, 2 elements)
        #   element 0: a list of 3 arrays, one state per agent, each of size 14
        #   element 1: the global state from the perspective of the target/green
        #              agent in that environment; contains 14 elements
        all_obs = env.reset()
        # obs: a list with 1 element per environment; each element is a list of
        #      3 arrays, one per agent, giving that agent's state.
        # obs_full: the god's-eye view of each environment; a list with 1 element
        #      per environment, each an array of 14 values giving the global
        #      state of that environment.
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - sum over all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training;
            # t jumps forward in multiples of the environment count
            t += parallel_envs

            # explore = only explore for a certain number of episodes.
            # the action input needs to be transposed:
            # transpose_to_tensor(obs) regroups the data to each agent's point of view.
            # since we have 4 environments, there are 4 copies each of agents 1, 2 and 3;
            # each agent has a state in each environment, so across 4 environments
            # agent 1's states form a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # each element is a 4x14 array of that agent's observations across environments.
            # maddpg.act loops over this list, passing each element to that agent's
            # actor model to generate an action.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # there are 4 actions per agent and 3 agents, 12 total; each action has
            # 2 elements: force in the x and y directions.
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, because the
            # input to step requires the first index to be the parallel environment.
            # actions_for_env has shape (4 envs, 3 agents, 2 actions)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame.
            # next_obs has dimension 4x3x14 and next_obs_full 4x14, as above.
            # to gain more understanding, see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample contains all
                # 3 agents' data, and we know which rewards and actions belong
                # to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward,
                # next_obs, next_obs_full, done. each element, say samples[0],
                # is a list of 3 elements, one per agent, holding that agent's
                # value (e.g. for obs, a vector of 14 values). so asking for 2
                # samples returns 2 transitions, each containing all 3 agents'
                # states, rewards, etc.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
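# The loop above leans on transpose_list/transpose_to_tensor to regroup data
# from (env, agent) nesting to (agent, env). A sketch of those helpers,
# assuming the conventional zip(*...) implementation:
import torch

def transpose_list(mylist):
    # [[a1, b1], [a2, b2]] -> [[a1, a2], [b1, b2]]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # same regrouping, but each per-agent batch becomes a float tensor
    return [torch.tensor(x, dtype=torch.float) for x in zip(*input_list)]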
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how often (in episodes) to save the policy and a gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise; this slowly decays toward 0
    noise = 2
    noise_reduction = 0.9999
    # how many episodes between updates
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - sum over all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, since the
            # input to step requires the first index to be the parallel environment
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
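# Both variants above call buffer.push(transition) and buffer.sample(batchsize).
# A minimal uniform replay buffer consistent with that interface (the exact
# class in the repo may differ):
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, batchsize):
        samples = random.sample(self.memory, batchsize)
        # regroup a list of transitions into per-field batches
        return list(map(list, zip(*samples)))

    def __len__(self):
        return len(self.memory)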
            if any(dones):
                break

        if episode_i > HARD_NOISE_STEPS:
            hard_noise_reigime = False

        # POTENTIALLY START TAKING SAMPLES TO TRAIN FROM EXPERIENCE BUFFER
        if len(buffer) > MIN_BUFFER_SIZE:
            update_flag = "u"
            for _ in range(N_BATCHES_PER_UPDATE):
                for agent_i in range(N_AGENTS):
                    # samples = buffer.sample(3)
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, agent_i)
                if UPDATE_TARGET_AFTER_EACH_BATCH:
                    maddpg.update_targets()
            if not UPDATE_TARGET_AFTER_EACH_BATCH:
                maddpg.update_targets()
        else:
            update_flag = " "

        # UPDATE EPISODE AND ROLLING MEAN SCORES
        agg_reward_this_episode = np.max(rewards_this_episode)
        rewards_deque.append(agg_reward_this_episode)
        rolling_mean_reward = np.mean(rewards_deque)
        history.append(agg_reward_this_episode)
        history_rolling_mean.append(rolling_mean_reward)

        # MONITOR PROGRESS IN TENSORBOARD
        if logger is not None:
            # NOTE: the snippet breaks off here; these calls are an assumed
            # reconstruction and the tag names are placeholders
            logger.add_scalar('score/episode_reward', agg_reward_this_episode, episode_i)
            logger.add_scalar('score/rolling_mean_reward', rolling_mean_reward, episode_i)
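# The UPDATE_TARGET_AFTER_EACH_BATCH flag above chooses between soft-updating
# the targets once per sampled batch or once per whole update round. A compact
# restatement of the two paths (names reused from the fragment; illustrative only):
def run_updates(buffer, maddpg, update_after_each_batch):
    for _ in range(N_BATCHES_PER_UPDATE):
        for agent_i in range(N_AGENTS):
            maddpg.update(buffer.sample(BATCH_SIZE), agent_i)
        if update_after_each_batch:
            maddpg.update_targets()   # targets track the locals more closely
    if not update_after_each_batch:
        maddpg.update_targets()       # one soft update per round; cheaper, laggier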
def main():
    seeding()
    number_of_episodes = 20000
    episode_length = 1000
    batchsize = 256
    save_interval = 1000
    rewards_deque = deque(maxlen=100)
    rewards_all = []
    noise = 1.0
    noise_reduction = 1.0

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    """
    Info about the UnityEnvironment
    brain_name: 'TennisBrain'
    brain: ['brain_name', 'camera_resolutions',
            'num_stacked_vector_observations', 'number_visual_observations',
            'vector_action_descriptions', 'vector_action_space_size',
            'vector_action_space_type', 'vector_observation_space_size',
            'vector_observation_space_type']
    """
    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    # ------------------------------ training ------------------------------ #
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(1, number_of_episodes + 1):
        timer.update(episode)
        rewards_this_episode = np.zeros((2, ))

        """
        Info about the UnityEnvironment
        env_info: ['agents', 'local_done', 'max_reached', 'memories',
                   'previous_text_actions', 'previous_vector_actions', 'rewards',
                   'text_observations', 'vector_observations', 'visual_observations']
        actions: List(num_agents=2, action_size=2)
        states:  List((24,), (24,))
        rewards: List(2,)
        dones:   List(2,)
        """
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        for episode_t in range(episode_length):
            # reset the OUNoise for each agent
            for i in range(2):
                maddpg.maddpg_agent[i].noise.reset()

            actions = maddpg.act(states, noise=noise)
            env_info = env.step(actions)[brain_name]
            noise *= noise_reduction

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (states, actions, rewards, next_states, dones)
            buffer.push(transition)

            rewards_this_episode += rewards
            states = next_states

            if any(dones):
                break

        # update the local and target networks
        if len(buffer) > batchsize:
            # update the local networks
            for _ in range(5):
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
            # soft update the target networks
            maddpg.update_targets()

        rewards_all.append(rewards_this_episode)
        rewards_deque.append(np.max(rewards_this_episode))
        average_score = np.mean(rewards_deque)

        # --------------------- Logging for TensorBoard --------------------- #
        logger.add_scalars('rewards', {
            'agent0': rewards_this_episode[0],
            'agent1': rewards_this_episode[1]
        }, episode)
        logger.add_scalars('global', {
            'score': np.max(rewards_this_episode),
            'average_score': average_score
        }, episode)

        # -------------------------- Save the model ------------------------- #
        save_dict_list = []
        if episode % save_interval == 0 or average_score >= 0.5:
            for i in range(2):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

        if average_score >= 3.0:
            print('\nEnvironment solved in {} episodes!'.format(episode - 100))
            print('\nAverage Score: {:.2f}'.format(average_score))
            break

    env.close()
    logger.close()
    timer.finish()
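# The noise.reset() calls above refer to an Ornstein-Uhlenbeck process.
# A sketch with the usual theta/sigma defaults (the exact values and class
# body are assumptions, not this repo's code):
import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # return the internal state to the mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting step plus a Gaussian perturbation
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state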
def train(env, model_path='model_dir', number_of_episodes=50000, episode_length=500):
    noise = 1.0
    noise_reduction = 1.0
    batchsize = 256

    # clear out stale checkpoints from a previous run
    model_dir = os.getcwd() + "/" + model_path
    model_files = glob.glob(model_dir + "/*.pt")
    for file in model_files:
        os.remove(file)
    os.makedirs(model_dir, exist_ok=True)

    buffer = ReplayBuffer(int(1e5))
    rewards_deque = deque(maxlen=100)
    rewards_total = []

    # initialize policy and critic
    maddpg = MADDPG()

    for episode in range(1, number_of_episodes + 1):
        rewards_this_episode = np.asarray([0.0, 0.0])
        env_info = env.reset(train_mode=True)[brain_name]
        obs = env_info.vector_observations

        for episode_t in range(episode_length):
            actions = maddpg.act(obs, noise=noise)
            noise *= noise_reduction
            env_info = env.step(actions)[brain_name]
            next_obs = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (obs, actions, rewards, next_obs, dones)
            buffer.push(transition)

            rewards_this_episode += rewards
            obs = next_obs
            if any(dones):
                break

        # update every episode, once the buffer holds at least 4 batches
        if len(buffer) > batchsize * 4:
            for _ in range(4):
                for a_i in range(num_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        rewards_total.append(np.max(rewards_this_episode))
        rewards_deque.append(rewards_total[-1])
        average_score = np.mean(rewards_deque)
        print(episode, rewards_this_episode, rewards_total[-1], average_score)

        # saving model
        save_dict_list = []
        if episode % 1000 == 0:
            for i in range(2):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

    torch.save(maddpg.maddpg_agent[0].actor.state_dict(), 'actor0.pt')
    torch.save(maddpg.maddpg_agent[1].actor.state_dict(), 'actor1.pt')
    torch.save(maddpg.maddpg_agent[0].critic.state_dict(), 'critic0.pt')
    torch.save(maddpg.maddpg_agent[1].critic.state_dict(), 'critic1.pt')
    return rewards_total
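# Hypothetical usage: restoring the per-network files that train() writes out
# at the end (file names taken from the snippet above; the MADDPG constructor
# is assumed to match the one train() uses):
maddpg = MADDPG()
maddpg.maddpg_agent[0].actor.load_state_dict(torch.load('actor0.pt'))
maddpg.maddpg_agent[1].actor.load_state_dict(torch.load('actor1.pt'))
maddpg.maddpg_agent[0].critic.load_state_dict(torch.load('critic0.pt'))
maddpg.maddpg_agent[1].critic.load_state_dict(torch.load('critic1.pt'))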
def main():
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    seeding()
    # number of parallel agents
    # parallel_envs = num_agents

    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    update_actor_after = 100
    update_actor_every = 2
    episode_length = 100
    batchsize = 100
    # how often (in episodes) to save the policy and a gif
    save_interval = 1000
    t = 0

    LR_ACTOR = 1e-5
    LR_CRITIC = 3e-3

    # amplitude of OU noise; this slowly decays toward 0
    noise = 1.0
    noise_reduction = 0.999999

    # how many episodes between updates
    episode_per_update = 1
    no_of_updates_perTime = 1

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # torch.set_num_threads(parallel_envs)
    # env = envs.make_parallel_env(parallel_envs)

    # keep 10 episodes worth of replay
    buffer = ReplayBuffer(int(10 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC)
    # logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    # agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes):
        timer.update(episode)

        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)  # initialize the score (for each agent)
        reward_this_episode = np.zeros((1, num_agents))

        obs = states
        obs_full = np.concatenate((states[0], states[1]))

        # for calculating rewards for this particular episode - sum over all time steps

        # save info or not
        save_info = (episode % save_interval < 1 or episode == number_of_episodes - 1)
        tmax = 0

        # reset noise
        for i in range(num_agents):
            maddpg.maddpg_agent[i].noise.reset()

        for episode_t in range(episode_length):
            t += 1
            update_act = (episode > update_actor_after
                          or episode % update_actor_every == 0)

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensorAsitis(obs), noise=noise, batch=False)
            noise *= noise_reduction

            actions_array = torch.stack(actions).cpu().detach().numpy()

            # transpose the list of lists: flip the first two indices, since the
            # input to step requires the first index to be the parallel environment
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            env_info = env.step(actions_for_env)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards  # get reward (for each agent)
            dones = env_info.local_done  # see if episode finished
            scores += env_info.rewards

            rewards_for_env = np.hstack(rewards)

            obs = states
            obs_full = np.concatenate((states[0], states[1]))
            next_obs = next_states
            next_obs_full = np.concatenate((next_states[0], next_states[1]))

            # add data to buffer
            transition = (np.array([obs]), np.array([obs_full]),
                          np.array([actions_for_env]), np.array([rewards_for_env]),
                          np.array([next_obs]), np.array([next_obs_full]),
                          np.array([dones], dtype='float'))
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full
            states = next_states  # advance states so the next transition stores the current observation

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for _ in range(no_of_updates_perTime):
                    for a_i in range(num_agents):
                        samples = buffer.sample(batchsize)
                        # update the network weights
                        maddpg.update(samples, a_i, update_actor=update_act)
                    maddpg.update_targets()  # soft update the target networks towards the actual networks

            if np.any(dones):
                # the episode is done; break to the next episode
                break

        for i in range(num_agents):
            agent0_reward.append(reward_this_episode[0, 0])
            agent1_reward.append(reward_this_episode[0, 1])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
            agent0_reward = []
            agent1_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)
                print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    timer.finish()
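# The update_act gate above delays actor training: before episode 100 the
# actor updates only on even episodes, afterwards on every episode. A quick
# check of the boundary (constants copied from the snippet):
update_actor_after, update_actor_every = 100, 2
fires = [e for e in range(96, 104)
         if e > update_actor_after or e % update_actor_every == 0]
print(fires)  # [96, 98, 100, 101, 102, 103]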
def main():
    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)

    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0

    # Episodes
    number_of_episodes = 10000
    episode_length = 2000

    # Buffer
    buffer_size = int(1e6)
    batchsize = 512

    # Agent Update Frequency
    episode_per_update = 1

    # Reward Discount Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2

    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0

    # Window
    win_len = 100
    # Save Frequency
    save_interval = 200

    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)

    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")

    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)

    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # Number of Agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir, 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    for i_episode in range(number_of_episodes):
        timer.update(i_episode)

        # Reset Environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        # Reset Agent
        maddpg.reset()

        for episode_t in range(episode_length):
            # Explore with a decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]  # Environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if the episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states
            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))   # save most recent score
        avg_rewards = np.mean(scores_window)
        noise_factor = max(noise_floor, noise_factor * noise_reduction)  # Reduce Noise Factor

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft Update
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count, episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'
                .format(i_episode, avg_rewards, noise_factor),
                end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-Latest.pt'))
            pd.Series(scores_history).to_csv(os.path.join(model_dir, "scores.csv"))

            # plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            # plt.show()
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count, episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'
                .format(i_episode, avg_rewards, noise_factor),
                end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
def main():
    seeding()
    env = UnityEnvironment(file_name="Tennis.x86_64")
    env_name = 'Tennis'

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    # size of each action
    action_size = brain.vector_action_space_size
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[-1]

    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    episode_length = 10000
    batchsize = 128

    # amplitude of OU noise; this slowly decays toward 0
    noise = 1
    noise_reduction = 0.9999

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize memory buffer
    buffer = ReplayBuffer(int(500000), batchsize, 0)

    # initialize policy and critic
    maddpg = MADDPG(state_size, action_size, num_agents, seed=12345,
                    discount_factor=0.95, tau=0.02)

    # how often to update the MADDPG model
    episode_per_update = 2

    # training loop
    PRINT_EVERY = 5
    scores_deque = deque(maxlen=100)  # raw scores of the last 100 episodes
    scores = []                       # raw score history
    avg_last_100 = []                 # average score over the last 100 episodes
    threshold = 0.5

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(number_of_episodes):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations  # get the current state (for each agent)
        episode_reward_agent0 = 0
        episode_reward_agent1 = 0

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        for episode_t in range(episode_length):
            actions = maddpg.act(torch.tensor(state, dtype=torch.float), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()
            env_info = env.step(actions_array)[brain_name]

            next_state = env_info.vector_observations
            reward = env_info.rewards
            done = env_info.local_done

            episode_reward_agent0 += reward[0]
            episode_reward_agent1 += reward[1]

            # add data to buffer.
            # the two per-agent states can be concatenated either here or in the
            # update function in MADDPG. doing it here is easier, since the update
            # function also has the batch dimension to deal with, although the
            # replay buffer then holds two extra variables (full_state,
            # full_next_state) that duplicate state and next_state in
            # concatenated form.
            full_state = np.concatenate((state[0], state[1]))
            full_next_state = np.concatenate((next_state[0], next_state[1]))
            buffer.add(state, full_state, actions_array, reward,
                       next_state, full_next_state, done)

            state = next_state

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for i in range(num_agents):
                    samples = buffer.sample()
                    maddpg.update(samples, i)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

            if np.any(done):  # if any of the agents are done, break
                break

        episode_reward = max(episode_reward_agent0, episode_reward_agent1)
        scores.append(episode_reward)
        scores_deque.append(episode_reward)
        avg_last_100.append(np.mean(scores_deque))

        print('\rEpisode {}\tAverage Score: {:.4f}\tScore: {:.4f}'.format(
            episode, avg_last_100[-1], episode_reward), end="")

        if episode % PRINT_EVERY == 0:
            print('\rEpisode {}\tAverage Score: {:.4f}'.format(episode, avg_last_100[-1]))

        # saving the successful model;
        # training ends when the threshold value is reached.
        if avg_last_100[-1] >= threshold:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # plot graphs
            raw_score_plotter(scores)
            plotter(env_name, len(scores), avg_last_100, threshold)
            break
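# raw_score_plotter and plotter are called above but not defined in this
# snippet. A hypothetical matplotlib implementation of the first, for
# reference (the real helpers may differ):
import matplotlib.pyplot as plt

def raw_score_plotter(scores):
    plt.figure()
    plt.plot(range(1, len(scores) + 1), scores)
    plt.xlabel('Episode #')
    plt.ylabel('Raw Score')
    plt.savefig('raw_scores.png')
    plt.close()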
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decays to 0
    noise = 2
    noise_reduction = 0.9999
    # how many episodes between updates
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)

    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space:      [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2 + 3*2 + 3*2 = 14:
      (2)   location coordinates of the target landmark
      (3*2) the three agents' positions w.r.t. the target landmark
      (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`; each item is another list
        of size two: first is env.observation_space
        [Box(14,), Box(14,), Box(14,)], second is [Box(14,)], which is added to
        facilitate training https://goo.gl/Xtr6sF
        `obs` and `obs_full` are both lists of size `parallel_envs`:
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)],
        `obs_full` has the compounded observation space [Box(14,)].
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - sum over all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()

            # `actions_for_env` has shape (parallel_envs, 3, 2), because the
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every `episode_per_update`=2*4 episodes
        # (episode advances by `parallel_envs`=4 per iteration)
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for all agents; `a_i` is the agent index
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # Save the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
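# Quick check of the update gating above: with parallel_envs=4 and
# episode_per_update=8, `episode % episode_per_update < parallel_envs` fires
# on episodes 0, 8, 16, ..., i.e. on every other iteration of the episode
# loop (which advances by 4):
parallel_envs, episode_per_update = 4, 8
updates = [e for e in range(0, 40, parallel_envs)
           if e % episode_per_update < parallel_envs]
print(updates)  # [0, 8, 16, 24, 32]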
def main():
    seeding()
    # number of training episodes.
    number_of_episodes = 5000
    episode_length = 1000
    batchsize = 2000
    t = 0

    # amplitude of OU noise; this slowly decays toward 0
    noise = 2
    noise_reduction = 0.9999
    # how many episodes between updates
    episode_per_update = 2

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe')
    env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe', no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)

    replay_episodes = 1000
    buffer = ReplayBuffer(int(replay_episodes * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    # logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []

    # training loop
    scores_deque = deque(maxlen=100)
    scores = []

    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros(num_agents)
        env_info = env.reset(True)[brain_name]
        state = env_info.vector_observations
        obs = [[state[0], state[1]]]
        obs_full = np.concatenate((state[0], state[1]))

        # for calculating rewards for this particular episode - sum over all time steps
        frames = []
        tmax = 0

        for episode_t in range(episode_length):
            t += 1

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # the Unity env takes a flat, clipped action vector rather than the
            # (env, agent, action) layout used by the parallel-env variants
            # actions_for_env = np.rollaxis(actions_array, 1)
            actions_for_env = np.clip(actions_array.flatten(), -1, 1)
            # print(actions_for_env)

            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)
            env_info = env.step(actions_for_env)[brain_name]
            next_state = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            next_obs = [[next_state[0], next_state[1]]]
            next_obs_full = np.concatenate((next_state[0], next_state[1]))
            # print(obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones)

            # add data to buffer
            transition = ([obs], [obs_full], [actions_for_env], [rewards],
                          [next_obs], [next_obs_full], [dones])
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            if any(dones):
                break

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        avg_rewards = np.mean(reward_this_episode, axis=0)
        episode_reward = np.max(avg_rewards)
        scores_deque.append(episode_reward)
        scores.append(episode_reward)
        print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.format(
            episode, np.mean(scores_deque), episode_reward), end="")

        if (episode > 0 and episode % 100 == 0) or episode == number_of_episodes - 1:
            print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'.format(
                episode, np.mean(scores_deque), episode_reward))

        if np.mean(scores_deque) >= 0.5:
            print('\nSuccess!')
            break

    # saving model
    save_dict_list = []
    for i in range(num_agents):
        save_dict = {
            'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
            'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
            'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
            'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
        }
        save_dict_list.append(save_dict)
    torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

    env.close()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.plot(np.arange(1, len(scores) + 1), scores)
    plt.savefig('tennis_score_history.png')
    return scores
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 60000
    episode_length = 35
    # how often (in episodes) to save the policy and a gif
    save_interval = 1000
    t = 0

    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise; this slowly decays toward 0
    noise = 0.5  # was 2; try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999

    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs

    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents, discount_factor=GAMMA, tau=TAU,
                    lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()
        # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        #     frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            # exponential decay schedule with a floor, indexed by the inner step counter
            noise = max(initial_noise * decay ** (episode_t / 20000), 0.001)
            # noise = max(noise * noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, since the
            # input to step requires the first index to be the parallel environment
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step: step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)
            # ADD FOR WITHOUT PARALLEL ENV
            next_obs, rewards, dones, info = env.step(actions_for_env)
            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every UPDATE_EVERY episodes
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict(),
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
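# The loop above switched from multiplicative decay (commented out) to an
# exponential schedule with a floor, indexed by the inner step counter
# episode_t. The two schedules side by side, with constants copied from the
# snippet (this comparison is illustrative only):
import numpy as np

steps = np.arange(0, 60000, 1000)
multiplicative = 0.5 * 0.9999 ** steps                          # noise *= noise_reduction
exponential = np.maximum(0.1 * 0.01 ** (steps / 20000), 0.001)  # decay schedule with floor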