def run_agents(n_episodes=5):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]
    maddpg = MADDPG(state_size=state_size, action_size=action_size, num_agents=num_agents)
    for i, agent in enumerate(maddpg.agents):
        agent.actor_local.load_state_dict(torch.load(f'models/checkpoint_actor_local_{i}.pth'))
        agent.critic_local.load_state_dict(torch.load(f'models/checkpoint_critic_local_{i}.pth'))
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            actions = maddpg.act(states, add_noise=True)
            env_info = env.step(actions)[brain_name]
            states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            if any(dones):
                break
        print(f"Episode {i_episode}. Rewards of two agents: {scores}")
def test(env, model_file='best.pt', num_ep=100):
    rewards_total = []
    dict_list = torch.load(model_file)  # combined checkpoint; the per-network files below are what actually get loaded
    maddpg = MADDPG()
    maddpg.maddpg_agent[0].actor.load_state_dict(torch.load('actor0.pt'))
    maddpg.maddpg_agent[1].actor.load_state_dict(torch.load('actor1.pt'))
    maddpg.maddpg_agent[0].critic.load_state_dict(torch.load('critic0.pt'))
    maddpg.maddpg_agent[1].critic.load_state_dict(torch.load('critic1.pt'))
    for i in range(1, num_ep + 1):  # play the game for num_ep episodes
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations   # get the current state (for each agent)
        scores = np.zeros(2)                    # initialize the score (for each agent)
        while True:
            actions = maddpg.act(states)        # select actions
            env_info = env.step(actions)[brain_name]    # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards          # get reward (for each agent)
            dones = env_info.local_done         # see if episode finished
            scores += rewards                   # update the score (for each agent)
            states = next_states                # roll over states to next time step
            if np.any(dones):                   # exit loop if episode finished
                break
        rewards_total.append(np.max(scores))
        print('Scores from episode {}: {}'.format(i, scores))
    print('Average Score over {} episodes: {}'.format(num_ep, np.mean(rewards_total)))
def main(args):
    set_seed(args.seed)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # initialize environment
    n_players = 3
    env = football_env.create_environment(
        env_name="academy_3_vs_1_with_keeper",
        representation="simple115",
        number_of_left_players_agent_controls=n_players,
        stacked=False,
        logdir="/tmp/football",
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        render=False)

    # state and action space
    state_space_size = env.observation_space.shape[1]  # we are using the simple115 representation
    action_space_size = env.action_space.nvec.tolist()[0]  # the three players share the same action space
    # state[98:100] indicates which three players are controlled

    # model
    print("loading models")
    actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    old_actors = [
        Actor(state_space_size=state_space_size,
              action_space_size=action_space_size) for _ in range(n_players)
    ]
    old_critics = [
        Critic(state_space_size=state_space_size,
               action_space_size=action_space_size,
               n_players=n_players) for _ in range(n_players)
    ]
    for old_actor, actor in zip(old_actors, actors):
        old_actor.load_state_dict(actor.state_dict())
    for old_critic, critic in zip(old_critics, critics):
        old_critic.load_state_dict(critic.state_dict())

    # maddpg
    maddpg = MADDPG(env=env,
                    action_list=list(range(action_space_size)),
                    actors=actors,
                    critics=critics,
                    old_actors=old_actors,
                    old_critics=old_critics,
                    args=args,
                    device=device)
    print("learn")
    maddpg.learn()
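# For reference: the Google Research Football environment created above returns
# one simple115 vector per controlled player. A minimal interaction sketch,
# assuming gfootball is installed (shapes follow the simple115 convention; this
# is illustrative and not part of the original script):
import gfootball.env as football_env

_env = football_env.create_environment(
    env_name="academy_3_vs_1_with_keeper",
    representation="simple115",
    number_of_left_players_agent_controls=3)
_obs = _env.reset()                    # shape (3, 115): one row per controlled player
_actions = _env.action_space.sample()  # one discrete action per controlled player
_obs, _reward, _done, _info = _env.step(_actions)  # reward has one entry per player
print(_obs.shape, _reward, _done)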
def play():
    env = UnityEnvironment(file_name='./Tennis.app')

    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # create agent
    maddpg_agent = MADDPG(state_size=state_size, action_size=action_size, seed=0)

    # load weights
    for i, agent in enumerate(maddpg_agent.maddpg_agent):
        agent.policy_local.load_state_dict(
            torch.load('models/checkpoint_actor_{}.pth'.format(i)))
    # reverse weights so agent 1 is on the left instead
    # for i, agent in enumerate(reversed(maddpg_agent.maddpg_agent)):
    #     agent.policy_local.load_state_dict(torch.load('models/checkpoint_actor_{}.pth'.format(i)))

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations   # get the current state (for each agent)
    scores = np.zeros(num_agents)           # initialize the score (for each agent)
    while True:
        actions = maddpg_agent.act(states, add_noise=False)  # select an action (for each agent)
        env_info = env.step(actions)[brain_name]    # send all actions to the environment
        next_states = env_info.vector_observations  # get next state (for each agent)
        rewards = env_info.rewards                  # get reward (for each agent)
        dones = env_info.local_done                 # see if episode finished
        scores += rewards                           # update the score (for each agent)
        states = next_states                        # roll over states to next time step
        if np.any(dones):                           # exit loop if episode finished
            break
    print('Agent 0 score this episode: {}'.format(scores[0]))
    print('Agent 1 score this episode: {}'.format(scores[1]))
    env.close()
def train_agents(n_episodes=10000, t_max=1000):
    env = UnityEnvironment(file_name="envs/Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    seeding(seed=42)
    state_size = env_info.vector_observations.shape[1]
    action_size = brain.vector_action_space_size
    num_agents = env_info.vector_observations.shape[0]
    maddpg = MADDPG(state_size=state_size, action_size=action_size, num_agents=num_agents)
    scores_deque = deque(maxlen=100)
    scores_list = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        for _ in range(t_max):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            maddpg.step(states, actions, rewards, next_states, dones)
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.max(scores))
        scores_list.append(np.max(scores))
        print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque)}', end="")
        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}')
        if np.mean(scores_deque) >= 2.0 and len(scores_deque) >= 100:
            for i, agent in enumerate(maddpg.agents):
                torch.save(agent.actor_local.state_dict(), f'models/checkpoint_actor_local_{i}.pth')
                torch.save(agent.critic_local.state_dict(), f'models/checkpoint_critic_local_{i}.pth')
            print(f'\nSaved Model: Episode {i_episode}\tAverage Score: {np.mean(scores_deque):.3f}')
            break
    return scores_list
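# The maddpg.step(...) call above hides the store-then-learn logic. A minimal
# sketch of what such a method typically does, assuming a shared replay buffer
# and per-agent learn() methods (all names here are illustrative, not this
# repo's actual API):
def step(self, states, actions, rewards, next_states, dones):
    # store one multi-agent transition, then train each agent on a random minibatch
    self.memory.add(states, actions, rewards, next_states, dones)
    if len(self.memory) > self.batch_size:
        for agent in self.agents:
            experiences = self.memory.sample()
            agent.learn(experiences, gamma=self.discount_factor)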
def test_ddpg(env, episodes=10):
    # reset
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=False)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size: ', action_size)
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # load MADDPG agent
    maddpg = MADDPG(state_size, action_size, random_seed=0)
    for agent in maddpg.ddpg_agents:
        agent.actor_local.load_state_dict(
            torch.load('actor_agent_' + str(agent.id) + '.pth'))
        agent.critic_local.load_state_dict(
            torch.load('critic_agent_' + str(agent.id) + '.pth'))

    scores = []
    for n in range(episodes):
        # prepare for the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        dones = [False] * num_agents
        while not np.any(dones):
            actions = maddpg.act(states, add_noise=False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            states = next_states
            score += np.max(rewards)
        scores.append(score)
    print('Average score over {} episodes: {:.4f}'.format(episodes, np.mean(scores)))
    return scores
def maddpg(n_episodes=5000):
    # PARAMETERS:
    noise = 2
    batch_size = 256
    update_every = 1
    agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=batch_size)
    buff = ReplayBuffer(10000)
    scores = []                        # per-episode max score
    scores_window = deque(maxlen=100)  # rolling window for the 100-episode average
    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations
        state = torch.from_numpy(np.array(state)).float().unsqueeze(0)
        score = np.zeros(num_agents)
        t = 0
        while True:
            actions = agent.act(state, noise)
            noise *= 0.9999
            actions_array = torch.stack(actions).detach().numpy()
            env_info = env.step(actions_array)[brain_name]
            next_state = env_info.vector_observations
            next_state = torch.from_numpy(np.array(next_state)).float().unsqueeze(0)
            reward = np.array(env_info.rewards).reshape(1, -1)
            dones = np.array(env_info.local_done).reshape(1, -1)
            actions_array = actions_array.reshape(1, -1)
            buff.push((state, actions_array, reward, next_state, dones))
            if len(buff) > batch_size and t % update_every == 0:
                for i in range(2):
                    samples = buff.sample(batch_size)
                    agent.update(samples, i, noise)
                agent.update_targets()
            t += 1
            score += reward[0]
            state = next_state
            if np.any(dones):
                break
        scores_window.append(np.max(score))
        scores.append(np.max(score))
        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/checkpoint_actor{}.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/checkpoint_critic{}.pth'.format(i))
        if np.mean(scores_window) >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(
                i_episode - 100, np.mean(scores_window)))
            for i in range(2):
                torch.save(agent.maddpg_agent[i].actor.state_dict(), 'bin/actor{}_finished.pth'.format(i))
                torch.save(agent.maddpg_agent[i].critic.state_dict(), 'bin/critic{}_finished.pth'.format(i))
            break
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
states = env_info.vector_observations
state_size = states.shape[1]

import torch
import pickle
import logging
from maddpg import MADDPG
from collections import deque
import matplotlib.pyplot as plt
import time, os

maddpg = MADDPG(24, 2, 2, 1976)
scores_max_hist = []
scores_mean_hist = []

logger = logging.getLogger(__name__)
f_handle = logging.FileHandler("Log_File.txt")
f_format = logging.Formatter('%(levelname)s: %(asctime)s %(message)s')
f_handle.setFormatter(f_format)
f_handle.setLevel(logging.INFO)
logger.addHandler(f_handle)

def maddpg_train(n_episodes=2500):
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

full_action_size = num_agents * action_size
full_state_size = num_agents * state_size

maddpg = MADDPG(num_agents, state_size, action_size, buffer_size=0)
maddpg.load(agent_id=1)

for i_episode in range(10):
    env_info = env.reset(train_mode=False)[brain_name]
    i_step = 0
    scores = np.zeros(num_agents)  # initialize the score (for each agent)
    while True:
        actions = maddpg.act(states)
        env_info = env.step(actions)[brain_name]    # send all actions to the environment
        rewards = env_info.rewards                  # get reward (for each agent)
        next_states = env_info.vector_observations  # get next state (for each agent)
        dones = env_info.local_done                 # see if episode finished
        scores += rewards                           # update the score (for each agent)
from unityagents import UnityEnvironment
from maddpg import MADDPG
from ddpg import ReplayBuffer
import numpy as np
import torch
import matplotlib.pyplot as plt
from collections import deque

env = UnityEnvironment(file_name="Tennis.app")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

agent = MADDPG(discount_factor=0.99, tau=0.02, batch_size=256)
agent.maddpg_agent[0].actor.load_state_dict(
    torch.load('bin/actor0_finished.pth', map_location=lambda storage, loc: storage))
agent.maddpg_agent[1].actor.load_state_dict(
    torch.load('bin/actor1_finished.pth', map_location=lambda storage, loc: storage))

env_info = env.reset(train_mode=False)[brain_name]
state = env_info.vector_observations
state = torch.from_numpy(np.array(state)).float().unsqueeze(0)
score = np.zeros(2)
while True:
    actions = agent.act(state, 0)
    actions_array = torch.stack(actions).detach().numpy()
    env_info = env.step(actions_array)[brain_name]
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
import torch
import numpy as np
from agent import DDPGAgent
from maddpg import MADDPG
from utils import MultiAgentReplayBuffer

def make_env(scenario_name, benchmark=False):
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    return env

env = make_env(scenario_name="simple_spread")
ma_controller = MADDPG(env, 1000000)
ma_controller.run(500, 300, 32)
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 5000
    t = 0  # global step counter

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # this may be a list of all environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic.
    # this creates a list of models, each element in the list refers to an agent in the simulation:
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains the agent's actor and critic models, e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by the number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)
        # there are as many rewards as number of agents times parallel envs
        reward_this_episode = np.zeros((parallel_envs, 3))
        # obs is the observation state space of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it has dimension 4x3x14.
        # obs_full is the world state irrespective of the agents and its dimension is 4x14.
        # all_obs = array(4 environments, 2 elements):
        #   element 0: a list that contains 3 arrays, the state for each agent; each state is of size 14
        #   element 1: the global state from the perspective of the target/green for its environment; contains 14 elements
        all_obs = env.reset()
        # obs: a list with 1 element per environment; each element contains a list of 3 arrays,
        #   each array being the state of one agent in that environment.
        # obs_full: the god's-eye view of each environment. It is a list with 1 element per
        #   environment; each element contains an array of 14 values, the global state of that environment.
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward in multiples of the number of environments
            t += parallel_envs

            # explore = only explore for a certain number of episodes.
            # action input needs to be transposed:
            # transpose_to_tensor(obs) changes the data to each agent's point of view.
            # since we have 4 environments, there are 4 agent 1s, 4 agent 2s, and 4 agent 3s;
            # each agent has a state in each environment, so the states across 4 environments
            # for agent 1 form a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # pick element 1: this is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act loops over the elements of obs and passes each to the corresponding
            # agent's actor model to generate an action from each agent's actor.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # there are 4 actions per agent and 3 agents, 12 in total;
            # each action has 2 elements (force in the x and y directions).
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices.
            # the input to step requires the first index to correspond to the number of
            # parallel agents, so the shape of actions_for_env is (4 envs, 3 agents, 2 actions)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame.
            # obs is the observation state space of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it has dimension 4x3x14.
            # obs_full is the world state irrespective of the agents and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample holds all 3 agents' data,
                # and we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs,
                # next_obs_full, done. each element of samples, say samples[0], is a list of
                # 3 elements, one per agent; each agent element contains its corresponding
                # value (for obs, a vector with 14 values). so asking for 2 samples returns
                # 2 samples, each containing all 3 agents' states, rewards, etc.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
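# The maddpg.update_targets() call above performs the soft (Polyak) target
# update. A minimal sketch of that operation for PyTorch modules, with a mixing
# factor tau (the function name is illustrative):
import torch

def soft_update(target_net, source_net, tau):
    """theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(),
                                              source_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * source_param.data)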
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices.
            # the input to step requires the first index to correspond to the number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
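# Both parallel-env scripts above persist a save_dict_list with 'actor_params',
# 'actor_optim_params', 'critic_params', and 'critic_optim_params' per agent.
# Restoring a run could therefore look like this sketch (the checkpoint path is
# a placeholder):
checkpoint = torch.load('model_dir/episode-1000.pt', map_location='cpu')
for agent, save_dict in zip(maddpg.maddpg_agent, checkpoint):
    agent.actor.load_state_dict(save_dict['actor_params'])
    agent.actor_optimizer.load_state_dict(save_dict['actor_optim_params'])
    agent.critic.load_state_dict(save_dict['critic_params'])
    agent.critic_optimizer.load_state_dict(save_dict['critic_optim_params'])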
totalTime = 0
vis = visdom.Visdom(port=8097)
win = None
param = None

np.random.seed(1234)
th.manual_seed(1234)

n_episode = 20000
max_steps = 1200
episode_before_train = 100

obs = env.reset()
n_states = len(obs[0])

maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity,
                episode_before_train)
FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor
for i_episode in range(n_episode):
    startTime = datetime.datetime.now()
    obs = env.reset()
    obs = np.stack(obs)
    if isinstance(obs, np.ndarray):
        obs = th.from_numpy(obs).float()
    # obs = np.asarray(obs)
    reward_record = []
    adversaries_reward_record = []
    agent_reward_record = []
    # for i in range(len(obs)):
def main():
    seeding()
    number_of_episodes = 20000
    episode_length = 1000
    batchsize = 256
    save_interval = 1000
    rewards_deque = deque(maxlen=100)
    rewards_all = []
    noise = 1.0
    noise_reduction = 1.0

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    """
    Info about the UnityEnvironment
    brain_name: 'TennisBrain'
    brain: ['brain_name', 'camera_resolutions', 'num_stacked_vector_observations',
            'number_visual_observations', 'vector_action_descriptions',
            'vector_action_space_size', 'vector_action_space_type',
            'vector_observation_space_size', 'vector_observation_space_type']
    """
    env = UnityEnvironment(file_name="Tennis.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    buffer = ReplayBuffer(int(1e5))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)

    # ------------------------------ training ------------------------------ #
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    for episode in range(1, number_of_episodes + 1):
        timer.update(episode)
        rewards_this_episode = np.zeros((2, ))
        """
        Info about the UnityEnvironment
        env_info: ['agents', 'local_done', 'max_reached', 'memories',
                   'previous_text_actions', 'previous_vector_actions', 'rewards',
                   'text_observations', 'vector_observations', 'visual_observations']
        actions: List(num_agents=2, action_size=2)
        states: List((24,), (24,))
        rewards: List(2,)
        dones: List(2,)
        """
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations

        for episode_t in range(episode_length):
            # reset the OUNoise for each agent
            for i in range(2):
                maddpg.maddpg_agent[i].noise.reset()

            actions = maddpg.act(states, noise=noise)
            env_info = env.step(actions)[brain_name]
            noise *= noise_reduction

            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # add data to buffer
            transition = (states, actions, rewards, next_states, dones)
            buffer.push(transition)

            rewards_this_episode += rewards
            states = next_states

            if any(dones):
                break

        # update the local and target networks
        if len(buffer) > batchsize:
            # update the local network
            for _ in range(5):
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
            # soft update the target network
            maddpg.update_targets()

        rewards_all.append(rewards_this_episode)
        rewards_deque.append(np.max(rewards_this_episode))
        average_score = np.mean(rewards_deque)

        # --------------------- Logging for TensorBoard --------------------- #
        logger.add_scalars('rewards', {
            'agent0': rewards_this_episode[0],
            'agent1': rewards_this_episode[1]
        }, episode)
        logger.add_scalars('global', {
            'score': np.max(rewards_this_episode),
            'average_score': average_score
        }, episode)

        # -------------------------- Save the model -------------------------- #
        save_dict_list = []
        if episode % save_interval == 0 or average_score >= 0.5:
            for i in range(2):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

        if average_score >= 3.0:
            print('\nEnvironment solved in {} episodes!'.format(episode - 100))
            print('\nAverage Score: {:.2f}'.format(average_score))
            break

    env.close()
    logger.close()
    timer.finish()
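# The ReplayBuffer used above only needs push, sample, and __len__. A minimal
# sketch consistent with how the loops call it, storing whole multi-agent
# transition tuples (an illustrative implementation, not the repo's):
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # transition = (states, actions, rewards, next_states, dones)
        self.memory.append(transition)

    def sample(self, batchsize):
        return random.sample(self.memory, batchsize)

    def __len__(self):
        return len(self.memory)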
def main():
    env_info = env.reset(train_mode=False)[brain_name]
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    seeding()
    # number of parallel agents
    # parallel_envs = num_agents
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 10000
    update_actor_after = 100
    update_actor_every = 2
    episode_length = 100
    batchsize = 100
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    LR_ACTOR = 1e-5
    LR_CRITIC = 3e-3

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1.0
    noise_reduction = 0.999999

    # how many episodes before update
    episode_per_update = 1
    no_of_updates_perTime = 1

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # torch.set_num_threads(parallel_envs)
    # env = envs.make_parallel_env(parallel_envs)

    # keep 10 episodes worth of replay
    buffer = ReplayBuffer(int(10 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC)
    # logger = SummaryWriter(log_dir=log_path)

    agent0_reward = []
    agent1_reward = []
    # agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes):
        timer.update(episode)
        env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        scores = np.zeros(num_agents)          # initialize the score (for each agent)
        reward_this_episode = np.zeros((1, num_agents))
        # all_obs = env.reset()
        obs = states
        obs_full = np.concatenate((states[0], states[1]))

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < 1
                     or episode == number_of_episodes - 1)
        tmax = 0

        # resetting noise
        for i in range(num_agents):
            maddpg.maddpg_agent[i].noise.reset()

        for episode_t in range(episode_length):
            t += 1
            update_act = (episode > update_actor_after
                          or episode % update_actor_every == 0)

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensorAsitis(obs), noise=noise, batch=False)
            noise *= noise_reduction

            actions_array = torch.stack(actions).cpu().detach().numpy()

            # transpose the list of lists: flip the first two indices.
            # the input to step requires the first index to correspond to the number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            env_info = env.step(actions_for_env)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            scores += env_info.rewards

            rewards_for_env = np.hstack(rewards)
            obs = states
            obs_full = np.concatenate((states[0], states[1]))
            next_obs = next_states
            next_obs_full = np.concatenate((next_states[0], next_states[1]))

            # add data to buffer
            transition = (np.array([obs]), np.array([obs_full]),
                          np.array([actions_for_env]), np.array([rewards_for_env]),
                          np.array([next_obs]), np.array([next_obs_full]),
                          np.array([dones], dtype='float'))
            buffer.push(transition)

            reward_this_episode += rewards
            obs, obs_full = next_obs, next_obs_full

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update == 0:
                for _ in range(no_of_updates_perTime):
                    for a_i in range(num_agents):
                        samples = buffer.sample(batchsize)
                        # updating the weights of the network
                        maddpg.update(samples, a_i, update_actor=update_act)
                    maddpg.update_targets()  # soft update the target networks towards the actual networks

            if np.any(dones):
                # if the episode is done, break out to the next episode
                break

        for i in range(num_agents):
            agent0_reward.append(reward_this_episode[0, 0])
            agent1_reward.append(reward_this_episode[0, 1])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
            agent0_reward = []
            agent1_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)
                print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(num_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    timer.finish()
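# Several of these loops call agent.noise.reset() on an Ornstein-Uhlenbeck
# process. A minimal OU noise sketch consistent with that usage (the parameter
# values are illustrative defaults, not taken from any of these scripts):
import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # reset the internal state to the mean
        self.state = np.copy(self.mu)

    def sample(self):
        # drift toward the mean, plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.standard_normal(len(self.state))
        self.state += dx
        return self.state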
print('The state for the first agent looks like:', states[0])

# config settings
config = Config()
config.update_every = 1
config.batch_size = 512
config.buffer_size = int(1e6)
config.discount = 0.99
config.tau = 0.2
config.seed = 2
config.lr_actor = 1e-4
config.lr_critic = 1e-4
config.action_size = action_size
config.state_size = state_size
config.num_agents = num_agents

ma = MADDPG(config)

def train(n_episode=30000):
    """ Function to train the agent """
    scores = []
    scores_window = deque(maxlen=100)
    for i_episode in range(n_episode):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        ma.reset()
        score = np.zeros(num_agents)
        while True:
            actions = ma.act(states)
def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)

    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' + arglist.scenario)

    maddpg_wrapper = MADDPG(ACTORS)
    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for i in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            # terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2, done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ', ep_ave_max / float(step),
                      ' Reward: ', total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(str(worker.pos) + '/Average_max_q',
                                  ep_ave_max / float(step), episode)
                writer.add_scalar(str(worker.pos) + '/Reward Agent',
                                  total_reward[worker.pos], episode)

    env.close()
# ##############################################################################
# ENVIRONMENT
# ##############################################################################
env = UnityEnvironment(file_name=ENV_FILE, seed=SEED)

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# ##############################################################################
# AGENT
# ##############################################################################
# INITIALIZE AGENT, AND LOAD WEIGHTS FROM BEST SNAPSHOT
maddpg = MADDPG(
    actor_layer_sizes=ACTOR_LAYER_SIZES,
    critic_layer_sizes=CRITIC_LAYER_SIZES,
    clamp_actions=CLAMP_ACTIONS,
    logger=None,
)
maddpg.load_model(os.path.join(snapshots_dir, "best_model.snapshot"))

# ##############################################################################
# INTERACT WITH ENVIRONMENT
# ##############################################################################
for episode_i in range(1, N_EPISODES + 1):
    print("{dec}\nEpisode {i}\n{dec}\n".format(dec="=" * 60, i=episode_i))

    # INITIALIZE FOR NEW EPISODE
    rewards_this_episode = np.zeros((N_AGENTS, ))
    env_info = env.reset(train_mode=False)[brain_name]
    states = process_agent_states(env_info.vector_observations)
    global_state = process_gobal_state(env_info.vector_observations)
def update():
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model', RETRAIN)
    elif ALGORITHM == 'ddpg':
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    else:
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)

    t1 = time.time()
    rewards1 = 0
    rewards2 = 0
    var = VAR
    collision = 0
    avgreward1 = []
    avgreward2 = []
    collision_percentage = []

    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)

        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # Add exploration noise
            if i < IMITATION_EPISODE or i % 4 == 0:
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                # add randomness to action selection for exploration
                a1 = ddpg.choose_action(s1)
                a1 = [
                    np.clip(np.random.normal(a1[0], var), -1, 1),
                    np.clip(np.random.normal(a1[1], var), -1, 1)
                ]
                a2 = ddpg.choose_action(s2)
                a2 = [
                    np.clip(np.random.normal(a2[0], var), -1, 1),
                    np.clip(np.random.normal(a2[1], var), -1, 1)
                ]
                # a2 = imitation(avs.agent2, avs.agent1, avs.target2)

            if DEBUG:
                time.sleep(0.1)

            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)

            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i, 'Step:', j, ' Reward: %i' % int(ep_reward1),
                      int(ep_reward2), 'Explore: %.2f' % var)
                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:
                        collision += 1
                    if (i + 1) % 100 == 0:
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break

            if ddpg.pointer > MEMORY_CAPACITY:
                ddpg.learn()
                ddpg.learn()
                if var > MIN_VAR and i > IMITATION_EPISODE:
                    var *= DECAY  # decay the action randomness

            if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
                ddpg.save(i)

    print('Running time: ', time.time() - t1)
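# The exploration pattern above (Gaussian noise around the policy action,
# clipped to the valid range, with var decayed over time) can be factored into
# a small helper; a sketch with an illustrative name:
import numpy as np

def noisy_action(action, var, low=-1.0, high=1.0):
    # sample each action dimension from N(a_i, var) and clip to the action bounds
    return [float(np.clip(np.random.normal(a, var), low, high)) for a in action]

# e.g. a1 = noisy_action(ddpg.choose_action(s1), var)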
# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

from maddpg import MADDPG
from collections import deque
import torch

agent = MADDPG(24, 2, 0)

env_info = env.reset(train_mode=True)[brain_name]
env_info.vector_observations.shape

def maddpg(max_episodes=2000, print_every=10):
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, max_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = 0
        while True:
            actions = agent.act(states)
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

full_action_size = num_agents * action_size
full_state_size = num_agents * state_size

writer = SummaryWriter(log_dir="logs/train", flush_secs=30)
maddpg = MADDPG(num_agents, state_size, action_size,
                buffer_size=BUFFER_SIZE, writer=writer)
scores_list, avg_scores_list = maddpg_training(maddpg, writer)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores_list) + 1), scores_list)
plt.plot(np.arange(1, len(avg_scores_list) + 1), avg_scores_list)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.savefig("Scores.png")  # save before show, so the figure is not blank
plt.show()
writer.close()
        print('Episode {}\tAverage Score: {:.3f} MaxReward: {:.3f} Buffer : {}/{} Noise: {:.3f} Timestep: {}.'
              .format(episode, avg_score, max_reward, len(agent.memory),
                      BUFFER_SIZE, agent.epsilon, agent.timestep_counter))
        if avg_score >= GOAL:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(episode, avg_score))
            agent.checkpoint()
            break
    return global_scores, averaged_scores

# Init the Tennis environment and get agents, state and action info
env, brain_name, n_agents, state_size, action_size = init_environment(UNITY_EXE_PATH)
agent = MADDPG(state_size=state_size, action_size=action_size,
               n_agents=n_agents, random_seed=89)

# Train the agent and get the results
scores, averages = train()

# Plot statistics (global scores and averaged scores)
plt.subplot(2, 1, 2)
plt.plot(np.arange(1, len(scores) + 1), averages)
plt.ylabel('Tennis Environment Average Score')
plt.xlabel('Episode #')
plt.show()
print('\nExample state for a single agent:\n', states[0])

agent_state_size = process_agent_states(states).shape[1]
global_state_size = process_gobal_state(states).shape[0]

# ##############################################################################
# AGENT
# ##############################################################################
# Create Multi-agent Actor-Critic model
maddpg = MADDPG(
    actor_layer_sizes=ACTOR_LAYER_SIZES,
    critic_layer_sizes=CRITIC_LAYER_SIZES,
    discount_factor=DISCOUNT_FACTOR,
    tau=TAU,
    lr_actor=LR_ACTOR,
    lr_critic=LR_CRITIC,
    gradient_clipping=GRADIENT_CLIPPING,
    clamp_actions=CLAMP_ACTIONS,
    logger=logger,
    log_losses=True,
    log_layers=False,
    log_weights=False,
)

# ##############################################################################
# TRAIN
# ##############################################################################
buffer = ReplayBuffer(int(BUFFER_SIZE), seed=SEED)
n_episodes = 10000
best_rolling_mean_score = -np.inf
hard_noise_reigime = True
if __name__ == "__main__":
    # Configuration
    n_episodes = 1
    checkpoint = "./checkpoints/checkpoint{}.pth"

    # Unity environment
    env = UnityEnvironment("./Tennis_Linux/Tennis.x86_64")

    # Agent
    agent = TennisMultiAgent(state_size=24, action_size=2, n_agents=2)
    agent.load(checkpoint)

    # DDPG
    maddpg = MADDPG(env=env, agent=agent)
    scores = maddpg.test(n_episodes=n_episodes)

    # Close the environment
    env.close()

    if n_episodes > 1:
        # Show results
        print(scores)
        print("Average score of {} episodes: {:.2f}".format(n_episodes, np.mean(scores)))

        # Plot scores
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(np.linspace(1, n_episodes + 1, n_episodes), scores)
        ax.set_xlabel("Episodes")
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agents = MADDPG(state_size=state_size, action_size=action_size,
                num_agents=num_agents, random_seed=2)
agents.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
agents.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

def play(n_episodes=5):
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agents.reset()
        scores = np.zeros(num_agents)
        while True:
            actions = agents.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
max_steps = 100
# before training, we store experience from all agents for the subsequent training process.
episode_before_train = 100

obs = env.reset()
n_states = len(obs[0])

initial_train = True
test_or_train = True

# vis = visdom.Visdom(port=8097)
win = None
param = None

np.random.seed(1234)
th.manual_seed(1234)

# initialize the original MADDPG; it is the basic part of our architecture.
maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, capacity,
                episode_before_train, initial_train, test_or_train)
FloatTensor = th.cuda.FloatTensor if maddpg.use_cuda else th.FloatTensor

for i in range(maddpg.n_agents):
    maddpg.critics[i] = th.load('new/model_initial/critic[' + str(i) + '].pkl_episode' + str(3000))
    maddpg.actors[i] = th.load('new/model_initial/actors[' + str(i) + '].pkl_episode' + str(3000))

for i_episode in range(n_episode):
    startTime = datetime.datetime.now()
    obs = env.reset()
    obs = np.stack(obs)
    if isinstance(obs, np.ndarray):
        obs = th.from_numpy(obs).float()
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + '/log'
    model_dir = os.getcwd() + '/model_dir'
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    for episode in range(0, number_of_episodes, parallel_envs):
        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))
    'N_EPS_MIN': .01,      # Normal noise min decay value
    'OU_THETA': 1e-2,      # OU noise theta parameter
    'OU_SIGMA': 1e-2,      # OU noise sigma parameter
    'SEED': 42,            # Random seed
    'DEVICE': torch.device("cuda" if torch.cuda.is_available() else "cpu"),

    # Training hyperparameters
    'N_EPISODES': 2000,
    'MAX_T': 2000,
    'SUCCESS_SCORE': .5,
    'PRINT_EVERY': 100,

    # Save on W&B
    'WANDB': True,
}

if PARAMETERS['WANDB']:
    # Save on wandb
    wandb.init(project="maddpg", config=PARAMETERS)

# Agent
agent = MADDPG(PARAMETERS)

# Training job
scores = train_MADDPG(env, agent,
                      n_episodes=PARAMETERS['N_EPISODES'],
                      max_t=PARAMETERS['MAX_T'],
                      success_score=PARAMETERS['SUCCESS_SCORE'],
                      print_every=PARAMETERS['PRINT_EVERY'],
                      brain_name=brain_name,
                      use_wandb=PARAMETERS['WANDB'])
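# When 'WANDB' is enabled, the training loop can report per-episode metrics via
# wandb.log. A hedged sketch of what such logging might look like inside
# train_MADDPG (the metric names and variables here are illustrative):
if use_wandb:
    wandb.log({
        "episode": i_episode,
        "score": episode_score,               # e.g. max over the two agents
        "avg_score_100": np.mean(scores_window),
    })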
def main():
    seeding()
    # number of parallel agents
    number_of_agents = 2
    # number of training episodes.
    # change this to a higher number to experiment, say 30000.
    number_of_episodes = 3000
    batchsize = 128

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 1
    noise_reduction = 0.9999
    tau = 1e-3    # soft update factor
    gamma = 0.99  # reward discount factor
    print_every = 100

    # how many episodes before update
    episode_per_update = 2

    model_dir = os.getcwd() + "/model_dir"   # used below when saving checkpoints
    os.makedirs(model_dir, exist_ok=True)
    result_dir = os.getcwd() + "/result_dir"
    os.makedirs(result_dir, exist_ok=True)

    # do we need to set multi-threading for this env?
    torch.set_num_threads(number_of_agents * 2)
    env = TennisEnv()

    # keep many episodes worth of replay
    buffer = ReplayBuffer(int(1e5))

    num_agents, num_states, num_actions = env.get_shapes()
    # initialize policy and critic
    maddpg = MADDPG(num_agents, num_states, num_actions, discount_factor=gamma, tau=tau)

    # training loop
    scores_window = deque(maxlen=100)
    ep_scores = []
    agent0_reward = []
    agent1_reward = []
    for episode in range(0, number_of_episodes):
        reward_this_episode = np.zeros((1, number_of_agents))
        states, states_full, env_info = env.reset()

        for agent in maddpg.maddpg_agent:
            agent.noise.reset()

        while True:
            actions = maddpg.act(torch.tensor(states, dtype=torch.float), noise=noise)
            noise *= noise_reduction
            actions_for_env = torch.stack(actions).detach().numpy()

            # step forward one frame
            next_states, next_states_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            buffer.push(states, states_full, actions_for_env, rewards,
                        next_states, next_states_full, dones)

            reward_this_episode += rewards
            states = np.copy(next_states)
            states_full = np.copy(next_states_full)

            # update once after every episode_per_update
            if len(buffer) > batchsize:
                for a_i in range(number_of_agents):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i)

            if np.any(dones):
                break

        agent0_reward.append(reward_this_episode[0, 0])
        agent1_reward.append(reward_this_episode[0, 1])
        avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1])
        scores_window.append(avg_rewards)
        cur_score = np.mean(scores_window)
        ep_scores.append(cur_score)

        save_dict_list = []
        if episode % print_every == 0 or avg_rewards > 2.5:
            print('\rEpisode: {}, Average score: {:.5f}, noise: {:.5f}'.format(
                episode, cur_score, noise))
        if avg_rewards > 2.5:
            for i in range(number_of_agents):
                save_dict = {
                    'actor_params': maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params': maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, cur_score)))
            print('model saved')
            break

    env.close()

    # print('main-ep_scores: ', ep_scores)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(1, len(ep_scores) + 1), ep_scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    fig.savefig(result_dir + '/score_plot.png')