def main_single_agent():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64", worker_id=1, seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data_single', env_date)
    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)
    brain_name = env.brain_names[0]
    buffer = ReplayBuffer(Config.buffer_size)
    agent = DDPGAgent(in_actor=48,
                      hidden_in_actor=Config.actor_hidden[0],
                      hidden_out_actor=Config.actor_hidden[1],
                      out_actor=2,
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)
    agent_reward, all_rewards_mean = [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward

    # amplitude of OU noise; this slowly decreases to 0
    noise = Config.noise_beginning
    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []

    for episode in range(Config.n_episodes):
        reward_this_episode = 0
        env_info = env.reset(train_mode=True)[brain_name]
        # get the current state (observations of both agents, concatenated)
        states = torch.from_numpy(np.concatenate(env_info.vector_observations))
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        # linear decay from noise_beginning towards min_noise
        noise = max(Config.min_noise,
                    Config.noise_beginning * (Config.n_episodes - episode) / Config.n_episodes)

        while True:
            n_of_steps += 1
            states_tensor = torch.tensor(states).float()
            actions = agent.act(states_tensor, noise=noise)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1, 1)  # all actions between -1 and 1
            # send the same action to both rackets in the environment
            env_info = env.step(np.array([actions_for_env, actions_for_env]))[brain_name]
            states_next = torch.from_numpy(np.concatenate(env_info.vector_observations))

            # if replay_buffer_raward_min is defined, add to the replay buffer only
            # observations collected once the episode reward exceeds that minimum
            reward = np.sum(np.array(env_info.rewards))
            reward_this_episode += reward
            if Config.replay_buffer_raward_min and reward_this_episode >= Config.replay_buffer_raward_min:
                buffer_data = (states, torch.from_numpy(actions_for_env), reward,
                               states_next, env_info.local_done[0])
                buffer.push(buffer_data)
            if not Config.replay_buffer_raward_min:
                buffer_data = (states, torch.from_numpy(actions_for_env), reward,
                               states_next, env_info.local_done[0])
                buffer.push(buffer_data)

            dones = env_info.local_done  # see if episode finished
            scores += np.sum(env_info.rewards)  # update the score (for each agent)
            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break

        all_rewards.append(reward_this_episode)
        all_rewards_mean.append(np.mean(all_rewards[-100:]))

        if len(buffer) > Config.warmup:
            agent.update(buffer, batchsize=batchsize, tau=Config.tau,
                         discount=Config.discount_factor)
        if episode % Config.update_episode_n == 0:
            agent.update_targets(tau=Config.tau)

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(
                f'Episode {episode}: Average reward over 100 episodes is {all_rewards_mean[-1]}')

        if all_rewards_mean and all_rewards_mean[-1] > max_reward:
            logger.info('Found best model. Saving model into file: ...')
            save_dict_list = []
            save_dict = {
                'actor_params': agent.actor.state_dict(),
                'actor_optim_params': agent.actor_optimizer.state_dict(),
                'critic_params': agent.critic.state_dict(),
                'critic_optim_params': agent.critic_optimizer.state_dict()
            }
            # the same dict is appended twice, giving the checkpoint one entry per racket
            save_dict_list.append(save_dict)
            save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(file_path, 'episode-{}.pt'.format(episode)))
            max_reward = all_rewards_mean[-1]

    plt.plot(all_rewards_mean)
    plt.xlabel('N of episodes')
    plt.ylabel('Reward')
    plt.title('Final rewards of single agent for tennis collaboration task')
    plt.savefig(os.path.join(file_path, 'result_plot.png'))

    save_dict = {
        'actor_params': agent.actor.state_dict(),
        'actor_target_params': agent.target_actor.state_dict(),
        'actor_optim_params': agent.actor_optimizer.state_dict(),
        'critic_params': agent.critic.state_dict(),
        'critic_target_params': agent.target_critic.state_dict(),
        'critic_optim_params': agent.critic_optimizer.state_dict()
    }
    torch.save(save_dict, os.path.join(file_path, 'episode-{}.pt'.format(episode)))
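A minimal sketch of how a checkpoint saved by the final torch.save above could be reloaded for evaluation; it is not part of the original script, and the constructor arguments simply mirror the ones used in main_single_agent.

# hypothetical reload of the final single-dict checkpoint written above
checkpoint = torch.load(os.path.join(file_path, 'episode-{}.pt'.format(episode)))
agent = DDPGAgent(in_actor=48, hidden_in_actor=Config.actor_hidden[0],
                  hidden_out_actor=Config.actor_hidden[1], out_actor=2,
                  in_critic=50, hidden_in_critic=Config.critic_hidden[0],
                  hidden_out_critic=Config.critic_hidden[1],
                  lr_actor=Config.actor_lr, lr_critic=Config.critic_lr,
                  noise_dist=Config.noise_distribution,
                  checkpoint_path=Config.checkpoint_path)
agent.actor.load_state_dict(checkpoint['actor_params'])
agent.critic.load_state_dict(checkpoint['critic_params'])
agent.actor_optimizer.load_state_dict(checkpoint['actor_optim_params'])
agent.critic_optimizer.load_state_dict(checkpoint['critic_optim_params'])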
def main(): seeding(seed=SEED) # number of parallel agents parallel_envs = 1 # number of agents per environment num_agents = 5 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 60000 episode_length = 35 # how many episodes to save policy and gif save_interval = 1000 t = 0 scenario_name = "simple_spread_ivan" # amplitude of OU noise # this slowly decreases to 0 noise = 0.5 # was 2, try 0.5, 0.2 noise_reduction = 0.9999 # 0.999 #### DECAY initial_noise = 0.1 decay = 0.01 # how many episodes before update # episode_per_update = UPDATE_EVERY * parallel_envs common_folder = time.strftime("/%m%d%y_%H%M%S") log_path = os.getcwd() + common_folder + "/log" model_dir = os.getcwd() + common_folder + "/model_dir" os.makedirs(model_dir, exist_ok=True) # initialize environment # torch.set_num_threads(parallel_envs) env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK) # env = envs.make_env("simple_spread_ivan") # initialize replay buffer buffer = ReplayBuffer(int(BUFFER_SIZE)) # initialize policy and critic maddpg = MADDPG(num_agents=num_agents, discount_factor=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, weight_decay=WEIGHT_DECAY) logger = SummaryWriter(log_dir=log_path) agents_reward = [] for n in range(num_agents): agents_reward.append([]) # agent0_reward = [] # agent1_reward = [] # agent2_reward = [] agent_info = [[[]]] # placeholder for benchmarking info # training loop # show progressbar import progressbar as pb widget = [ '\repisode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() print('Starting iterations...') for episode in range(0, number_of_episodes, parallel_envs): timer.update(episode) reward_this_episode = np.zeros((parallel_envs, num_agents)) all_obs = env.reset() # # flip the first two indices # ADD FOR WITHOUT PARALLEL ENV # all_obs = np.expand_dims(all_obs, axis=0) obs_roll = np.rollaxis(all_obs, 1) obs = transpose_list(obs_roll) # save info or not save_info = ((episode) % save_interval < parallel_envs or episode == number_of_episodes - parallel_envs) frames = [] tmax = 0 # if save_info: # frames.append(env.render('rgb_array')) for episode_t in range(episode_length): # get actions # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise = max(initial_noise * decay**(episode_t / 20000), 0.001) # noise = max(noise*noise_reduction, 0.001) actions_array = torch.stack(actions).detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # environment step # step forward one frame # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env) # ADD FOR WITHOUT PARALLEL ENV # next_obs, rewards, dones, info = env.step(actions_for_env) next_obs, rewards, dones, info = env.step(actions_for_env) # rewards_sum += np.mean(rewards) # collect experience transition = (obs, actions_for_env, rewards, next_obs, dones) buffer.push(transition) reward_this_episode += rewards # obs, obs_full = next_obs, next_obs_full obs = next_obs # increment global step counter t += parallel_envs # save gif frame if save_info: # frames.append(env.render('rgb_array')) tmax += 1 # for benchmarking learned policies if 
BENCHMARK: for i, inf in enumerate(info): agent_info[-1][i].append(inf['n']) # update once after every episode_per_update # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs: if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs: for _ in range(UPDATE_TIMES): for a_i in range(num_agents): samples = buffer.sample(BATCH_SIZE) maddpg.update(samples, a_i, logger) maddpg.update_targets( ) # soft update the target network towards the actual networks for i in range(parallel_envs): for n in range(num_agents): agents_reward[n].append(reward_this_episode[i, n]) # agent0_reward.append(reward_this_episode[i,0]) # agent1_reward.append(reward_this_episode[i,1]) # agent2_reward.append(reward_this_episode[i,2]) if episode % 100 == 0 or episode == number_of_episodes - 1: # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)] avg_rewards = [] for n in range(num_agents): avg_rewards.append(np.mean(agents_reward[n])) # agent0_reward = [] # agent1_reward = [] # agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) # saving model save_dict_list = [] if save_info: print('agent_info benchmark=', agent_info) for i in range(5): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), # frames, duration=.04) env.close() logger.close() timer.finish()
    ep_loss = []
    ep_error = []
    # Initialize the environment and state
    state = torch.tensor([env.reset()], device=device).float()
    done = False
    score = 0
    for t in count():
        # Select and perform an action
        action = select_action(state)
        next_state, reward, done, _ = env.step(action.item())
        score += reward
        next_state = torch.tensor([next_state], device=device).float()
        reward = torch.tensor([reward], device=device).float()
        # Store the transition in memory
        buffer.push(state, action, next_state, reward, not done)
        # Update state
        state = next_state
        # Perform one optimization step (on the policy network)
        loss, Q_estimation_error = train_model()
        # save results
        ep_loss.append(loss)
        ep_error.append(Q_estimation_error)
        # soft target update
        if params.target_update == 'soft':
            # print('in soft')
            # θ' ← τθ + (1 − τ)θ'
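The Polyak update written in the comment above (and called as soft_update(...) in several of the agents later in this collection) is not defined in any of these snippets. A minimal sketch, assuming target and source are torch modules with matching parameter order:

import torch

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)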
maddpg.reset_ounoise()

# GET ACTIONS TO TAKE AND INTERACT WITH THE ENVIRONMENT
actions = maddpg.act(tensorfy(states), noise=noise, stacked=True)
env_info = env.step(actions)[brain_name]

# EXTRACT AND PROCESS THE RETURNED VALUES FROM THE ENVIRONMENT
next_states = process_agent_states(env_info.vector_observations)
next_global_state = process_gobal_state(env_info.vector_observations)
rewards = env_info.rewards
dones = env_info.local_done

# ADD EXPERIENCE TO THE BUFFER
experience = (states, global_state, actions, rewards, next_states,
              next_global_state, dones)
buffer.push(experience)

# UPDATE REWARDS
rewards_this_episode += rewards

# PREPARE FOR NEXT TIMESTEP
states = next_states
global_state = next_global_state
noise = noise if hard_noise_reigime else noise * NOISE_DECAY

# END EPISODE IF ANY AGENT IS DONE
if any(dones):
    break

if episode_i > HARD_NOISE_STEPS:
    hard_noise_reigime = False
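process_agent_states and process_gobal_state are helpers defined outside this fragment. A purely illustrative sketch of what they might do for the two-agent Tennis environment, assuming (as other snippets in this collection do) that the global critic state is just the concatenation of all agents' observations:

import numpy as np

def process_agent_states(vector_observations):
    # hypothetical: keep one row of raw observations per agent, shape (num_agents, obs_dim)
    return np.array(vector_observations)

def process_gobal_state(vector_observations):
    # hypothetical: global state as the concatenation of all agents' observations
    return np.concatenate(vector_observations)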
def main(): seeding() # number of parallel agents parallel_envs = 4 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 1000 episode_length = 80 batchsize = 1000 # how many episodes to save policy and gif save_interval = 1000 t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in range(0, number_of_episodes + parallel_envs, parallel_envs): timer.update(episode) reward_this_episode = np.zeros((parallel_envs, 3)) all_obs = env.reset() obs, obs_full = transpose_list(all_obs) # for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = (episode % save_interval < parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array')) for episode_t in range(episode_length): t += parallel_envs # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction actions_array = torch.stack(actions).detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame next_obs, next_obs_full, rewards, dones, info = env.step( actions_for_env) # add data to buffer transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # save gif frame if save_info: frames.append(env.render('rgb_array')) tmax += 1 # update once after every episode_per_update if len(buffer ) > batchsize and episode % episode_per_update < parallel_envs: for a_i in range(3): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) maddpg.update_targets( ) # soft update the target network towards the actual networks for i in range(parallel_envs): agent0_reward.append(reward_this_episode[i, 0]) agent1_reward.append(reward_this_episode[i, 1]) agent2_reward.append(reward_this_episode[i, 2]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [ np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward) ] agent0_reward = [] agent1_reward = [] agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) # saving model save_dict_list = [] if save_info: for i in range(3): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': 
maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), frames, duration=.04) env.close() logger.close() timer.finish()
def main(): env_info = env.reset(train_mode=False)[brain_name] num_agents = len(env_info.agents) print('Number of agents:', num_agents) # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] seeding() # number of parallel agents #parallel_envs = num_agents # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 10000 update_actor_after = 100 update_actor_every = 2 episode_length = 100 batchsize = 100 # how many episodes to save policy and gif save_interval = 1000 t = 0 LR_ACTOR = 1e-5 LR_CRITIC = 3e-3 # amplitude of OU noise # this slowly decreases to 0 noise = 1.0 noise_reduction = 0.999999 # how many episodes before update episode_per_update = 1 no_of_updates_perTime = 1 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) #torch.set_num_threads(parallel_envs) #env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(10 * episode_length)) # initialize policy and critic maddpg = MADDPG(lr_actor=LR_ACTOR, lr_critic=LR_CRITIC) #logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] #agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in range(0, number_of_episodes): timer.update(episode) env_info = env.reset( train_mode=False)[brain_name] # reset the environment states = env_info.vector_observations # get the current state (for each agent) scores = np.zeros(num_agents) # initialize the score (for each agent) reward_this_episode = np.zeros((1, num_agents)) #all_obs = env.reset() # obs = states obs_full = np.concatenate((states[0], states[1])) #for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = ((episode) % save_interval < 1 or episode == number_of_episodes - 1) tmax = 0 #resetting noise for i in range(num_agents): maddpg.maddpg_agent[i].noise.reset() for episode_t in range(episode_length): t += 1 update_act = True if (episode > update_actor_after or episode % update_actor_every == 0) else False # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensorAsitis(obs), noise=noise, batch=False) noise *= noise_reduction actions_array = torch.stack(actions).cpu().detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame env_info = env.step(actions_for_env)[brain_name] next_states = env_info.vector_observations # get next state (for each agent) rewards = env_info.rewards # get reward (for each agent) dones = env_info.local_done # see if episode finished scores += env_info.rewards rewards_for_env = np.hstack(rewards) obs = states obs_full = np.concatenate((states[0], states[1])) next_obs = next_states next_obs_full = np.concatenate((next_states[0], next_states[1])) # add data to buffer transition = (np.array([obs]), np.array([obs_full]), 
np.array([actions_for_env]), np.array([rewards_for_env]), np.array([next_obs]), np.array([next_obs_full]), np.array([dones], dtype='float')) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # update once after every episode_per_update if len(buffer) > batchsize and episode % episode_per_update == 0: for _ in range(no_of_updates_perTime): for a_i in range(num_agents): samples = buffer.sample(batchsize) #updating the weights of the n/w maddpg.update(samples, a_i, update_actor=update_act) maddpg.update_targets( ) #soft update the target network towards the actual networks if np.any(dones): # if the episode is done the loop is break to the next episode break for i in range(num_agents): agent0_reward.append(reward_this_episode[0, 0]) agent1_reward.append(reward_this_episode[0, 1]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)] agent0_reward = [] agent1_reward = [] for a_i, avg_rew in enumerate(avg_rewards): #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) print('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) #saving model save_dict_list = [] if save_info: for i in range(num_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files #imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), #frames, duration=.04) timer.finish()
class DQNAgent:
    """ DQN Agent, valid for discrete action space """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #loss_fn = nn.MSELoss()
    loss_fn = nn.SmoothL1Loss()
    iter = 0

    def __init__(self, net, o_dim, a_dim, lr=1e-3, batch_size=16, algorithm="ddqn",
                 gamma=0.99, tau=1e-3, buffer_size=int(1e6)):
        """
        o_dim: observation space dim (or # of channels)
        a_dim: action space dimension
        """
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        if algorithm.lower() in ("dqn",):
            self.algorithm = "dqn"
        elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
            self.algorithm = "ddqn"
        else:
            raise TypeError("cannot recognize algorithm")
        self.buffer = ReplayBuffer(buffer_size, batch_size)
        self.online_net = net(o_dim, a_dim).to(self.device)
        self.target_net = net(o_dim, a_dim).to(self.device)
        self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)

    def get_action(self, state, eps=0.):
        """ Epsilon-greedy action selection """
        if random.random() > eps:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            self.online_net.eval()
            with torch.no_grad():
                action = self.online_net(state_tensor).argmax(1).item()
            self.online_net.train()
            return action
        else:
            return random.choice(np.arange(self.a_dim))

    def update(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        states = torch.FloatTensor(states).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        actions = torch.LongTensor(actions).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor(rewards).view(-1, 1).to(self.device)
        dones = torch.FloatTensor(dones).view(-1, 1).to(self.device)
        if self.algorithm == "ddqn":
            # double DQN: the online net picks the action, the target net evaluates it
            max_actions = self.online_net(next_states).max(1)[1].view(-1, 1)
            Q_next = self.target_net(next_states).gather(1, max_actions)
        elif self.algorithm == "dqn":
            Q_next = self.target_net(next_states).max(1)[0].view(-1, 1)
        else:
            raise TypeError("cannot recognize algorithm")
        Q_targets = rewards + self.gamma * Q_next * (1. - dones)
        Q_expected = self.online_net(states).gather(1, actions)
        loss = self.loss_fn(Q_expected, Q_targets.detach())
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_net.parameters(), 10.)
        self.optimizer.step()

    def step(self, state, action, reward, next_state, done):
        self.buffer.push(state, action, reward, next_state, done)
        if len(self.buffer) > self.batch_size:
            experiences = self.buffer.sample()
            self.update(experiences)
            soft_update(self.target_net, self.online_net, self.tau)
        self.iter += 1
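Every agent in this collection assumes a ReplayBuffer with push, sample, and __len__, but the class itself is never shown, and its call signatures differ between snippets (some push a single transition tuple and pass a batch size to sample, while DQNAgent above pushes unpacked fields and samples without arguments). A minimal sketch of the latter interface with uniform sampling; the field layout is an assumption:

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Fixed-size buffer of transitions with uniform random sampling (sketch)."""

    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def push(self, state, action, reward, next_state, done):
        # store one transition, evicting the oldest once buffer_size is reached
        self.memory.append((state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch, returned as field-wise numpy arrays
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)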
def main(): seeding() parallel_envs = 4 number_of_episodes = 1000 episode_length = 80 batchsize = 1000 save_interval = 1000 t = 0 # amplitude of OU noise, which slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) """ `env` controls three agents, two blue, one red. env.observation_space: [Box(14,), Box(14,), Box(14,)] env.action_sapce: [Box(2,), Box(2,), Box(2,)] Box(14,) can be broken down into 2+3*2+3*2=14 (2) location coordinates of the target landmark (3*2) the three agents' positions w.r.t. the target landmark (3*2) the three agents' velocities w.r.t. the target landmark """ env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting for episode in keep_awake(range(0, number_of_episodes, parallel_envs)): timer.update(episode) reward_this_episode = np.zeros((parallel_envs, 3)) # Consult `env_wrapper.py` line 19. all_obs = env.reset() """ `all_abs` is a list of size `parallel_envs`, each item in the list is another list of size two, first is env.observation_space: [Box(14,), Box(14,), Box(14,)], second is [Box(14,)], which is added to faciliate training https://goo.gl/Xtr6sF `obs` and `obs_full` are both lists of size `parallel_envs`, `obs` has the default observation space [Box(14,), Box(14,), Box(14,)] `obs_full` has the compounded observation space [Box(14,)] """ obs, obs_full = transpose_list(all_obs) # for calculating rewards for one episode - addition of all time steps # save info or not save_info = ((episode) % save_interval < parallel_envs or episode == number_of_episodes - parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array')) for episode_t in range(episode_length): t += parallel_envs # explore = only explore for a certain number of steps # action input needs to be transposed actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction # `actions_array` has shape (3, parallel_envs, 2) actions_array = torch.stack(actions).detach().numpy() # `actions_for_env` has shape (parallel_envs, 3, 2), because # input to `step` requires the first index to be `parallel_envs` actions_for_env = np.rollaxis(actions_array, axis=1) # step forward one frame next_obs, next_obs_full, rewards, dones, info = \ env.step(actions_for_env) # add data to buffer transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # save gif frame if save_info: frames.append(env.render('rgb_array')) tmax += 1 # update the target network `parallel_envs`=4 times # after every `episode_per_update`=2*4 if len(buffer ) > batchsize and episode % episode_per_update < parallel_envs: # update the local network for all agents, `a_i` refers to agent no. 
for a_i in range(3): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) # soft update the target network towards the actual networks maddpg.update_targets() for i in range(parallel_envs): agent0_reward.append(reward_this_episode[i, 0]) agent1_reward.append(reward_this_episode[i, 1]) agent2_reward.append(reward_this_episode[i, 2]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [ np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward) ] agent0_reward = [] agent1_reward = [] agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) # Saves the model. save_dict_list = [] if save_info: for i in range(3): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # Save gif files. imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), frames, duration=.04) env.close() logger.close() timer.finish()
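transpose_list and transpose_to_tensor, used throughout these MADDPG loops, are small project utilities rather than library calls and are not included in the snippets. A sketch consistent with how they are used here (flipping the environment and agent axes of a nested list, and turning each per-agent slice into a float tensor):

import torch

def transpose_list(mylist):
    # [[per-agent items] per env] -> [[per-env items] per agent]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # same transpose, but each per-agent group becomes a float tensor
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))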
class MADDPGAgent: device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") iter = 0 def __init__(self, num_agents, x_dim, o_dim, a_dim, lr_actor=1e-3, lr_critic=1e-3, batch_size=16, gamma=0.99, tau=0.001, buffer_size=int(1e5), seed=1234): self.num_agents = num_agents self.x_dim = x_dim self.o_dim = o_dim self.a_dim = a_dim self.lr_actor = lr_actor self.lr_critic = lr_critic self.batch_size = batch_size self.gamma = gamma self.tau = tau self.buffer_size = buffer_size self.seed = seed self.buffer = ReplayBuffer(buffer_size, batch_size, seed) self.agents = [DDPGAgent(num_agents, id, x_dim, o_dim, a_dim, lr_actor, lr_critic, gamma, seed) \ for id in range(num_agents)] def get_actions(self, obs_full, eps=0.): """get actions from all agents in the MADDPG object""" actions = [] for id, agent in enumerate(self.agents): actions.extend(agent.get_action2(obs_full[id, :], eps)) return actions def update(self, experiences): obs_full, actions, rewards, next_obs_full, dones = experiences rewards = torch.FloatTensor(rewards).to(self.device) dones = torch.FloatTensor(dones).to(self.device) x = torch.FloatTensor(obs_full).to(self.device) a = torch.FloatTensor(actions).to(self.device) next_x = torch.FloatTensor(next_obs_full).to(self.device) with torch.no_grad(): next_a = [ agent.target_actor(next_x[:, agent.id, :]) for agent in self.agents ] next_a = torch.cat(next_a, dim=1) for agent in self.agents: r = rewards[:, agent.id].view(-1, 1) d = dones[:, agent.id].view(-1, 1) pred_a = [ self.agents[i].actor(x[:, i, :]) if i == agent.id \ else self.agents[i].actor(x[:, i, :]).detach() for i in range(self.num_agents) ] pred_a = torch.cat(pred_a, dim=1) agent.update(next_x, next_a, r, d, x, a, pred_a) def update_targets(self): """soft update targets""" for agent in self.agents: soft_update(agent.target_actor, agent.actor, self.tau) soft_update(agent.target_critic, agent.critic, self.tau) def step(self, state, action, reward, next_state, done): self.buffer.push(state, action, reward, next_state, done) if (len(self.buffer) > self.batch_size): experiences = self.buffer.sample() self.update(experiences) self.update_targets() self.iter += 1 def reset(self): for agent in self.agents: agent.noise.reset()
class SAC_Agent: def __init__(self, load_from=None, will_train=True): self.env = TorcsEnv( path='/usr/local/share/games/torcs/config/raceman/quickrace.xml') self.args = SAC_args() self.buffer = ReplayBuffer(self.args.buffer_size) action_dim = self.env.action_space.shape[0] state_dim = self.env.observation_space.shape[0] hidden_dim = 256 self.action_size = action_dim self.state_size = state_dim self.value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device) self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device) self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device) self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device) self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.args.device) self.target_value_net.load_state_dict(self.value_net.state_dict()) self.value_criterion = nn.MSELoss() self.soft_q_loss1 = nn.MSELoss() self.soft_q_loss2 = nn.MSELoss() self.value_opt = optim.Adam(self.value_net.parameters(), lr=self.args.lr) self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(), lr=self.args.lr) self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(), lr=self.args.lr) self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=self.args.lr) if will_train: current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime()) self.plot_folder = f'plots/{current_time}' self.model_save_folder = f'model/{current_time}' make_sure_dir_exists(self.plot_folder) make_sure_dir_exists(self.model_save_folder) self.cp = Checkpoint(self.model_save_folder) if load_from is not None: try: self.load_checkpoint(load_from) except FileNotFoundError: print(f'{load_from} not found. Running default.') else: print('Starting from scratch.') def train(self): remove_log_file() clear_action_logs() eps_n = 0 rewards = [] test_rewards = [] best_reward = -np.inf info = None for eps_n in range(1, self.args.max_eps + 1): # Train loop self.set_mode('train') relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0 state = self.env.reset(relaunch=relaunch, render=False, sampletrack=False) eps_r = 0 sigma = (self.args.start_sigma - self.args.end_sigma) * (max( 0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma, self.action_size) for step in range(self.args.max_eps_time): # Episode action = self.policy_net.get_train_action(state, randomprocess) next_state, reward, done, info = self.env.step(action) self.buffer.push(state, action, reward, next_state, done) state = next_state eps_r += reward if len(self.buffer) > self.args.batch_size: self.update() if done: break rewards.append(eps_r) test_reward = self.test(eps_n) test_rewards.append(test_reward) if test_reward > best_reward: best_reward = test_reward self.save_checkpoint(eps_n, best_reward) info_str = ', '.join( [key for key in info.keys() if key != 'place']) info_str += f", {info['place']}. 
place" log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}' ) if eps_n % self.args.plot_per == 0: self.plot(rewards, test_rewards, eps_n) def update(self): state, action, reward, next_state, done = self.buffer.sample( self.args.batch_size) state = FloatTensor(state).to(self.args.device) next_state = FloatTensor(next_state).to(self.args.device) action = FloatTensor(action).to(self.args.device) reward = FloatTensor(reward).unsqueeze(1).to(self.args.device) done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device) predicted_q_value1 = self.soft_q_net1(state, action) predicted_q_value2 = self.soft_q_net2(state, action) predicted_value = self.value_net(state) new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate( state) # Training Q function target_value = self.target_value_net(next_state) target_q_value = reward + (1 - done) * self.args.gamma * target_value q_value_loss1 = self.soft_q_loss1(predicted_q_value1, target_q_value.detach()) q_value_loss2 = self.soft_q_loss2(predicted_q_value2, target_q_value.detach()) self.soft_q_opt1.zero_grad() q_value_loss1.backward() if self.args.clipgrad: self.clip_grad(self.soft_q_net1.parameters()) self.soft_q_opt1.step() self.soft_q_opt2.zero_grad() q_value_loss2.backward() if self.args.clipgrad: self.clip_grad(self.soft_q_net2.parameters()) self.soft_q_opt2.step() # Training Value function predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action), self.soft_q_net2(state, new_action)) target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum( ) value_loss = self.value_criterion(predicted_value, target_value_func.detach()) self.value_opt.zero_grad() value_loss.backward() if self.args.clipgrad: self.clip_grad(self.value_net.parameters()) self.value_opt.step() # Training Policy function policy_loss = (log_prob - predicted_new_q_value).mean() self.policy_opt.zero_grad() policy_loss.backward() if self.args.clipgrad: self.clip_grad(self.policy_net.parameters()) self.policy_opt.step() # Updating target value network for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.args.soft_tau) + param.data * self.args.soft_tau) def test(self, eps_n): self.set_mode('eval') rewards = [] for step in range(self.args.test_rate): render = (eps_n % 30 == 0) and (step == 0) relaunch = render or ((eps_n % 30 == 0) and (step == 1)) state = self.env.reset(relaunch=relaunch, render=render, sampletrack=False) running_reward = 0 for t in range(self.args.max_eps_time): action = self.policy_net.get_test_action(state) state, reward, done, info = self.env.step(action) store(action, eps_n, reward, info, t == 0) running_reward += reward if done: break rewards.append(running_reward) avg_reward = sum(rewards) / self.args.test_rate return avg_reward def plot(self, rewards, test_rewards, eps_n): torch.save({ 'train_rewards': rewards, 'test_rewards': test_rewards }, f'{self.plot_folder}/{eps_n}.pth') figure = plt.figure() plt.plot(rewards, label='Train Rewards') plt.plot(test_rewards, label='Test Rewards') plt.xlabel('Episode') plt.legend() plt.savefig(f'{self.plot_folder}/{eps_n}.png') try: send_mail(f'Improved Torcs SAC | Episode {eps_n}', f'{self.plot_folder}/{eps_n}.png') log('Mail has been sent.') except (KeyboardInterrupt, SystemExit): print('KeyboardInterrupt or SystemExit') raise except Exception as e: print('Mail Exception occured:', e) emsg = e.args[-1] emsg = emsg[:1].lower() + emsg[1:] 
log('Couldn\'t send mail because', emsg) def clip_grad(self, parameters): for param in parameters: param.grad.data.clamp_(-1, 1) def set_mode(self, mode): if mode == 'train': self.value_net.train() self.target_value_net.train() self.soft_q_net1.train() self.soft_q_net2.train() self.policy_net.train() elif mode == 'eval': self.value_net.eval() self.target_value_net.eval() self.soft_q_net1.eval() self.soft_q_net2.eval() self.policy_net.eval() else: raise ValueError('mode should be either train or eval') def save_checkpoint(self, eps_n, test_reward): self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2, self.policy_net) self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth') log(f'Saved checkpoint at episode {eps_n}.') def load_checkpoint(self, load_from): state_dicts = torch.load(load_from) self.value_net.load_state_dict(state_dicts['best_value']) self.soft_q_net1.load_state_dict(state_dicts['best_q1']) self.soft_q_net2.load_state_dict(state_dicts['best_q2']) self.policy_net.load_state_dict(state_dicts['best_policy']) print(f'Loaded from {load_from}.') def race(self, sampletrack=True): with torch.no_grad(): state = self.env.reset(relaunch=True, render=True, sampletrack=sampletrack) running_reward = 0 done = False while not done: action = self.policy_net.get_test_action(state) state, reward, done, info = self.env.step(action) running_reward += reward print('Reward:', running_reward)
class FQFAgent: def __init__(self, env_name, num_quantiles=32, fqf_factor=0.000001*0.1, ent_coef=0.001, state_embedding_dim=3136, quantile_embedding_dim=64, gamma=0.99, n_frames=4, batch_size=32, buffer_size=1000000, update_period=8, target_update_period=10000): self.env_name = env_name self.num_quantiles = num_quantiles self.state_embedding_dim = state_embedding_dim self.quantile_embedding_dim = quantile_embedding_dim self.k = 1.0 self.ent_coef = ent_coef self.n_frames = n_frames self.action_space = gym.make(self.env_name).action_space.n self.fqf_network = FQFNetwork( action_space=self.action_space, num_quantiles=self.num_quantiles, state_embedding_dim=self.state_embedding_dim, quantile_embedding_dim=self.quantile_embedding_dim) self.target_fqf_network = FQFNetwork( action_space=self.action_space, num_quantiles=self.num_quantiles, state_embedding_dim=self.state_embedding_dim, quantile_embedding_dim=self.quantile_embedding_dim) self._define_network() self.optimizer = tf.keras.optimizers.Adam( lr=0.00015, epsilon=0.01/32) #: fpl; fraction proposal layer self.optimizer_fpl = tf.keras.optimizers.Adam( learning_rate=0.00005 * fqf_factor, epsilon=0.0003125) self.gamma = gamma self.replay_buffer = ReplayBuffer(max_len=buffer_size) self.batch_size = batch_size self.update_period = update_period self.target_update_period = target_update_period self.steps = 0 def _define_network(self): """ initialize network weights """ env = gym.make(self.env_name) frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) state = np.stack(frames, axis=2)[np.newaxis, ...] self.fqf_network(state) self.target_fqf_network(state) self.target_fqf_network.set_weights(self.fqf_network.get_weights()) @property def epsilon(self): if self.steps <= 1000000: return max(0.99 * (1000000 - self.steps) / 1000000, 0.1) elif self.steps <= 2000000: return 0.05 + 0.05 * (2000000 - self.steps) / 2000000 else: return 0.05 def learn(self, n_episodes, logdir="log"): logdir = Path(__file__).parent / logdir if logdir.exists(): shutil.rmtree(logdir) self.summary_writer = tf.summary.create_file_writer(str(logdir)) for episode in range(1, n_episodes+1): env = gym.make(self.env_name) frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) episode_rewards = 0 episode_steps = 0 done = False lives = 5 while not done: self.steps += 1 episode_steps += 1 state = np.stack(frames, axis=2)[np.newaxis, ...] action = self.fqf_network.sample_action(state, epsilon=self.epsilon) next_frame, reward, done, info = env.step(action) episode_rewards += reward frames.append(frame_preprocess(next_frame)) next_state = np.stack(frames, axis=2)[np.newaxis, ...] 
if done: exp = Experience(state, action, reward, next_state, done) self.replay_buffer.push(exp) break else: if info["ale.lives"] != lives: #: life loss as episode ends lives = info["ale.lives"] exp = Experience(state, action, reward, next_state, True) else: exp = Experience(state, action, reward, next_state, done) self.replay_buffer.push(exp) if (len(self.replay_buffer) > 50000) and (self.steps % self.update_period == 0): loss, loss_fp, entropy = self.update_network() with self.summary_writer.as_default(): tf.summary.scalar("loss", loss, step=self.steps) tf.summary.scalar("loss_fp", loss_fp, step=self.steps) tf.summary.scalar("entropy", entropy, step=self.steps) tf.summary.scalar("epsilon", self.epsilon, step=self.steps) tf.summary.scalar("buffer_size", len(self.replay_buffer), step=self.steps) tf.summary.scalar("train_score", episode_rewards, step=self.steps) tf.summary.scalar("train_steps", episode_steps, step=self.steps) #: Target update if self.steps % self.target_update_period == 0: self.target_fqf_network.set_weights( self.fqf_network.get_weights()) print(f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}") if episode % 20 == 0: test_scores, test_steps = self.test_play(n_testplay=1) with self.summary_writer.as_default(): tf.summary.scalar("test_score", test_scores[0], step=self.steps) tf.summary.scalar("test_step", test_steps[0], step=self.steps) if episode % 500 == 0: self.fqf_network.save_weights("checkpoints/fqfnet") print("Model Saved") def update_network(self): (states, actions, rewards, next_states, dones) = self.replay_buffer.get_minibatch(self.batch_size) rewards = rewards.reshape((self.batch_size, 1, 1)) dones = dones.reshape((self.batch_size, 1, 1)) with tf.GradientTape() as tape: #: Compute F(τ^) state_embedded = self.fqf_network.state_embedding_layer(states) taus, taus_hat, taus_hat_probs = self.fqf_network.propose_fractions(state_embedded) taus_hat, taus_hat_probs = tf.stop_gradient(taus_hat), tf.stop_gradient(taus_hat_probs) quantiles = self.fqf_network.quantile_function( state_embedded, taus_hat) actions_onehot = tf.one_hot( actions.flatten().astype(np.int32), self.action_space) actions_mask = tf.expand_dims(actions_onehot, axis=2) quantiles = tf.reduce_sum( quantiles * actions_mask, axis=1, keepdims=True) #: Compute target F(τ^), use same taus proposed by online network next_actions, target_quantiles = self.target_fqf_network.greedy_action_on_given_taus( next_states, taus_hat, taus_hat_probs) next_actions_onehot = tf.one_hot(next_actions.numpy().flatten(), self.action_space) next_actions_mask = tf.expand_dims(next_actions_onehot, axis=2) target_quantiles = tf.reduce_sum( target_quantiles * next_actions_mask, axis=1, keepdims=True) #: TF(τ^) target_quantiles = rewards + self.gamma * (1-dones) * target_quantiles target_quantiles = tf.stop_gradient(target_quantiles) #: Compute Quantile regression loss target_quantiles = tf.repeat( target_quantiles, self.num_quantiles, axis=1) quantiles = tf.repeat( tf.transpose(quantiles, [0, 2, 1]), self.num_quantiles, axis=2) #: huberloss bellman_errors = target_quantiles - quantiles is_smaller_than_k = tf.abs(bellman_errors) < self.k squared_loss = 0.5 * tf.square(bellman_errors) linear_loss = self.k * (tf.abs(bellman_errors) - 0.5 * self.k) huberloss = tf.where(is_smaller_than_k, squared_loss, linear_loss) #: quantile loss indicator = tf.stop_gradient(tf.where(bellman_errors < 0, 1., 0.)) _taus_hat = tf.repeat( tf.expand_dims(taus_hat, axis=2), self.num_quantiles, axis=2) quantile_factors = tf.abs(_taus_hat - 
indicator) quantile_huberloss = quantile_factors * huberloss loss = tf.reduce_mean(quantile_huberloss, axis=2), loss = tf.reduce_sum(loss, axis=1) loss = tf.reduce_mean(loss) state_embedding_vars = self.fqf_network.state_embedding_layer.trainable_variables quantile_function_vars = self.fqf_network.quantile_function.trainable_variables variables = state_embedding_vars + quantile_function_vars grads = tape.gradient(loss, variables) with tf.GradientTape() as tape2: taus_all = self.fqf_network.fraction_proposal_layer(state_embedded) taus = taus_all[:, 1:-1] quantiles = self.fqf_network.quantile_function( state_embedded, taus) taus_hat = (taus_all[:, 1:] + taus_all[:, :-1]) / 2. quantiles_hat = self.fqf_network.quantile_function( state_embedded, taus_hat) dw_dtau = 2 * quantiles - quantiles_hat[:, :, 1:] - quantiles_hat[:, :, :-1] dw_dtau = tf.reduce_sum(dw_dtau * actions_mask, axis=1) entropy = tf.reduce_sum(-1 * taus_hat * tf.math.log(taus_hat), axis=1) loss_fp = tf.reduce_mean(tf.square(dw_dtau), axis=1) loss_fp += -1 * self.ent_coef * entropy loss_fp = tf.reduce_mean(loss_fp) fp_variables = self.fqf_network.fraction_proposal_layer.trainable_variables grads_fp = tape2.gradient(loss_fp, fp_variables) self.optimizer.apply_gradients(zip(grads, variables)) self.optimizer_fpl.apply_gradients(zip(grads_fp, fp_variables)) return loss, loss_fp, tf.reduce_mean(entropy) def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None): if checkpoint_path: env = gym.make(self.env_name) frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) state = np.stack(frames, axis=2)[np.newaxis, ...] self.fqf_network(state) self.fqf_network.load_weights(checkpoint_path) if monitor_dir: monitor_dir = Path(monitor_dir) if monitor_dir.exists(): shutil.rmtree(monitor_dir) monitor_dir.mkdir() env = gym.wrappers.Monitor( gym.make(self.env_name), monitor_dir, force=True, video_callable=(lambda ep: True)) else: env = gym.make(self.env_name) scores = [] steps = [] for _ in range(n_testplay): frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) done = False episode_steps = 0 episode_rewards = 0 while not done: state = np.stack(frames, axis=2)[np.newaxis, ...] action = self.fqf_network.sample_action(state, epsilon=0.01) next_frame, reward, done, _ = env.step(action) frames.append(frame_preprocess(next_frame)) episode_rewards += reward episode_steps += 1 if episode_steps > 500 and episode_rewards < 3: #: ゲーム開始(action: 0)しないまま停滞するケースへの対処 break scores.append(episode_rewards) steps.append(episode_steps) return scores, steps
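For reference, the loss that update_network assembles from bellman_errors, huberloss, indicator, and taus_hat is the standard quantile Huber loss. With Bellman error δ, proposed fraction τ̂, and κ = self.k = 1.0 (the code omits the usual 1/κ normalization, which is harmless when κ = 1), it reads:

\rho^{\kappa}_{\hat\tau}(\delta) = \left| \hat\tau - \mathbb{1}\{\delta < 0\} \right| \, L_{\kappa}(\delta),
\qquad
L_{\kappa}(\delta) =
\begin{cases}
  \tfrac{1}{2}\,\delta^{2} & \text{if } |\delta| \le \kappa \\
  \kappa\left(|\delta| - \tfrac{1}{2}\kappa\right) & \text{otherwise}
\end{cases}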
def main(): seeding() # number of training episodes. number_of_episodes = 5000 episode_length = 1000 batchsize = 2000 t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) # env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe') env = UnityEnvironment('Tennis_Windows_x86_64/Tennis.exe', no_graphics=True) brain_name = env.brain_names[0] brain = env.brains[brain_name] env_info = env.reset(train_mode=True)[brain_name] num_agents = len(env_info.agents) replay_episodes = 1000 buffer = ReplayBuffer(int(replay_episodes * episode_length)) # initialize policy and critic maddpg = MADDPG() # logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] # training loop scores_deque = deque(maxlen=100) scores = [] for episode in range(0, number_of_episodes): reward_this_episode = np.zeros(num_agents) env_info = env.reset(True)[brain_name] state = env_info.vector_observations obs = [[state[0], state[1]]] obs_full = np.concatenate((state[0], state[1])) #for calculating rewards for this particular episode - addition of all time steps frames = [] tmax = 0 for episode_t in range(episode_length): t += 1 # explore = only explore for a certain number of episodes # action input needs to be transposed actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction actions_array = torch.stack(actions).detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents # actions_for_env = np.rollaxis(actions_array,1) actions_for_env = np.clip(actions_array.flatten(), -1, 1) # print(actions_for_env) # step forward one frame # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env) env_info = env.step(actions_for_env)[brain_name] next_state = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done next_obs = [[next_state[0], next_state[1]]] next_obs_full = np.concatenate((next_state[0], next_state[1])) # print(obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones # add data to buffer transition = ([obs], [obs_full], [actions_for_env], [rewards], [next_obs], [next_obs_full], [dones]) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full if any(dones): break # update once after every episode_per_update if len(buffer) > batchsize and episode % episode_per_update == 0: for a_i in range(num_agents): samples = buffer.sample(batchsize) maddpg.update(samples, a_i) maddpg.update_targets( ) #soft update the target network towards the actual networks avg_rewards = np.mean(reward_this_episode, axis=0) episode_reward = np.max(avg_rewards) scores_deque.append(episode_reward) scores.append(episode_reward) print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'. format(episode, np.mean(scores_deque), episode_reward), end="") if (episode > 0 and episode % 100 == 0) or episode == number_of_episodes - 1: print('\rEpisode {}\tAverage Score: {:.3f}\tEpisode Score: {:.3f}'. 
format(episode, np.mean(scores_deque), episode_reward)) if np.mean(scores_deque) >= 0.5: print('\nSuccess!') break #saving model save_dict_list = [] for i in range(num_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) env.close() fig = plt.figure() ax = fig.add_subplot(111) plt.ylabel('Score') plt.xlabel('Episode #') plt.plot(np.arange(1, len(scores) + 1), scores) plt.savefig('tennis_score_history.png') return scores
def main(): seeding() # number of parallel agents number_of_agents = 2 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 5000 max_t = 1000 batchsize = 128 # amplitude of OU noise # this slowly decreases to 0 noise = 1 noise_reduction = 0.9999 tau = 1e-3 # soft update factor gamma = 0.99 # reward discount factor # how many episodes before update episode_per_update = 2 model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) # do we need to set multi-thread for this env? torch.set_num_threads(number_of_agents * 2) env = TennisEnv() # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(1e5)) # initialize policy and critic maddpg = MADDPG(discount_factor=gamma, tau=tau) # training loop scores_window = deque(maxlen=100) ep_scores = [] # when to save: use a dictionary to track if a model at a given score (key/10) has been saved. save_on_scores = { 5: False, 6: False, 9: False, 10: False, 11: False, 12: False, 13: False, 14: False, 15: False, 16: False, 17: False, 18: False, 19: False, 20: False } agent0_reward = [] agent1_reward = [] for episode in range(0, number_of_episodes): reward_this_episode = np.zeros((1, number_of_agents)) obs, obs_full, env_info = env.reset() for agent in maddpg.maddpg_agent: agent.noise.reset() for episode_t in range(max_t): # explore = only explore for a certain number of episodes # action input needs to be transposed #print('Obs:', obs) actions = maddpg.act(torch.tensor(obs, dtype=torch.float), noise=noise) #print(actions) #if noise>0.01: noise *= noise_reduction actions_for_env = torch.stack(actions).detach().numpy() # step forward one frame next_obs, next_obs_full, rewards, dones, info = env.step( actions_for_env) # add data to buffer buffer.push(obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) reward_this_episode += rewards obs = np.copy(next_obs) obs_full = np.copy(next_obs_full) # update once after every episode_per_update if len( buffer ) > batchsize and episode > 0 and episode % episode_per_update == 0: for a_i in range(number_of_agents): samples = buffer.sample(batchsize) maddpg.update(samples, a_i) if np.any(dones): break agent0_reward.append(reward_this_episode[0, 0]) agent1_reward.append(reward_this_episode[0, 1]) avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1]) scores_window.append(avg_rewards) cur_score = np.mean(scores_window) ep_scores.append(cur_score) print( '\rEpisode:{}, Rwd:{:.3f} vs. {:.3f}, Average Score:{:.4f}, Noise:{:.4f}' .format(episode, reward_this_episode[0, 0], reward_this_episode[0, 1], cur_score, noise)) #saving model save_dict_list = [] save_info = False score_code = int(cur_score * 10) if score_code in save_on_scores.keys(): if not (save_on_scores[score_code]): save_on_scores[score_code] = True save_info = True if save_info: for i in range(number_of_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join( model_dir, 'episode-{}-{}.pt'.format(episode, score_code))) np.savez('scores-{}-{}.npz'.format(episode, score_code), agent0_reward=np.array(agent0_reward), agent1_reward=np.array(agent1_reward), avg_max_scores=np.array(ep_scores)) env.close()
class DDPGAgent: """ DDPG Agent, valid for continuous actioin space """ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #loss_fn = nn.MSELoss() loss_fn = nn.SmoothL1Loss() iter = 0 def __init__(self, func1, func2, o_dim, a_dim, h_dim, initialize_weights = False, lr_actor = 1e-3, lr_critic = 1e-3, batch_size = 16, gamma = 0.99, tau = 0.001, buffer_size = int(1e5), seed = 1234): """ func1: actor model func2: critic model o_dim/c_dim: observation space dimension/ # of channels when image as input a_dim: action space dimension """ self.o_dim = o_dim self.a_dim = a_dim self.h_dim = h_dim self.initialize_weights = initialize_weights self.lr_actor = lr_actor self.lr_critic = lr_critic self.batch_size = batch_size self.gamma = gamma self.tau = tau self.buffer_size = buffer_size self.seed = seed # Replay memory self.buffer = ReplayBuffer(buffer_size, batch_size, seed) # Actor Network (w/ Target Network) self.actor = func1(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device) self.target_actor = func1(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr = lr_actor) # Critic Network (w/ Target Network) self.critic = func2(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device) self.target_critic = func2(o_dim , a_dim, h_dim, initialize_weights, seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr = lr_critic) # Noise process self.noise = OUNoise(a_dim) def get_action1(self, state, eps = 0.): """ action value ranges from -1 to 1 -- eps = 0. no exploration > 0. add exploration """ state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) self.actor.eval() with torch.no_grad(): action = self.actor(state_tensor)[0].detach().cpu().numpy() self.actor.train() action += self.noise.sample() * eps return np.clip(action, -1, 1) def get_action2(self, state, eps = 0.): """ slimevolly gym environment --- multibinary action space (although the action space is multi-binary, float vectors are accepted) forward = True if action[0]>0 else False backward = True if action[1]>0 elseTrue False jump = True if action[2]>0 else True False -- eps = 0. no exploration > 0. add exploration """ if random.random() > eps: state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) self.actor.eval() with torch.no_grad(): logits = self.actor(state_tensor).squeeze() action = torch.where(logits>0,torch.ones_like(logits),torch.zeros_like(logits)) self.actor.train() return action.detach().cpu().numpy() else: action = [random.choice([0,1]) for _ in range(self.a_dim)] return np.asarray(action, dtype = np.float32) def update(self, experiences): states, actions, rewards, next_states, dones = experiences states = torch.FloatTensor(states).to(self.device) next_states = torch.FloatTensor(next_states).to(self.device) actions = torch.FloatTensor(actions).to(self.device) rewards = torch.FloatTensor(rewards).view(-1, 1).to(self.device) dones = torch.FloatTensor(dones).view(-1, 1).to(self.device) self.iter += 1 # ---------------------------- update critic ---------------------------- # next_actions = self.target_actor(next_states) Q_next = self.target_critic(next_states, next_actions) Q_targets = rewards + self.gamma * Q_next * (1. 
-dones) Q_expected = self.critic(states, actions) critic_loss = self.loss_fn(Q_expected, Q_targets.detach()) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # pred_actions = self.actor(states) actor_loss = -self.critic(states, pred_actions).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() def update_targets(self): soft_update(self.target_actor, self.actor, self.tau) soft_update(self.target_critic, self.critic, self.tau) def step(self, state, action, reward, next_state, done): self.buffer.push(state, action, reward, next_state, done) if (len(self.buffer) > self.batch_size): experiences = self.buffer.sample() self.update(experiences) self.update_targets() self.iter += 1 def reset(self): self.noise.reset()
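# DDPGAgent above relies on a soft_update helper and an OUNoise exploration process that are
# defined elsewhere. The sketches below are inferred from how they are called
# (soft_update(target, source, tau), OUNoise(a_dim).sample()/reset()); they are plausible
# minimal versions, not the author's exact code.
import copy

import numpy as np


def soft_update(target, source, tau):
    """Polyak update: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck process, a common exploration noise for DDPG-style agents."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = copy.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state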
def main(): seeding() # number of parallel agents number_of_agents = 2 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 3000 batchsize = 128 # amplitude of OU noise # this slowly decreases to 0 noise = 1 noise_reduction = 0.9999 tau = 1e-3 # soft update factor gamma = 0.99 # reward discount factor print_every = 100 # how many episodes before update episode_per_update = 2 # model_dir is used below when checkpoints are saved model_dir = os.getcwd()+"/model_dir" os.makedirs(model_dir, exist_ok=True) result_dir = os.getcwd()+"/result_dir" os.makedirs(result_dir, exist_ok=True) # do we need to set multi-thread for this env? torch.set_num_threads(number_of_agents*2) env = TennisEnv() # replay buffer of up to 1e5 transitions buffer = ReplayBuffer(int(1e5)) num_agents, num_states, num_actions = env.get_shapes() # initialize policy and critic maddpg = MADDPG(num_agents, num_states, num_actions, discount_factor=gamma, tau=tau) # training loop scores_window = deque(maxlen=100) ep_scores = [] agent0_reward = [] agent1_reward = [] for episode in range(0, number_of_episodes): reward_this_episode = np.zeros((1, number_of_agents)) states, states_full, env_info = env.reset() for agent in maddpg.maddpg_agent: agent.noise.reset() while True: actions = maddpg.act(torch.tensor(states, dtype=torch.float), noise=noise) noise *= noise_reduction actions_for_env = torch.stack(actions).detach().numpy() # step forward one frame next_states, next_states_full, rewards, dones, info = env.step(actions_for_env) # add data to buffer buffer.push(states, states_full, actions_for_env, rewards, next_states, next_states_full, dones) reward_this_episode += rewards states = np.copy(next_states) states_full = np.copy(next_states_full) # update once the buffer holds enough samples if len(buffer) > batchsize: for a_i in range(number_of_agents): samples = buffer.sample(batchsize) maddpg.update(samples, a_i) if np.any(dones): break agent0_reward.append(reward_this_episode[0, 0]) agent1_reward.append(reward_this_episode[0, 1]) avg_rewards = max(reward_this_episode[0, 0], reward_this_episode[0, 1]) scores_window.append(avg_rewards) cur_score = np.mean(scores_window) ep_scores.append(cur_score) save_dict_list = [] if episode % print_every == 0 or avg_rewards > 2.5: print('\rEpisode: {}, Average score: {:.5f}, noise: {:.5f}'.format(episode, cur_score, noise)) if avg_rewards > 2.5: for i in range(number_of_agents): save_dict = {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save(save_dict_list, os.path.join(model_dir, 'episode-{}-{}.pt'.format(episode, cur_score))) print('model saved') break env.close() #print('main-ep_scores: ', ep_scores) fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(1, len(ep_scores)+1), ep_scores) plt.ylabel('Score') plt.xlabel('Episode #') fig.savefig(result_dir + '/score_plot.png')
def train(): seeding() os.environ["CUDA_VISIBLE_DEVICES"] = "1" print("GPU available: {}".format(torch.cuda.is_available())) print("GPU tensor test: {}".format(torch.rand(3, 3).cuda())) env = UnityEnvironment( file_name= '/home/slavo/Dev/deep-rl-projects/ma_collab-compet/Tennis_Linux/Tennis.x86_64', no_graphics=True) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions action_size = brain.vector_action_space_size print('Number of actions:', action_size) # examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) agents = len(env_info.agents) # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 30000 episode_length = 500 # how many steps before update steps_per_update = 100 # amplitude of OU noise # this slowly decreases to 0 noise = 1 noise_reduction = 0.9999 torch.set_num_threads(4) buffer = ReplayBuffer(BUFFER_SIZE) # initialize policy and critic maddpg_agent = MADDPG(state_size, action_size, agents) scores = [] scores_window = deque(maxlen=100) # last 100 scores actor_losses = [] critic_losses = [] for i in range(len(env_info.agents)): actor_losses.append([]) critic_losses.append([]) for episode in range(0, number_of_episodes): episode_rewards = [] env_info = env.reset(train_mode=True)[brain_name] state = env_info.vector_observations state_full = np.concatenate(state) # for calculating rewards for this particular episode - addition of all time steps for episode_t in range(episode_length + 1): actions = maddpg_agent.act(transpose_to_tensor(list(state)), noise=noise) noise *= noise_reduction actions = torch.stack(actions).view(-1).detach().cpu().numpy() env_info = env.step(actions)[brain_name] state_next = env_info.vector_observations # get the next state state_next_full = np.concatenate(state_next) rewards = env_info.rewards # get the reward dones = env_info.local_done # see if episode has finished # add experiences to buffer transition = (state, state_full, actions, rewards, state_next, state_next_full, dones) buffer.push(transition) episode_rewards.append(rewards) state, state_full = state_next, state_next_full # update once after every steps_per_update if len(buffer) > BATCH_SIZE and (episode_t > 0) and ( episode_t % steps_per_update == 0): # print('maddpg update after {} steps'.format(episode_t)) for agent_idx in range(len(env_info.agents)): samples = buffer.sample(BATCH_SIZE) al, cl = maddpg_agent.update(samples, agent_idx) actor_losses[agent_idx].append(al) critic_losses[agent_idx].append(cl) maddpg_agent.update_targets( ) # soft update the target network towards the actual networks # calculate agent episode rewards agent_episode_rewards = [] for i in range(len(env_info.agents)): agent_episode_reward = 0 for step in episode_rewards: agent_episode_reward += step[i] agent_episode_rewards.append(agent_episode_reward) scores.append(np.max(agent_episode_rewards)) scores_window.append(np.max(agent_episode_rewards)) if episode > 10 and episode % 10 == 0: print( '\rEpisode {}\tAgent Rewards [{:.4f}\t{:.4f}]\tMax Reward {:.4f}' .format(episode, agent_episode_rewards[0], agent_episode_rewards[1], np.max(agent_episode_rewards))) print( '\rEpisode {}\tAverage Actor 1 Loss {:.6f}\tAverage Critic 1 Loss {:.6f}' '\tAverage 
Actor 2 Loss {:.6f}\tAverage Critic 2 Loss {:.6f}'. format(episode, np.mean(actor_losses[0]), np.mean(critic_losses[0]), np.mean(actor_losses[1]), np.mean(critic_losses[1]))) print('\rEpisode {}\tAverage Score: {:.4f}'.format( episode, np.mean(scores_window))) # reset losses actor_losses = [] critic_losses = [] for i in range(len(env_info.agents)): actor_losses.append([]) critic_losses.append([]) if episode > 100 and episode % 100 == 0: print('\rEpisode {}\tAverage Score: {:.4f}'.format( episode, np.mean(scores_window))) if episode > 100 and np.mean(scores_window) >= 0.5: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}' .format(episode - 100, np.mean(scores_window))) for i, save_agent in enumerate(maddpg_agent.agents): torch.save(save_agent.actor.state_dict(), './checkpoints/checkpoint_actor_' + str(i) + '.pth') torch.save( save_agent.critic.state_dict(), './checkpoints/checkpoint_critic_' + str(i) + '.pth') break env.close() return scores
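# train() above hands observations to the actors via transpose_to_tensor(), which is not
# included in this file. A minimal sketch consistent with that call site (assumed, not the
# original helper): it regroups a list of per-step items into one float tensor per agent.
import torch


def transpose_to_tensor(input_list):
    make_tensor = lambda x: torch.tensor(x, dtype=torch.float)
    return list(map(make_tensor, zip(*input_list)))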
class DQNAgent: def __init__(self, env_name="BreakoutDeterministic-v4", gamma=0.99, batch_size=32, lr=0.00025, update_period=4, target_update_period=10000, n_frames=4): self.env_name = env_name self.gamma = gamma self.batch_size = batch_size self.epsilon_scheduler = ( lambda steps: max(1.0 - 0.9 * steps / 1000000, 0.1)) self.update_period = update_period self.target_update_period = target_update_period env = gym.make(self.env_name) self.action_space = env.action_space.n self.qnet = QNetwork(self.action_space) self.target_qnet = QNetwork(self.action_space) self.optimizer = Adam(lr=lr, epsilon=0.01 / self.batch_size) self.n_frames = n_frames self.use_reward_clipping = True self.huber_loss = tf.keras.losses.Huber() def learn(self, n_episodes, buffer_size=1000000, logdir="log"): logdir = Path(__file__).parent / logdir if logdir.exists(): shutil.rmtree(logdir) self.summary_writer = tf.summary.create_file_writer(str(logdir)) self.replay_buffer = ReplayBuffer(max_len=buffer_size) steps = 0 for episode in range(1, n_episodes + 1): env = gym.make(self.env_name) frame = preprocess_frame(env.reset()) frames = collections.deque([frame] * self.n_frames, maxlen=self.n_frames) episode_rewards = 0 episode_steps = 0 done = False lives = 5 while not done: steps, episode_steps = steps + 1, episode_steps + 1 epsilon = self.epsilon_scheduler(steps) state = np.stack(frames, axis=2)[np.newaxis, ...] action = self.qnet.sample_action(state, epsilon=epsilon) next_frame, reward, done, info = env.step(action) episode_rewards += reward frames.append(preprocess_frame(next_frame)) next_state = np.stack(frames, axis=2)[np.newaxis, ...] if info["ale.lives"] != lives: lives = info["ale.lives"] transition = (state, action, reward, next_state, True) else: transition = (state, action, reward, next_state, done) self.replay_buffer.push(transition) if len(self.replay_buffer) > 50000: if steps % self.update_period == 0: loss = self.update_network() with self.summary_writer.as_default(): tf.summary.scalar("loss", loss, step=steps) tf.summary.scalar("epsilon", epsilon, step=steps) tf.summary.scalar("buffer_size", len(self.replay_buffer), step=steps) tf.summary.scalar("train_score", episode_rewards, step=steps) tf.summary.scalar("train_steps", episode_steps, step=steps) if steps % self.target_update_period == 0: self.target_qnet.set_weights(self.qnet.get_weights()) if done: break print( f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}" ) if episode % 20 == 0: test_scores, test_steps = self.test_play(n_testplay=1) with self.summary_writer.as_default(): tf.summary.scalar("test_score", test_scores[0], step=steps) tf.summary.scalar("test_step", test_steps[0], step=steps) if episode % 1000 == 0: self.qnet.save_weights("checkpoints/qnet") def update_network(self): #: ミニバッチの作成 (states, actions, rewards, next_states, dones) = self.replay_buffer.get_minibatch(self.batch_size) if self.use_reward_clipping: rewards = np.clip(rewards, -1, 1) next_actions, next_qvalues = self.target_qnet.sample_actions( next_states) next_actions_onehot = tf.one_hot(next_actions, self.action_space) max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot, axis=1, keepdims=True) target_q = rewards + self.gamma * (1 - dones) * max_next_qvalues with tf.GradientTape() as tape: qvalues = self.qnet(states) actions_onehot = tf.one_hot(actions.flatten().astype(np.int32), self.action_space) q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True) loss = self.huber_loss(target_q, q) grads = tape.gradient(loss, 
self.qnet.trainable_variables) self.optimizer.apply_gradients( zip(grads, self.qnet.trainable_variables)) return loss def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None): if checkpoint_path: env = gym.make(self.env_name) frame = preprocess_frame(env.reset()) frames = collections.deque([frame] * self.n_frames, maxlen=self.n_frames) state = np.stack(frames, axis=2)[np.newaxis, ...] self.qnet(state) self.qnet.load_weights(checkpoint_path) if monitor_dir: monitor_dir = Path(monitor_dir) if monitor_dir.exists(): shutil.rmtree(monitor_dir) monitor_dir.mkdir() env = gym.wrappers.Monitor(gym.make(self.env_name), monitor_dir, force=True, video_callable=(lambda ep: True)) else: env = gym.make(self.env_name) scores = [] steps = [] for _ in range(n_testplay): frame = preprocess_frame(env.reset()) frames = collections.deque([frame] * self.n_frames, maxlen=self.n_frames) done = False episode_steps = 0 episode_rewards = 0 while not done: state = np.stack(frames, axis=2)[np.newaxis, ...] action = self.qnet.sample_action(state, epsilon=0.05) next_frame, reward, done, _ = env.step(action) frames.append(preprocess_frame(next_frame)) episode_rewards += reward episode_steps += 1 if episode_steps > 500 and episode_rewards < 3: #: guard against episodes that stall because the game is never started (action: 0) break scores.append(episode_rewards) steps.append(episode_steps) return scores, steps
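# DQNAgent stacks the output of preprocess_frame() along the channel axis, so the helper
# presumably turns a raw Atari RGB frame into a single-channel 84x84 array. It is not
# defined here; the version below is a typical implementation and only a guess at the
# original.
import numpy as np
from PIL import Image


def preprocess_frame(frame):
    """Grayscale, resize to 84x84 and scale pixel values to [0, 1]."""
    img = Image.fromarray(frame).convert("L").resize((84, 84))
    return np.asarray(img, dtype=np.float32) / 255.0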
def train(env, model_path='model_dir', number_of_episodes=50000, episode_length=500): noise = 1.0 noise_reduction = 1.0 batchsize = 256 model_dir = os.getcwd() + "/" + model_path model_files = glob.glob(model_dir + "/*.pt") for file in model_files: os.remove(file) os.makedirs(model_dir, exist_ok=True) buffer = ReplayBuffer(int(1e5)) rewards_deque = deque(maxlen=100) rewards_total = [] # initialize policy and critic maddpg = MADDPG() for episode in range(1, number_of_episodes + 1): rewards_this_episode = np.asarray([0.0, 0.0]) env_info = env.reset(train_mode=True)[brain_name] obs = env_info.vector_observations for episode_t in range(episode_length): actions = maddpg.act(obs, noise=noise) noise *= noise_reduction env_info = env.step(actions)[brain_name] next_obs = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done # add data to buffer transition = (obs, actions, rewards, next_obs, dones) buffer.push(transition) rewards_this_episode += rewards obs = next_obs if any(dones): break # update once after every episode_per_update if len(buffer) > batchsize * 4: for _ in range(4): for a_i in range(num_agents): samples = buffer.sample(batchsize) maddpg.update(samples, a_i) maddpg.update_targets( ) # soft update the target network towards the actual networks rewards_total.append(np.max(rewards_this_episode)) rewards_deque.append(rewards_total[-1]) average_score = np.mean(rewards_deque) print(episode, rewards_this_episode, rewards_total[-1], average_score) # saving model save_dict_list = [] if episode % 1000 == 0: for i in range(2): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) torch.save(maddpg.maddpg_agent[0].actor.state_dict(), 'actor0.pt') torch.save(maddpg.maddpg_agent[1].actor.state_dict(), 'actor1.pt') torch.save(maddpg.maddpg_agent[0].critic.state_dict(), 'critic0.pt') torch.save(maddpg.maddpg_agent[1].critic.state_dict(), 'critic1.pt') return rewards_total
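# train() above reads brain_name and num_agents as globals that are never set inside the
# function. They would normally be created once when the Unity environment is opened,
# roughly as below (the file path is a placeholder and the import assumes the usual
# unityagents package used with this Tennis environment).
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64")
brain_name = env.brain_names[0]
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)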
def run(config): data_folder = Path(config.data_path) building_attributes = data_folder / 'building_attributes.json' solar_profile = data_folder / 'solar_generation_1kW.csv' building_state_actions = 'buildings_state_action_space.json' # building_ids = ["Building_" + str(i) for i in range(1, config.num_buildings + 1)] config.num_buildings = 6 # customized log directory hidden = config.hidden_dim lr = config.lr tau = config.tau gamma = config.gamma batch_size = config.batch_size buffer_length = config.buffer_length to_print = lambda x: str(x) log_path = "log"+"_hidden"+to_print(hidden)+"_lr"+to_print(lr)+"_tau"+to_print(tau)+"_gamma"+to_print(gamma)+\ "_batch_size"+to_print(batch_size)+"_buffer_length"+to_print(buffer_length)+"_TIME_PERIOD_1008_MAXACTION_25"+"/" logger = SummaryWriter(log_dir=log_path) # TODO fix here building_ids = ["Building_" + str(i) for i in [1, 2, 5, 6, 7, 8]] #[1,2,5,6,7,8] env = CityLearn(building_attributes, solar_profile, building_ids, buildings_states_actions=building_state_actions, cost_function=[ 'ramping', '1-load_factor', 'peak_to_valley_ratio', 'peak_demand', 'net_electricity_consumption' ]) observations_spaces, actions_spaces = env.get_state_action_spaces() # Instantiating the control agent(s) if config.agent_alg == 'MADDPG': agents = MA_DDPG(observations_spaces, actions_spaces, hyper_params=vars(config)) else: raise NotImplementedError k, c = 0, 0 cost, cum_reward = {}, {} buffer = ReplayBuffer(max_steps=config.buffer_length, num_agents=config.num_buildings, obs_dims=[s.shape[0] for s in observations_spaces], ac_dims=[a.shape[0] for a in actions_spaces]) # TODO: store np or tensor in buffer? start = time.time() for e in range(config.n_episodes): cum_reward[e] = 0 rewards = [] state = env.reset() statecast = lambda x: [torch.FloatTensor(s) for s in x] done = False ss = 0 while not done: if k % (40000 * 4) == 0: print('hour: ' + str(k) + ' of ' + str(TIME_PERIOD * config.n_episodes)) action = agents.select_action(statecast(state), explore=False) action = [a.detach().numpy() for a in action] # if batch norm: action = [np.squeeze(a, axis=0) for a in action] ss += 1 #print("action is ", action) #print(action[0].shape) #raise NotImplementedError next_state, reward, done, _ = env.step(action) reward = reward_function( reward) # See comments in reward_function.py #buffer_reward = [-r for r in reward] # agents.add_to_buffer() buffer.push(statecast(state), action, reward, statecast(next_state), done) # if (len(buffer) >= config.batch_size and # (e % config.steps_per_update) < config.n_rollout_threads): if len(buffer) >= config.batch_size: if USE_CUDA: agents.to_train(device='gpu') else: agents.to_train(device='cpu') for a_i in range(agents.n_buildings): sample = buffer.sample(config.batch_size, to_gpu=USE_CUDA) agents.update(sample, a_i, logger=logger, global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag='net electric consumption', scalar_value=env.net_electric_consumption[-1], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag='env cost total', scalar_value=env.cost()['total'], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag="1 load factor", scalar_value=env.cost()['1-load_factor'], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag="peak to valley ratio", scalar_value=env.cost()['peak_to_valley_ratio'], global_step=e * TIME_PERIOD + ss) logger.add_scalar(tag="peak demand", scalar_value=env.cost()['peak_demand'], global_step=e * TIME_PERIOD + ss) logger.add_scalar( tag="net energy consumption", 
scalar_value=env.cost()['net_electricity_consumption'], global_step=e * TIME_PERIOD + ss) net_energy_consumption_wo_storage = env.net_electric_consumption[ -1] + env.electric_generation[ -1] - env.electric_consumption_cooling_storage[ -1] - env.electric_consumption_dhw_storage[-1] logger.add_scalar(tag="net energy consumption without storage", scalar_value=net_energy_consumption_wo_storage, global_step=e * TIME_PERIOD + ss) for id, r in enumerate(reward): logger.add_scalar(tag="agent {} reward ".format(id), scalar_value=r, global_step=e * TIME_PERIOD + ss) state = next_state cum_reward[e] += reward[0] k += 1 cur_time = time.time() # print("average time : {}s/iteration at iteration {}".format((cur_time - start) / (60.0 * k), k)) cost[e] = env.cost() if c % 1 == 0: print(cost[e]) # add env total cost and reward logger logger.add_scalar(tag='env cost total final', scalar_value=env.cost()['total'], global_step=e) logger.add_scalar(tag="1 load factor final", scalar_value=env.cost()['1-load_factor'], global_step=e) logger.add_scalar(tag="peak to valley ratio final", scalar_value=env.cost()['peak_to_valley_ratio'], global_step=e) logger.add_scalar(tag="peak demand final", scalar_value=env.cost()['peak_demand'], global_step=e) logger.add_scalar( tag="net energy consumption final", scalar_value=env.cost()['net_electricity_consumption'], global_step=e) net_energy_consumption_wo_storage = env.net_electric_consumption[ -1] + env.electric_generation[ -1] - env.electric_consumption_cooling_storage[ -1] - env.electric_consumption_dhw_storage[-1] logger.add_scalar(tag="net energy consumption without storage", scalar_value=net_energy_consumption_wo_storage, global_step=e) c += 1 rewards.append(reward) end = time.time() print((end - start) / 60.0)
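# run() above depends on module-level TIME_PERIOD and USE_CUDA constants that are not shown.
# A sketch of the assumed globals: the log directory name above ("_TIME_PERIOD_1008_")
# suggests TIME_PERIOD is 1008 simulated hours, but treat both values as placeholders.
import torch

TIME_PERIOD = 1008
USE_CUDA = torch.cuda.is_available()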
def maddpg(n_episodes=50000, max_t=1000, print_every=100, batchsize=128): seeding() buffer = ReplayBuffer(int(50000 * max_t)) noise = 2 noise_reduction = 0.9999 scores_deque = deque(maxlen=print_every) scores = [] for i_episode in range(1, n_episodes + 1): scores_agents = np.zeros(num_agents) env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations while True: # agent chooses actions states_converted_to_tensor = convert_to_tensor(states) actions = agent.act(states_converted_to_tensor, noise=noise) noise *= noise_reduction actions_array = torch.stack(actions).detach().numpy() # environment takes action and returns new states and rewards env_info = env.step(actions_array)[brain_name] next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done # store in shared replay buffer experience = (states, actions_array, rewards, next_states, dones) buffer.push(experience) # update agent with experience sample if len(buffer) > batchsize: for a_i in range(2): samples = buffer.sample(batchsize) agent.update(samples, a_i) agent.update_targets( ) # soft update the target network towards the actual networks # update episode score with agent rewards scores_agents += rewards states = next_states if np.any(dones): break scores_deque.append(np.max(scores_agents)) scores.append(np.max(scores_agents)) print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_deque)), end="") if i_episode % print_every == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format( i_episode, np.mean(scores_deque))) if np.mean(scores_deque) >= 0.5 and i_episode >= 100: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, np.mean(scores_deque))) for i, maddpg_agent in zip(range(num_agents), agent.maddpg_agent): torch.save(maddpg_agent.actor.state_dict(), 'checkpoint_actor_{}.pth'.format(i)) torch.save(agent.critic.state_dict(), 'checkpoint_critic.pth') break return scores
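# maddpg() above converts the raw observations with convert_to_tensor() before calling
# agent.act(). The helper is not shown; a minimal sketch consistent with that usage
# (an assumption, not the original): one float tensor per agent row of the observation
# array.
import torch


def convert_to_tensor(states):
    return [torch.tensor(s, dtype=torch.float) for s in states]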
def main(): seeding() number_of_episodes = 20000 episode_length = 1000 batchsize = 256 save_interval = 1000 rewards_deque = deque(maxlen=100) rewards_all = [] noise = 1.0 noise_reduction = 1.0 log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) """ Info about the UnityEnvironment brain_name: 'TennisBrain' brain: ['brain_name', 'camera_resolutions', 'num_stacked_vector_observations', 'number_visual_observations', 'vector_action_descriptions', 'vector_action_space_size', 'vector_action_space_type', 'vector_observation_space_size', 'vector_observation_space_type']] """ env = UnityEnvironment(file_name="Tennis.app") brain_name = env.brain_names[0] brain = env.brains[brain_name] buffer = ReplayBuffer(int(1e5)) # initialize policy and critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) # ------------------------------ training ------------------------------ # # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() for episode in range(1, number_of_episodes + 1): timer.update(episode) rewards_this_episode = np.zeros((2, )) """ Info about the UnityEnvironment env_info: ['agents', 'local_done', 'max_reached', 'memories', 'previous_text_actions', 'previous_vector_actions', 'rewards', 'text_observations', 'vector_observations', 'visual_observations'] actions: List(num_agents=2, action_size=2) states: List((24,), (24,)) rewards: List(2,) dones: List(2,) """ env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations for episode_t in range(episode_length): # reset the OUNoise for each agent. 
for i in range(2): maddpg.maddpg_agent[i].noise.reset() actions = maddpg.act(states, noise=noise) env_info = env.step(actions)[brain_name] noise *= noise_reduction next_states = env_info.vector_observations rewards = env_info.rewards dones = env_info.local_done # add data to buffer transition = (states, actions, rewards, next_states, dones) buffer.push(transition) rewards_this_episode += rewards states = next_states if any(dones): break # update the local and target network if len(buffer) > batchsize: # update the local network for _ in range(5): for a_i in range(2): samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) # soft update the target network maddpg.update_targets() rewards_all.append(rewards_this_episode) rewards_deque.append(np.max(rewards_this_episode)) average_score = np.mean(rewards_deque) # --------------------- Logging for TensorBoard --------------------- # logger.add_scalars('rewards', { 'agent0': rewards_this_episode[0], 'agent1': rewards_this_episode[1] }, episode) logger.add_scalars('global', { 'score': np.max(rewards_this_episode), 'average_score': average_score }, episode) # -------------------------- Save the model -------------------------- # save_dict_list = [] if episode % save_interval == 0 or average_score >= 0.5: for i in range(2): save_dict = \ {'actor_params' : maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params' : maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params' : maddpg.maddpg_agent[i].critic_optimizer.state_dict()} save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) if average_score >= 3.0: print('\nEnvironment solved in {} episodes!'.format(episode - 100)) print('\nAverage Score: {:.2f}'.format(average_score)) break env.close() logger.close() timer.finish()
class SAC: MAX_EXPERIENCES = 100000 MIN_EXPERIENCES = 512 UPDATE_PERIOD = 4 GAMMA = 0.99 TAU = 0.005 BATCH_SIZE = 256 def __init__(self, env_id, action_space, action_bound): self.env_id = env_id self.action_space = action_space self.action_bound = action_bound self.env = gym.make(self.env_id) self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES) self.policy = GaussianPolicy(action_space=self.action_space, action_bound=self.action_bound) self.duqlqnet = DualQNetwork() self.target_dualqnet = DualQNetwork() self.log_alpha = tf.Variable(0.) #: alpha=1 self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4) self.target_entropy = -0.5 * self.action_space self.global_steps = 0 self._initialize_weights() def _initialize_weights(self): """1度callすることでネットワークの重みを初期化 """ env = gym.make(self.env_id) dummy_state = env.reset() dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32) dummy_action = np.random.normal(0, 0.1, size=self.action_space) dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32) self.policy(dummy_state) self.duqlqnet(dummy_state, dummy_action) self.target_dualqnet(dummy_state, dummy_action) self.target_dualqnet.set_weights(self.duqlqnet.get_weights()) def play_episode(self): episode_reward = 0 local_steps = 0 done = False state = self.env.reset() while not done: action, _ = self.policy.sample_action(np.atleast_2d(state)) action = action.numpy()[0] next_state, reward, done, _ = self.env.step(action) exp = Experience(state, action, reward, next_state, done) self.replay_buffer.push(exp) state = next_state episode_reward += reward local_steps += 1 self.global_steps += 1 if (len(self.replay_buffer) >= self.MIN_EXPERIENCES and self.global_steps % self.UPDATE_PERIOD == 0): self.update_networks() return episode_reward, local_steps, tf.exp(self.log_alpha) def update_networks(self): (states, actions, rewards, next_states, dones) = self.replay_buffer.get_minibatch(self.BATCH_SIZE) alpha = tf.math.exp(self.log_alpha) #: Update Q-function next_actions, next_logprobs = self.policy.sample_action(next_states) target_q1, target_q2 = self.target_dualqnet(next_states, next_actions) target = rewards + (1 - dones) * self.GAMMA * ( tf.minimum(target_q1, target_q2) + -1 * alpha * next_logprobs ) with tf.GradientTape() as tape: q1, q2 = self.duqlqnet(states, actions) loss_1 = tf.reduce_mean(tf.square(target - q1)) loss_2 = tf.reduce_mean(tf.square(target - q2)) loss = 0.5 * loss_1 + 0.5 * loss_2 variables = self.duqlqnet.trainable_variables grads = tape.gradient(loss, variables) self.duqlqnet.optimizer.apply_gradients(zip(grads, variables)) #: Update policy with tf.GradientTape() as tape: selected_actions, logprobs = self.policy.sample_action(states) q1, q2 = self.duqlqnet(states, selected_actions) q_min = tf.minimum(q1, q2) loss = -1 * tf.reduce_mean(q_min + -1 * alpha * logprobs) variables = self.policy.trainable_variables grads = tape.gradient(loss, variables) self.policy.optimizer.apply_gradients(zip(grads, variables)) #: Adjust alpha entropy_diff = -1 * logprobs - self.target_entropy with tf.GradientTape() as tape: tape.watch(self.log_alpha) selected_actions, logprobs = self.policy.sample_action(states) alpha_loss = tf.reduce_mean(tf.exp(self.log_alpha) * entropy_diff) grad = tape.gradient(alpha_loss, self.log_alpha) self.alpha_optimizer.apply_gradients([(grad, self.log_alpha)]) #: Soft target update self.target_dualqnet.set_weights( (1 - self.TAU) * np.array(self.target_dualqnet.get_weights()) + self.TAU * np.array(self.duqlqnet.get_weights()) ) def save_model(self): 
self.policy.save_weights("checkpoints/actor") self.duqlqnet.save_weights("checkpoints/critic") def load_model(self): self.policy.load_weights("checkpoints/actor") self.duqlqnet.load_weights("checkpoints/critic") self.target_dualqnet.load_weights("checkpoints/critic") def testplay(self, n=1, monitordir=None): if monitordir: env = wrappers.Monitor(gym.make(self.env_id), monitordir, force=True, video_callable=(lambda ep: True)) else: env = gym.make(self.env_id) total_rewards = [] for _ in range(n): state = env.reset() done = False total_reward = 0 while not done: action, _ = self.policy.sample_action(np.atleast_2d(state)) action = action.numpy()[0] next_state, reward, done, _ = env.step(action) total_reward += reward if done: break else: state = next_state total_rewards.append(total_reward) print() print(total_reward) print() return total_rewards
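# The SAC and DQN agents in this file push Experience tuples into a ReplayBuffer and draw
# batches with get_minibatch(). Neither type is defined here; the sketch below is a minimal
# version consistent with those call sites, not the original classes.
import collections
import random

import numpy as np

Experience = collections.namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    def __init__(self, max_len=100000):
        self.buffer = collections.deque(maxlen=max_len)

    def push(self, exp):
        self.buffer.append(exp)

    def __len__(self):
        return len(self.buffer)

    def get_minibatch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states = np.vstack([e.state for e in batch]).astype(np.float32)
        actions = np.vstack([e.action for e in batch]).astype(np.float32)
        rewards = np.vstack([e.reward for e in batch]).astype(np.float32)
        next_states = np.vstack([e.next_state for e in batch]).astype(np.float32)
        dones = np.vstack([e.done for e in batch]).astype(np.float32)
        return states, actions, rewards, next_states, dones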
if args.train: # training loop for eps in range(max_episodes): state = env.reset() episode_reward = 0 for step in range(max_steps): if frame_idx > explore_steps: action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC, device=device) else: action = sac_trainer.policy_net.sample_action() next_state, reward, done, _ = env.step(action) replay_buffer.push(state, action, reward, next_state, done) state = next_state episode_reward += reward frame_idx += 1 if len(replay_buffer) > batch_size: for i in range(update_itr): _ = sac_trainer.update(batch_size, device, reward_scale=10., auto_entropy=AUTO_ENTROPY, target_entropy=-1. * action_dim) if done: break # keep episode returns so plot(rewards) has data rewards.append(episode_reward) if eps % 20 == 0 and eps > 0: # plot and model saving interval plot(rewards)
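# The fragment above assumes env, sac_trainer, replay_buffer, frame_idx and the various
# hyperparameters were defined earlier in the script. A sketch of the kind of setup it
# expects is given below; every concrete value is a placeholder, not the original
# configuration, and sac_trainer/replay_buffer/plot still come from the rest of the script.
import gym
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v1")      # placeholder continuous-control task
action_dim = env.action_space.shape[0]

max_episodes = 1000
max_steps = 200
explore_steps = 0                  # steps of random action before using the policy
batch_size = 256
update_itr = 1
frame_idx = 0
AUTO_ENTROPY = True
DETERMINISTIC = False
rewards = []                       # episode returns, consumed by plot(rewards)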
def main(): env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64", worker_id=1, seed=1) env_date = str(datetime.datetime.now()) file_path = os.path.join('data', env_date) os.makedirs(file_path, exist_ok=True) save_config(file_path) brain_name = env.brain_names[0] buffer = ReplayBuffer(Config.buffer_size) maddpg = MADDPGUnity(cfg=Config, tau=Config.tau, discount_factor=Config.discount_factor, checkpoint_path=Config.checkpoint_path) agent1_reward, agent0_reward, all_rewards_mean = [], [], [] batchsize = Config.batchsize max_reward = Config.max_reward # amplitude of OU noise # this slowly decreases to 0 noise = Config.noise_beginning logger = logging.getLogger('Tennis MADDPG') all_rewards = [] for episode in range(Config.n_episodes): reward_this_episode = np.zeros(2) env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations # get the current state (for each agent) scores = np.zeros(2) # initialize the score (for each agent) n_of_steps = 0 noise = max( Config.min_noise, Config.noise_beginning * (1 - (Config.n_episodes - episode) / Config.n_episodes)) while True: n_of_steps += 1 states_tensor = list(map(torch.tensor, states)) states_tensor = [a.float() for a in states_tensor] actions = maddpg.act(states_tensor, noise=noise) actions_array = torch.stack(actions).detach().numpy() actions_for_env = np.rollaxis(actions_array, 1) actions_for_env = np.clip(actions_for_env, -1, 1) # all actions between -1 and 1 env_info = env.step(actions_for_env)[ brain_name] # send all actions to tne environment states_next = env_info.vector_observations # if replay_buffer_reward_min is defined, add to replay buffer only the observations higher than min_reward reward_this_episode += np.array(env_info.rewards) if Config.replay_buffer_raward_min and max( reward_this_episode) >= Config.replay_buffer_raward_min: buffer_data = (states, actions_for_env, env_info.rewards, states_next, env_info.local_done) buffer.push(buffer_data) if not Config.replay_buffer_raward_min: buffer_data = (states, actions_for_env, env_info.rewards, states_next, env_info.local_done) buffer.push(buffer_data) dones = env_info.local_done # see if episode finished scores += env_info.rewards # update the score (for each agent) states = states_next # roll over states to next time step if np.any(dones): # exit loop if episode finished break all_rewards.append(max(reward_this_episode[0], reward_this_episode[1])) all_rewards_mean.append(np.mean(all_rewards[-100:])) agent0_reward.append(reward_this_episode[0]) agent1_reward.append(reward_this_episode[1]) if len(buffer) > Config.warmup: for i in range(2): samples = buffer.sample(batchsize) maddpg.update(samples, i, logger) if episode % Config.update_episode_n == 0: maddpg.update_targets( ) # soft update the target network towards the actual networks maddpg.iter += 1 if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1: logger.info( f'Average 0 reward of agent0 is {np.mean(agent0_reward)}') logger.info( f'Average 1 reward of agent1 is {np.mean(agent1_reward)}') if all_rewards_mean and all_rewards_mean[-1] > max_reward: max_reward = max(np.mean(agent0_reward), np.mean(agent1_reward)) logger.info('Found best model. 
Saving model into file: ...') save_dict_list = [] for i in range(2): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_target_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(file_path, 'episode-{}.pt'.format(episode))) agent0_reward = [] agent1_reward = [] plt.plot(all_rewards_mean) plt.savefig(os.path.join(file_path, 'result_plot.png'))
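# The checkpoints written above hold one dict per agent with actor/critic weights and
# optimizer state. Restoring them looks roughly like the helper below (the function name is
# hypothetical; the keys match the save code above).
import torch


def load_maddpg_checkpoint(maddpg, checkpoint_file):
    checkpoint = torch.load(checkpoint_file, map_location='cpu')
    for agent, state in zip(maddpg.maddpg_agent, checkpoint):
        agent.actor.load_state_dict(state['actor_params'])
        agent.critic.load_state_dict(state['critic_params'])
        agent.actor_optimizer.load_state_dict(state['actor_optim_params'])
        agent.critic_optimizer.load_state_dict(state['critic_optim_params'])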
def main(): seeding() # number of parallel agents parallel_envs = 4 # number of training episodes. # change this to higher number to experiment. say 30000. number_of_episodes = 10000 episode_length = 100 batchsize = 1000 # how many episodes to save policy and gif save_interval = 5000 # what is this ? t = 0 # amplitude of OU noise # this slowly decreases to 0 noise = 2 noise_reduction = 0.9999 # how many episodes before update episode_per_update = 2 * parallel_envs log_path = os.getcwd() + "/log" model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) torch.set_num_threads(parallel_envs) # this may be a list of all environments env = envs.make_parallel_env(parallel_envs) # keep 5000 episodes worth of replay buffer = ReplayBuffer(int(5000 * episode_length)) # initialize policy and critic # this creates a list of models, each element in the list refers to an agent in the simulation # [agent_one_ddpg, agent_two_ddpg, ...] # agent_one_ddpg contains the agent actor and critic models,e.g., agent_one_ddpg.actor, agent_one_ddpg.critic maddpg = MADDPG() logger = SummaryWriter(log_dir=log_path) agent0_reward = [] agent1_reward = [] agent2_reward = [] # training loop # show progressbar import progressbar as pb widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() # use keep_awake to keep workspace from disconnecting # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)): # notice we jump forward by number of parallel environments for episode in range(0, number_of_episodes, parallel_envs): timer.update(episode) # i believe there are as many as number of agents times parallel env reward reward_this_episode = np.zeros((parallel_envs, 3)) # obs is the observation state space of all the three agents in the 4 parallel env. # for the Physical Dception environment with three agents it is of dimension 4x3x14. # obs_full is world state irrespective of the agents and its dimension is 4x14. # all_observation = array(number of environments 4, 2 elements) # element 0 : is a list that contains 3 arrays. contains the state for each agent, each state is of size 14 # element 1 : global state from the perspective of the target/green for its environment. contains 14 elements all_obs = env.reset() # obs : is a list that has 1 element per environment. each element contains a list of 3 array. # each array is the state of one agent in that environment. # obs_full: is the god eye view of each environment. 
So it a list, that has 1 element per environment # each element contains an array of 14 values which is the global state of that environment obs, obs_full = transpose_list(all_obs) #for calculating rewards for this particular episode - addition of all time steps # save info or not save_info = (episode % save_interval < parallel_envs or episode == number_of_episodes - parallel_envs) frames = [] tmax = 0 if save_info: frames.append(env.render('rgb_array')) for episode_t in range(episode_length): # we finish the episode before sampling the buffer for trainint # t jumps forward in a multiple of environment t += parallel_envs # explore = only explore for a certain number of episodes # action input needs to be transposed # the transpose_to_tensor(obs) changes the data to each agent point of view # since we have 4 environments, there are 4 agent 1, 4 agent 2, and 4 agent 3 # each agent has a state in each environment, total states across 4 environments for agent 1 is 4x14 tensor # transpose_to_tensor(obs) = is a list of 3 elements. each element is for 1 agent # pick element 1. this is an array of 4x14 elements of agent observation across 4 environments. # maddpg.act has a for loop that take each element of obs and pass it to the agents actor models and # to generate an action from each agent actor. actions = maddpg.act(transpose_to_tensor(obs), noise=noise) noise *= noise_reduction # there are 4 actions per agent and 3 agents, total of 12 . Each action has 2 elements force in x, y direct # actions_array is a tensor of shape (3 agent, 4 env, 2 action) actions_array = torch.stack(actions).detach().numpy() # transpose the list of list # flip the first two indices # input to step requires the first index to correspond to number of parallel agents # the shape of actions_for_env is (4 env, 3 agent, 2 action) actions_for_env = np.rollaxis(actions_array, 1) # step forward one frame # obs is the observation state space of all the three agents in the 4 parallel env. # for the Physical Dception environment with three agents it is of dimension 4x3x14. # obs_full is world state irrespective of the agents and its dimension is 4x14. # To gain more understanding, please see the code in the multiagent folder. 
next_obs, next_obs_full, rewards, dones, info = env.step( actions_for_env) # add data to buffer transition = (obs, obs_full, actions_for_env, rewards, next_obs, next_obs_full, dones) buffer.push(transition) reward_this_episode += rewards obs, obs_full = next_obs, next_obs_full # save gif frame if save_info: frames.append(env.render('rgb_array')) tmax += 1 # update once after every episode_per_update if len(buffer ) > batchsize and episode % episode_per_update < parallel_envs: for a_i in range(3): # although samples are drawn randomly, for each sample we have all 3 agents data, and we know which # reward and actions belong to which agent # samples is a list of 7 elements: obs, obs_full, action, reward, next_obs, next_obs_full, done # each element of sample, say samples[0] is a list of 3 elements, one for each agent # each agent element contains their corresponding value, for example in case of obs it would be a # vector with 14 values # so when i ask for 2 samples for examples, i get 2 samples each containing all 3 agents states, rewards samples = buffer.sample(batchsize) maddpg.update(samples, a_i, logger) maddpg.update_targets( ) #soft update the target network towards the actual networks for i in range(parallel_envs): agent0_reward.append(reward_this_episode[i, 0]) agent1_reward.append(reward_this_episode[i, 1]) agent2_reward.append(reward_this_episode[i, 2]) if episode % 100 == 0 or episode == number_of_episodes - 1: avg_rewards = [ np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward) ] agent0_reward = [] agent1_reward = [] agent2_reward = [] for a_i, avg_rew in enumerate(avg_rewards): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode) #saving model save_dict_list = [] if save_info: for i in range(3): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save( save_dict_list, os.path.join(model_dir, 'episode-{}.pt'.format(episode))) # save gif files imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)), frames, duration=.04) env.close() logger.close() timer.finish()
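# The parallel-env loop above unpacks env.reset() with transpose_list(), which is not part
# of this file. An assumed minimal version consistent with that usage: it regroups a list of
# (obs, obs_full) pairs, one per environment, into [all_obs, all_obs_full].
def transpose_list(mylist):
    return list(map(list, zip(*mylist)))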
class CategoricalDQNAgent: def __init__(self, env_name="BreakoutDeterministic-v4", n_atoms=51, Vmin=-10, Vmax=10, gamma=0.98, n_frames=4, batch_size=32, lr=0.00025, init_epsilon=0.95, update_period=8, target_update_period=10000): self.env_name = env_name self.n_atoms = n_atoms self.Vmin, self.Vmax = Vmin, Vmax self.delta_z = (self.Vmax - self.Vmin) / (self.n_atoms - 1) self.Z = np.linspace(self.Vmin, self.Vmax, self.n_atoms) self.gamma = gamma self.n_frames = n_frames self.batch_size = batch_size self.init_epsilon = init_epsilon self.epsilon_scheduler = ( lambda steps: max(0.98 * (500000 - steps) / 500000, 0.1) if steps < 500000 else max( 0.05 + 0.05 * (1000000 - steps) / 500000, 0.05)) self.update_period = update_period self.target_update_period = target_update_period env = gym.make(self.env_name) self.action_space = env.action_space.n self.qnet = CategoricalQNet(self.action_space, self.n_atoms, self.Z) self.target_qnet = CategoricalQNet(self.action_space, self.n_atoms, self.Z) self.optimizer = tf.keras.optimizers.Adam(lr=lr, epsilon=0.01 / batch_size) def learn(self, n_episodes, buffer_size=800000, logdir="log"): logdir = Path(__file__).parent / logdir if logdir.exists(): shutil.rmtree(logdir) self.summary_writer = tf.summary.create_file_writer(str(logdir)) self.replay_buffer = ReplayBuffer(max_len=buffer_size) steps = 0 for episode in range(1, n_episodes + 1): env = gym.make(self.env_name) frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) #: ネットワーク重みの初期化 state = np.stack(frames, axis=2)[np.newaxis, ...] self.qnet(state) self.target_qnet(state) self.target_qnet.set_weights(self.qnet.get_weights()) episode_rewards = 0 episode_steps = 0 done = False lives = 5 while not done: steps += 1 episode_steps += 1 epsilon = self.epsilon_scheduler(steps) state = np.stack(frames, axis=2)[np.newaxis, ...] action = self.qnet.sample_action(state, epsilon=epsilon) next_frame, reward, done, info = env.step(action) episode_rewards += reward frames.append(frame_preprocess(next_frame)) next_state = np.stack(frames, axis=2)[np.newaxis, ...] 
if done: exp = Experience(state, action, reward, next_state, done) self.replay_buffer.push(exp) break else: if info["ale.lives"] != lives: lives = info["ale.lives"] exp = Experience(state, action, reward, next_state, True) else: exp = Experience(state, action, reward, next_state, done) self.replay_buffer.push(exp) if (len(self.replay_buffer) > 20000) and (steps % self.update_period == 0): loss = self.update_network() with self.summary_writer.as_default(): tf.summary.scalar("loss", loss, step=steps) tf.summary.scalar("epsilon", epsilon, step=steps) tf.summary.scalar("buffer_size", len(self.replay_buffer), step=steps) tf.summary.scalar("train_score", episode_rewards, step=steps) tf.summary.scalar("train_steps", episode_steps, step=steps) #: Hard target update if steps % self.target_update_period == 0: self.target_qnet.set_weights(self.qnet.get_weights()) print( f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}" ) if episode % 20 == 0: test_scores, test_steps = self.test_play(n_testplay=1) with self.summary_writer.as_default(): tf.summary.scalar("test_score", test_scores[0], step=steps) tf.summary.scalar("test_step", test_steps[0], step=steps) if episode % 1000 == 0: print("Model Saved") self.qnet.save_weights("checkpoints/qnet") def update_network(self): #: ミニバッチの作成 (states, actions, rewards, next_states, dones) = self.replay_buffer.get_minibatch(self.batch_size) next_actions, next_probs = self.target_qnet.sample_actions(next_states) #: 選択されたactionの確率分布だけ抽出する onehot_mask = self.create_mask(next_actions) next_dists = tf.reduce_sum(next_probs * onehot_mask, axis=1).numpy() #: 分布版ベルマンオペレータの適用 target_dists = self.shift_and_projection(rewards, dones, next_dists) onehot_mask = self.create_mask(actions) with tf.GradientTape() as tape: probs = self.qnet(states) dists = tf.reduce_sum(probs * onehot_mask, axis=1) #: クリップしないとlogとったときに勾配爆発することがある dists = tf.clip_by_value(dists, 1e-6, 1.0) loss = tf.reduce_sum(-1 * target_dists * tf.math.log(dists), axis=1, keepdims=True) loss = tf.reduce_mean(loss) grads = tape.gradient(loss, self.qnet.trainable_variables) self.optimizer.apply_gradients( zip(grads, self.qnet.trainable_variables)) return loss def shift_and_projection(self, rewards, dones, next_dists): target_dists = np.zeros((self.batch_size, self.n_atoms)) for j in range(self.n_atoms): tZ_j = np.minimum( self.Vmax, np.maximum(self.Vmin, rewards + self.gamma * self.Z[j])) bj = (tZ_j - self.Vmin) / self.delta_z lower_bj = np.floor(bj).astype(np.int8) upper_bj = np.ceil(bj).astype(np.int8) eq_mask = lower_bj == upper_bj neq_mask = lower_bj != upper_bj lower_probs = 1 - (bj - lower_bj) upper_probs = 1 - (upper_bj - bj) next_dist = next_dists[:, [j]] indices = np.arange(self.batch_size).reshape(-1, 1) target_dists[indices[neq_mask], lower_bj[neq_mask]] += (lower_probs * next_dist)[neq_mask] target_dists[indices[neq_mask], upper_bj[neq_mask]] += (upper_probs * next_dist)[neq_mask] target_dists[indices[eq_mask], lower_bj[eq_mask]] += (0.5 * next_dist)[eq_mask] target_dists[indices[eq_mask], upper_bj[eq_mask]] += (0.5 * next_dist)[eq_mask] """ 2. 
doneへの対処 doneのときは TZ(t) = R(t) """ for batch_idx in range(self.batch_size): if not dones[batch_idx]: continue else: target_dists[batch_idx, :] = 0 tZ = np.minimum(self.Vmax, np.maximum(self.Vmin, rewards[batch_idx])) bj = (tZ - self.Vmin) / self.delta_z lower_bj = np.floor(bj).astype(np.int32) upper_bj = np.ceil(bj).astype(np.int32) if lower_bj == upper_bj: target_dists[batch_idx, lower_bj] += 1.0 else: target_dists[batch_idx, lower_bj] += 1 - (bj - lower_bj) target_dists[batch_idx, upper_bj] += 1 - (upper_bj - bj) return target_dists def create_mask(self, actions): mask = np.ones((self.batch_size, self.action_space, self.n_atoms)) actions_onehot = tf.one_hot(tf.cast(actions, tf.int32), self.action_space, axis=1) for idx in range(self.batch_size): mask[idx, ...] = mask[idx, ...] * actions_onehot[idx, ...] return mask def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None): if checkpoint_path: env = gym.make(self.env_name) frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) state = np.stack(frames, axis=2)[np.newaxis, ...] self.qnet(state) self.qnet.load_weights(checkpoint_path) if monitor_dir: monitor_dir = Path(monitor_dir) if monitor_dir.exists(): shutil.rmtree(monitor_dir) monitor_dir.mkdir() env = gym.wrappers.Monitor(gym.make(self.env_name), monitor_dir, force=True, video_callable=(lambda ep: True)) else: env = gym.make(self.env_name) scores = [] steps = [] for _ in range(n_testplay): frames = collections.deque(maxlen=4) frame = frame_preprocess(env.reset()) for _ in range(self.n_frames): frames.append(frame) done = False episode_steps = 0 episode_rewards = 0 while not done: state = np.stack(frames, axis=2)[np.newaxis, ...] action = self.qnet.sample_action(state, epsilon=0.1) next_frame, reward, done, info = env.step(action) frames.append(frame_preprocess(next_frame)) episode_rewards += reward episode_steps += 1 if episode_steps > 500 and episode_rewards < 3: #: ゲーム開始(action: 0)しないまま停滞するケースへの対処 break scores.append(episode_rewards) steps.append(episode_steps) return scores, steps
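# shift_and_projection() above implements the C51 projection of the shifted support
# r + gamma * z_j back onto the fixed atoms. The tiny check below illustrates the mechanics
# on a 3-atom support; the numbers are illustrative only.
import numpy as np

Vmin, Vmax, n_atoms = -1.0, 1.0, 3
delta_z = (Vmax - Vmin) / (n_atoms - 1)      # 1.0
Z = np.linspace(Vmin, Vmax, n_atoms)         # [-1, 0, 1]

r, gamma = 0.5, 0.9
tZ = np.clip(r + gamma * Z, Vmin, Vmax)      # [-0.4, 0.5, 1.0]
b = (tZ - Vmin) / delta_z                    # [0.6, 1.5, 2.0]
# each shifted atom's probability mass is split between floor(b) and ceil(b)
print(np.floor(b), np.ceil(b))               # [0. 1. 2.] [1. 2. 2.]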