parser.add_argument('--updates_per_step', type=int, default=1, metavar='N')
parser.add_argument('--start_steps', type=int, default=10000, metavar='N')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N')
args = parser.parse_args()
args.cuda = torch.cuda.is_available()

env = h_env.HockeyEnv(mode=h_env.HockeyEnv.NORMAL)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)
agent.load_model(
    'full_player_models/sac_actor_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000',
    'full_player_models/sac_critic_hockey_11200_batch_size-4_gamma-0.95_tau-0.005_lr-0.0003_alpha-0.2_tuning-True_hidden_size-256_updatesStep-1_startSteps-10000_targetIntervall-1_replaysize-1000000')
# opponent = copy.deepcopy(agent)
basic_strong = h_env.BasicOpponent(weak=False)

time_ = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Tensorboard
writer = SummaryWriter(f"strongplay-runs/ERE{time_}_batch_size-{args.batch_size}_gamma-{args.gamma}_tau-{args.tau}"
                       f"_lr-{args.lr}_alpha-{args.alpha}_tuning-{args.automatic_entropy_tuning}"
                       f"_hidden_size-{args.hidden_size}_updatesStep-{args.updates_per_step}"
                       f"_startSteps-{args.start_steps}_targetIntervall-{args.target_update_interval}"
                       f"_replaysize-{args.replay_size}")

# Memory
memory = ERE_PrioritizedReplay(args.replay_size)
# memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0

o = env.reset()
# _ = env.render()
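# Note: ERE_PrioritizedReplay is used above but not shown in this excerpt. The sketch below only
# illustrates the Emphasizing Recent Experience sampling range (Wang & Ross, 2019); it is not the
# project's actual buffer, and the class and parameter names are our own assumptions.
import random
from collections import deque


class EREBufferSketch:
    def __init__(self, capacity, eta=0.996, c_min=5000):
        self.buffer = deque(maxlen=capacity)
        self.eta = eta
        self.c_min = c_min

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size, k, num_updates):
        # Restrict uniform sampling to the c_k most recent transitions; c_k shrinks from the
        # full buffer towards c_min as the update index k approaches num_updates.
        n = len(self.buffer)
        c_k = max(int(n * self.eta ** (k * 1000 / num_updates)), self.c_min)
        c_k = min(c_k, n)
        recent = list(self.buffer)[-c_k:]
        return random.sample(recent, min(batch_size, len(recent)))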
parser.add_argument('--eval_episodes', help='Number of evaluation episodes', type=int, default=30)
parser.add_argument('--filename', help='Path to the pretrained model', default=None)
parser.add_argument('--mode', help='Evaluation mode, currently: (normal | shooting | defense)', default='normal')
parser.add_argument('--show', help='Render the environment while evaluating', action='store_true')
parser.add_argument('--q', help='Quiet mode (no prints)', action='store_true')
parser.add_argument('--opposite', help='Evaluate the agent on the opposite side', action='store_true')
opts = parser.parse_args()

if __name__ == '__main__':
    if opts.mode == 'normal':
        mode = h_env.HockeyEnv_BasicOpponent.NORMAL
    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help.')

    logger = Logger(prefix_path=os.path.dirname(os.path.realpath(__file__)) + '/logs',
                    mode=opts.mode,
                    quiet=opts.q)
    q_agent = logger.load_model(filename=opts.filename)
    q_agent._config['show'] = opts.show
    q_agent._config['max_steps'] = 250
    q_agent.eval()

    env = h_env.HockeyEnv(mode=mode)
    opponent = h_env.BasicOpponent(weak=False)
    evaluate(agent=q_agent,
             env=env,
             opponent=opponent,
             eval_episodes=opts.eval_episodes,
             action_mapping=q_agent.action_mapping,
             evaluate_on_opposite_side=opts.opposite)
                    help='Evaluate agent vs weak basic opponent', default=False, action='store_true')
opts = parser.parse_args()

if __name__ == '__main__':
    if opts.mode == 'normal':
        mode = h_env.HockeyEnv_BasicOpponent.NORMAL
    elif opts.mode == 'shooting':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
    elif opts.mode == 'defense':
        mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
    else:
        raise ValueError('Unknown training mode. See --help.')

    if opts.filename is None:
        raise ValueError('Parameter --filename must be present. See --help.')

    env = h_env.HockeyEnv(mode=mode)
    agent = SACAgent.load_model(opts.filename)
    agent.eval()
    agent._config['show'] = opts.show

    opponent = h_env.BasicOpponent(weak=opts.weak)
    evaluate(agent, env, opponent, opts.eval_episodes, evaluate_on_opposite_side=opts.opposite)
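# The evaluation scripts above call evaluate(...), which is not part of this excerpt. The sketch
# below is only an illustration of such a loop under the interfaces used elsewhere in this section;
# it ignores the evaluate_on_opposite_side and quiet options and is not the project's implementation.
import numpy as np


def evaluate_sketch(agent, env, opponent, eval_episodes, action_mapping=None, max_steps=250):
    rewards, touches, won, lost = [], [], [], []
    for _ in range(eval_episodes):
        ob = env.reset()
        obs_agent2 = env.obs_agent_two()
        total_reward, touched = 0, 0
        for _ in range(max_steps):
            a1 = agent.act(ob)
            if action_mapping is not None:
                # discrete agents return an action id that maps to a continuous action
                a1 = action_mapping[a1]
            a2 = opponent.act(obs_agent2)
            ob, r, done, info = env.step(np.hstack([a1, a2]))
            obs_agent2 = env.obs_agent_two()
            total_reward += r
            touched = max(touched, info['reward_touch_puck'])
            if done:
                break
        rewards.append(total_reward)
        touches.append(touched)
        won.append(1 if env.winner == 1 else 0)
        lost.append(1 if env.winner == -1 else 0)
    return np.mean(rewards), np.mean(touches), np.mean(won), np.mean(lost)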
    raise ValueError('Unknown training mode. See --help')

opts.device = torch.device('cuda' if opts.cuda and torch.cuda.is_available() else 'cpu')

dirname = time.strftime(f'%y%m%d_%H%M%S_{random.randint(0, int(1e6)):06}', time.gmtime(time.time()))
abs_path = os.path.dirname(os.path.realpath(__file__))
logger = Logger(prefix_path=os.path.join(abs_path, dirname),
                mode=opts.mode,
                cleanup=True,
                quiet=opts.q)

env = h_env.HockeyEnv(mode=mode, verbose=(not opts.q))

opponents = [
    h_env.BasicOpponent(weak=True),
]

# Add absolute paths for pretrained agents
pretrained_agents = []
if opts.selfplay:
    for p in pretrained_agents:
        a = SACAgent.load_model(p)
        a.eval()
        opponents.append(a)

if opts.preload_path is None:
    agent = SACAgent(logger=logger,
                     obs_dim=env.observation_space.shape,
                     action_space=env.action_space,
def train(self, agent, env):
    epsilon = self._config['epsilon']
    epsilon_decay = self._config['epsilon_decay']
    min_epsilon = self._config['min_epsilon']

    episode_counter = 1
    total_step_counter = 0
    total_grad_updates = 0

    beta = self._config['per_beta']
    beta_inc = self._config['per_beta_inc']
    beta_max = self._config['per_beta_max']

    rew_stats = []
    loss_stats = []
    lost_stats = {}
    touch_stats = {}
    won_stats = {}
    eval_stats = {'reward': [], 'touch': [], 'won': [], 'lost': []}

    opponents = [
        h_env.BasicOpponent(weak=True),
        h_env.BasicOpponent(weak=False)
    ]

    while episode_counter <= self._config['max_episodes']:
        if self._config['self_play']:
            opponent = poll_opponent(opponents=opponents)
        else:
            opponent = h_env.BasicOpponent(weak=False)

        ob = env.reset()
        obs_agent2 = env.obs_agent_two()

        if (env.puck.position[0] < 5 and self._config['mode'] == 'defense') or \
                (env.puck.position[0] > 5 and self._config['mode'] == 'shooting'):
            continue

        epsilon = max(epsilon - epsilon_decay, min_epsilon)
        if self._config['per']:
            beta = min(beta_max, beta + beta_inc)
            agent.update_per_beta(beta=beta)

        total_reward = 0
        touched = 0
        first_time_touch = 1
        touch_stats[episode_counter] = 0
        won_stats[episode_counter] = 0
        lost_stats[episode_counter] = 0

        for step in range(1, self._config['max_steps'] + 1):
            a1 = agent.act(ob, eps=epsilon)
            a1_list = agent.action_mapping[a1]

            if self._config['mode'] in ['defense', 'normal']:
                a2 = opponent.act(obs_agent2)
                # a copy of our agent has been chosen, transform the action id to a list
                if not isinstance(a2, np.ndarray):
                    a2 = agent.action_mapping[a2]
            elif self._config['mode'] == 'shooting':
                a2 = [0, 0, 0, 0]
            else:
                raise NotImplementedError(f'Training for {self._config["mode"]} not implemented.')

            (ob_new, reward, done, _info) = env.step(np.hstack([a1_list, a2]))
            touched = max(touched, _info['reward_touch_puck'])

            step_reward = reward + 5 * _info['reward_closeness_to_puck'] - (1 - touched) * 0.1 + \
                touched * first_time_touch * 0.1 * step
            first_time_touch = 1 - touched

            total_reward += step_reward
            agent.store_transition((ob, a1, step_reward, ob_new, done))

            if self._config['show']:
                time.sleep(0.01)
                env.render()

            if touched > 0:
                touch_stats[episode_counter] = 1

            if done:
                won_stats[episode_counter] = 1 if env.winner == 1 else 0
                lost_stats[episode_counter] = 1 if env.winner == -1 else 0
                break

            if total_step_counter % self._config['train_every'] == 0 and \
                    total_step_counter > self._config['start_learning_from']:
                loss_stats.append(agent.train_model())
                rew_stats.append(total_reward)
                total_grad_updates += 1

                if total_grad_updates % self._config['update_target_every'] == 0:
                    agent.update_target_net()

                if self._config['self_play'] and \
                        total_grad_updates % self._config['add_opponent_every'] == 0 and \
                        episode_counter >= self._config['start_self_play_from']:
                    opponents.append(deepcopy(agent))
                    agent.id += 1

            ob = ob_new
            obs_agent2 = env.obs_agent_two()
            total_step_counter += 1

        self.logger.print_episode_info(env.winner, episode_counter, step, total_reward,
                                       epsilon, touched, opponent)

        if episode_counter % self._config['evaluate_every'] == 0:
            self.logger.info("Evaluating agent")
            agent.eval()
            old_show = agent._config['show']
            agent._config['show'] = False
            rew, touch, won, lost = evaluate(agent=agent,
                                             env=env,
                                             opponent=h_env.BasicOpponent(weak=False),
                                             eval_episodes=self._config['eval_episodes'],
                                             quiet=True,
                                             action_mapping=agent.action_mapping)
            agent.train()
            agent._config['show'] = old_show

            eval_stats['reward'].append(rew)
            eval_stats['touch'].append(touch)
            eval_stats['won'].append(won)
            eval_stats['lost'].append(lost)
            self.logger.save_model(agent, f'a-{episode_counter}.pkl')

        if total_step_counter > self._config['start_learning_from']:
            agent.step_lr_scheduler()

        episode_counter += 1

    if self._config['show']:
        env.close()

    # Print train stats
    self.logger.print_stats(rew_stats, touch_stats, won_stats, lost_stats)

    self.logger.info('Saving statistics...')

    # Plot reward
    self.logger.plot_running_mean(rew_stats, 'Total reward', 'total-reward.pdf', show=False)

    # Plot loss
    self.logger.plot_running_mean(loss_stats, 'Loss', 'loss.pdf', show=False)

    # Plot evaluation stats
    self.logger.plot_intermediate_stats(eval_stats, show=False)

    # Save model
    self.logger.save_model(agent, 'agent.pkl')

    # Save arrays of won-lost stats
    self.logger.save_array(data=eval_stats["won"], filename="eval-won-stats")
    self.logger.save_array(data=eval_stats["lost"], filename="eval-lost-stats")
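# The same reward shaping appears verbatim in several trainers in this section. Factoring it out
# makes the individual terms easier to read; the helper name is ours and is not used by the code above.
def shaped_reward(env_reward, info, touched, first_time_touch, step):
    # Base reward, plus a scaled closeness-to-puck bonus, a small per-step penalty while the puck
    # has not been touched yet, and a one-off bonus proportional to the step of the first touch.
    return (env_reward
            + 5 * info['reward_closeness_to_puck']
            - (1 - touched) * 0.1
            + touched * first_time_touch * 0.1 * step)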
def train(self, agent, opponents, env, run_evaluation):
    rew_stats, q1_losses, q2_losses, actor_losses, alpha_losses = [], [], [], [], []
    lost_stats, touch_stats, won_stats = {}, {}, {}
    eval_stats = {
        'weak': {
            'reward': [],
            'touch': [],
            'won': [],
            'lost': []
        },
        'strong': {
            'reward': [],
            'touch': [],
            'won': [],
            'lost': []
        }
    }

    episode_counter = 1
    total_step_counter = 0
    grad_updates = 0
    new_op_grad = []

    while episode_counter <= self._config['max_episodes']:
        ob = env.reset()
        obs_agent2 = env.obs_agent_two()

        total_reward, touched = 0, 0
        touch_stats[episode_counter] = 0
        won_stats[episode_counter] = 0
        lost_stats[episode_counter] = 0

        opponent = utils.poll_opponent(opponents)

        first_time_touch = 1
        for step in range(self._config['max_steps']):
            a1 = agent.act(ob)

            if self._config['mode'] == 'defense':
                a2 = opponent.act(obs_agent2)
            elif self._config['mode'] == 'shooting':
                a2 = np.zeros_like(a1)
            else:
                a2 = opponent.act(obs_agent2)

            actions = np.hstack([a1, a2])
            next_state, reward, done, _info = env.step(actions)

            touched = max(touched, _info['reward_touch_puck'])

            step_reward = (
                reward
                + 5 * _info['reward_closeness_to_puck']
                - (1 - touched) * 0.1
                + touched * first_time_touch * 0.1 * step
            )
            first_time_touch = 1 - touched

            total_reward += step_reward
            agent.store_transition((ob, a1, step_reward, next_state, done))

            if self._config['show']:
                time.sleep(0.01)
                env.render()

            if touched > 0:
                touch_stats[episode_counter] = 1

            if done:
                won_stats[episode_counter] = 1 if env.winner == 1 else 0
                lost_stats[episode_counter] = 1 if env.winner == -1 else 0
                break

            ob = next_state
            obs_agent2 = env.obs_agent_two()
            total_step_counter += 1

        if agent.buffer.size < self._config['batch_size']:
            continue

        for _ in range(self._config['grad_steps']):
            losses = agent.update_parameters(total_step_counter)
            grad_updates += 1

            q1_losses.append(losses[0])
            q2_losses.append(losses[1])
            actor_losses.append(losses[2])
            alpha_losses.append(losses[3])

            # Add trained agent to opponents queue
            if self._config['selfplay']:
                if grad_updates % self._config['add_self_every'] == 0:
                    new_opponent = SACAgent.clone_from(agent)
                    new_opponent.eval()
                    opponents.append(new_opponent)
                    new_op_grad.append(grad_updates)

        agent.schedulers_step()

        self.logger.print_episode_info(env.winner, episode_counter, step, total_reward)

        if episode_counter % self._config['evaluate_every'] == 0:
            agent.eval()
            for eval_op in ['strong', 'weak']:
                ev_opponent = opponents[0] if eval_op == 'strong' else h_env.BasicOpponent(False)
                rew, touch, won, lost = evaluate(agent, env, ev_opponent, 100, quiet=True)
                eval_stats[eval_op]['reward'].append(rew)
                eval_stats[eval_op]['touch'].append(touch)
                eval_stats[eval_op]['won'].append(won)
                eval_stats[eval_op]['lost'].append(lost)
            agent.train()

            self.logger.save_model(agent, f'a-{episode_counter}.pkl')

        rew_stats.append(total_reward)
        episode_counter += 1

    if self._config['show']:
        env.close()

    # Print train stats
    self.logger.print_stats(rew_stats, touch_stats, won_stats, lost_stats)

    self.logger.info('Saving training statistics...')

    # Plot reward
    self.logger.plot_running_mean(data=rew_stats, title='Total reward', filename='total-reward.pdf', show=False)

    # Plot evaluation stats
    self.logger.plot_evaluation_stats(eval_stats, self._config['evaluate_every'], 'evaluation-won-lost.pdf')

    # Plot losses
    for loss, title in zip([q1_losses, q2_losses, actor_losses, alpha_losses],
                           ['Q1 loss', 'Q2 loss', 'Policy loss', 'Alpha loss']):
        self.logger.plot_running_mean(
            data=loss,
            title=title,
            filename=f'{title.replace(" ", "-")}.pdf',
            show=False,
            v_milestones=new_op_grad,
        )

    # Save agent
    self.logger.save_model(agent, 'agent.pkl')

    if run_evaluation:
        agent.eval()
        agent._config['show'] = True
        evaluate(agent, env, h_env.BasicOpponent(weak=False), self._config['eval_episodes'])
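# poll_opponent / utils.poll_opponent is called by the trainers in this section but is not defined
# in the excerpt. A plausible minimal sketch, assuming the opponent for the next episode is drawn
# uniformly at random from the current pool:
import random


def poll_opponent(opponents):
    return random.choice(opponents)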
elif opts.mode == 'shooting':
    mode = h_env.HockeyEnv_BasicOpponent.TRAIN_SHOOTING
elif opts.mode == 'defense':
    mode = h_env.HockeyEnv_BasicOpponent.TRAIN_DEFENSE
else:
    raise ValueError('Unknown training mode. See --help')

opts.device = torch.device('cuda' if opts.cuda and torch.cuda.is_available() else 'cpu')

logger = Logger(prefix_path=os.path.dirname(os.path.realpath(__file__)) + '/logs',
                mode=opts.mode,
                cleanup=False,
                quiet=opts.q)

opponents = [h_env.BasicOpponent(weak=True)]
env = h_env.HockeyEnv(mode=mode, verbose=(not opts.q))

if opts.TD3agent:
    agent = TD3Agent(logger=logger,
                     obs_dim=env.observation_space.shape,
                     action_space=env.action_space,
                     userconfig=vars(opts))
else:
    agent = DDPGAgent(logger=logger,
                      obs_dim=env.observation_space.shape,
                      action_space=env.action_space,
                      userconfig=vars(opts))

trainer = DDPGTrainer(logger, vars(opts))
trainer.train(agent, opponents, env, opts.evaluate)
def training_loop(hyperparameters):
    print(f"Starting training with hyperparameters: {hyperparameters}")

    save_path = hyperparameters["save_path"]
    load_path = hyperparameters["load_path"]

    # create the save path and save the hyperparameter configuration
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    else:
        a = input("Warning: directory already exists. Do you want to continue? [y/N] ")
        if a not in ["Y", "y"]:
            raise Exception("Path already exists, please start with another path.")
    with open(save_path + "/parameters.json", "w") as f:
        json.dump(hyperparameters, f)

    # general configuration
    state_dim = 18
    action_dim = 4
    max_action = 1
    iterations = hyperparameters["max_iterations"]
    batch_size = hyperparameters["batch_size"]
    max_episodes = hyperparameters["max_episodes"]
    train_mode = hyperparameters["train_mode"]
    closeness_factor = hyperparameters["closeness_factor"]
    c = closeness_factor

    # initialize the agent
    agent1 = TD3Agent([state_dim + action_dim, 256, 256, 1],
                      [state_dim, 256, 256, action_dim],
                      optimizer=hyperparameters["optimizer"],
                      policy_noise=hyperparameters["policy_noise"],
                      policy_noise_clip=hyperparameters["policy_noise_clip"],
                      gamma=hyperparameters["gamma"],
                      delay=hyperparameters["delay"],
                      tau=hyperparameters["tau"],
                      lr=hyperparameters["lr"],
                      max_action=max_action,
                      weight_decay=hyperparameters["weight_decay"])

    # load the agent if a checkpoint is given
    loaded_state = False
    if load_path:
        agent1.load(load_path)
        loaded_state = True

    # define the opponent
    if hyperparameters["self_play"]:
        agent2 = agent1
    else:
        agent2 = h_env.BasicOpponent(weak=hyperparameters["weak_agent"])

    # load environment and replay buffer
    replay_buffer = ReplayBuffer(state_dim, action_dim)
    if train_mode == "defense":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
    elif train_mode == "shooting":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)
    else:
        env = h_env.HockeyEnv()

    # add figure for online plotting
    if hyperparameters["plot_performance"]:
        fig, (ax_loss, ax_reward) = plt.subplots(2)
        ax_loss.set_xlim(0, max_episodes)
        ax_loss.set_ylim(0, 20)
        ax_reward.set_xlim(0, max_episodes)
        ax_reward.set_ylim(-30, 20)

    with HiddenPrints():
        # first sample enough data to start training
        obs_last = env.reset()
        for i in range(batch_size * 100):
            # act on the agent's own observation (the original passed env.obs_agent_two() to agent1,
            # which does not match the transition stored below)
            a1 = env.action_space.sample()[:4] if not loaded_state else agent1.act(obs_last)
            a2 = agent2.act(env.obs_agent_two())
            obs, r, d, info = env.step(np.hstack([a1, a2]))
            done = 1 if d else 0
            replay_buffer.add(obs_last, a1, obs, r, done)
            obs_last = obs
            if d:
                obs_last = env.reset()
    print("Finished collection of data prior to training")

    # tracking of performance
    episode_critic_loss = []
    episode_rewards = []
    win_count = []
    if not os.path.isfile(save_path + "/performance.csv"):
        pd.DataFrame(data={"Episode_rewards": [],
                           "Episode_critic_loss": [],
                           "Win/Loss": []}).to_csv(save_path + "/performance.csv", sep=",", index=False)

    # then start training
    for episode_count in range(max_episodes + 1):
        obs_last = env.reset()
        total_reward = 0
        critic_loss = []
        for i in range(iterations):
            # run the environment
            with HiddenPrints():
                with torch.no_grad():
                    # again, act on the agent's own observation rather than the opponent's
                    a1 = agent1.act(obs_last) + np.random.normal(loc=0,
                                                                 scale=hyperparameters["exploration_noise"],
                                                                 size=action_dim)
                    a2 = agent2.act(env.obs_agent_two())
                    obs, r, d, info = env.step(np.hstack([a1, a2]))
                    total_reward += r
                    done = 1 if d else 0

            # modify the reward with the closeness-to-puck reward
            if hyperparameters["closeness_decay"]:
                c = closeness_factor * (1 - episode_count / max_episodes)
            newreward = r + c * info["reward_closeness_to_puck"]

            # add to the replay buffer
            replay_buffer.add(obs_last, a1, obs, newreward, done)
            obs_last = obs

            # sample a minibatch and train
            states, actions, next_states, reward, done = replay_buffer.sample(batch_size)
            loss = agent1.train(states, actions, next_states, reward, done)
            critic_loss.append(loss.detach().numpy())

            # if done, finish the episode
            if d:
                episode_rewards.append(total_reward)
                episode_critic_loss.append(np.mean(critic_loss))
                win_count.append(info["winner"])
                print(f"Episode {episode_count} finished after {i} steps with a total reward of {total_reward}")

                # online plotting
                if hyperparameters["plot_performance"] and episode_count > 40:
                    ax_loss.plot(list(range(-1, episode_count - 29)), moving_average(episode_critic_loss, 30), 'r-')
                    ax_reward.plot(list(range(-1, episode_count - 29)), moving_average(episode_rewards, 30), "r-")
                    plt.draw()
                    plt.pause(1e-17)
                break

        # intermediate evaluation of win/loss and saving of the model
        if episode_count % 500 == 0 and episode_count != 0:
            print(f"The agent's win ratio in the last 500 episodes was {win_count[-500:].count(1) / 500}")
            print(f"The agent's loss ratio in the last 500 episodes was {win_count[-500:].count(-1) / 500}")
            try:
                agent1.save(save_path)
                print("saved model")
            except Exception:
                print("Saving the model failed")
            pd.DataFrame(data={"Episode_rewards": episode_rewards[-500:],
                               "Episode_critic_loss": episode_critic_loss[-500:],
                               "Win/Loss": win_count[-500:]}).to_csv(save_path + "/performance.csv",
                                                                     sep=",", index=False, mode="a", header=False)

    print(f"Finished training with a final mean reward of {np.mean(episode_rewards[-500:])}")

    # plot the performance summary
    if hyperparameters["plot_performance_summary"]:
        try:
            fig, (ax1, ax2) = plt.subplots(2)

            x = list(range(len(episode_critic_loss)))
            coef = np.polyfit(x, episode_critic_loss, 1)
            poly1d_fn = np.poly1d(coef)
            ax1.plot(episode_critic_loss)
            ax1.plot(poly1d_fn(list(range(len(episode_critic_loss)))))

            x = list(range(len(episode_rewards)))
            coef = np.polyfit(x, episode_rewards, 1)
            poly1d_fn = np.poly1d(coef)
            ax2.plot(episode_rewards)
            ax2.plot(poly1d_fn(list(range(len(episode_rewards)))))

            fig.show()
            fig.savefig(save_path + "/performance.png", bbox_inches="tight")
        except Exception:
            print("Failed saving figure")
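# moving_average is used by the online-plotting code above but is not part of this excerpt.
# A minimal sketch assuming a plain trailing-window mean:
import numpy as np


def moving_average(values, window):
    # Returns len(values) - window + 1 averages; callers must pass at least `window` values.
    return np.convolve(values, np.ones(window) / window, mode='valid')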
def train(self, agent, opponents, env, eval):
    epsilon = self._config['eps']
    epsilon_decay = self._config['epsilon_decay']
    min_epsilon = self._config['min_epsilon']
    iter_fit = self._config['iter_fit']

    episode_counter = 1
    total_step_counter = 0

    rew_stats = []
    loss_stats = []
    lost_stats = {}
    touch_stats = {}
    won_stats = {}
    eval_stats = {'reward': [], 'touch': [], 'won': [], 'lost': []}

    while episode_counter <= self._config['max_episodes']:
        ob = env.reset()
        obs_agent2 = env.obs_agent_two()

        epsilon = max(epsilon_decay * epsilon, min_epsilon)

        total_reward = 0
        touched = 0
        touch_stats[episode_counter] = 0
        won_stats[episode_counter] = 0
        lost_stats[episode_counter] = 0

        opponent = utils.poll_opponent(opponents)

        first_time_touch = 1
        for step in range(self._config['max_steps']):
            if self._config['TD3agent']:
                a1 = agent.act(ob, noise=self._config['noise'])
            else:
                a1 = agent.act(ob, eps=epsilon)

            if self._config['mode'] == 'defense':
                a2 = opponent.act(obs_agent2)
            elif self._config['mode'] == 'shooting':
                a2 = [0, 0, 0, 0]
            else:
                a2 = opponent.act(obs_agent2)

            (ob_new, reward, done, _info) = env.step(np.hstack([a1, a2]))
            touched = max(touched, _info['reward_touch_puck'])

            current_reward = reward + 5 * _info['reward_closeness_to_puck'] - \
                (1 - touched) * 0.1 + touched * first_time_touch * 0.1 * step
            total_reward += current_reward
            first_time_touch = 1 - touched

            agent.store_transition((ob, a1, current_reward, ob_new, done))

            if self._config['show']:
                time.sleep(0.01)
                env.render()

            if touched > 0:
                touch_stats[episode_counter] = 1

            if done:
                won_stats[episode_counter] = 1 if env.winner == 1 else 0
                lost_stats[episode_counter] = 1 if env.winner == -1 else 0
                break

            ob = ob_new
            obs_agent2 = env.obs_agent_two()
            total_step_counter += 1

        loss_stats.extend(agent.train(iter_fit=iter_fit, total_step_counter=episode_counter))
        rew_stats.append(total_reward)

        self.logger.print_episode_info(env.winner, episode_counter, step, total_reward, epsilon)

        if episode_counter % self._config['evaluate_every'] == 0:
            agent.eval()
            rew, touch, won, lost = evaluate(agent, env, h_env.BasicOpponent(weak=True),
                                             self._config['eval_episodes'], quiet=True)
            agent.train_mode()

            eval_stats['reward'].append(rew)
            eval_stats['touch'].append(touch)
            eval_stats['won'].append(won)
            eval_stats['lost'].append(lost)

            self.logger.save_model(agent, f'a-{episode_counter}.pkl')
            self.logger.plot_intermediate_stats(eval_stats, show=False)

        agent.schedulers_step()
        episode_counter += 1

    if self._config['show']:
        env.close()

    # Print train stats
    self.logger.print_stats(rew_stats, touch_stats, won_stats, lost_stats)

    self.logger.info('Saving training statistics...')

    # Plot reward
    self.logger.plot_running_mean(rew_stats, 'Total reward', 'total-reward.pdf', show=False)

    # Plot evaluation stats
    self.logger.plot_intermediate_stats(eval_stats, show=False)

    # Plot loss
    self.logger.plot_running_mean(loss_stats, 'Loss', 'loss.pdf', show=False)

    # Save model
    self.logger.save_model(agent, 'agent.pkl')

    # Log reward histograms
    print(eval_stats['won'])

    if eval:
        agent.eval()
        agent._config['show'] = True
        evaluate(agent, env, h_env.BasicOpponent(weak=False), self._config['eval_episodes'])
        agent.train_mode()