def train_agent_sac(
        agent_class,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,
        batch_size,
        gamma,
        reward_scale,
        **_kwargs
):
    """Run the training loop for an agent that is constructed with the env.

    Builds the gym environment, the agent, a ``MemoryArray`` and a
    ``Recorder``, calls ``uniform_exploration`` once, and then alternates
    ``agent.update_memory`` and ``agent.update_parameter`` for at most
    ``max_epoch`` epochs. The loop stops early as soon as
    ``recorder.check_reward`` returns a truthy value. A ``KeyboardInterrupt``
    raised inside the loop is caught and reported; the results are still
    saved and plotted afterwards.

    Args:
        agent_class: Callable invoked as
            ``agent_class(env, state_dim, action_dim, net_dim)``.
        env_name: Environment id handed to ``gym.make``; also forwarded to
            ``Recorder`` and ``recorder.show_and_save``.
        cwd: Forwarded to ``recorder.check_reward``,
            ``recorder.show_and_save`` and ``draw_plot_with_npy``
            (presumably the output directory -- confirm).
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to ``Recorder``, ``uniform_exploration``,
            ``agent.update_memory`` and ``agent.update_parameter``.
        max_memo: Forwarded to ``MemoryArray`` (presumably its capacity
            -- confirm).
        max_epoch: Upper bound on the number of loop iterations.
        batch_size: Forwarded to ``agent.update_parameter``.
        gamma: Forwarded to ``uniform_exploration`` and
            ``agent.update_memory``.
        reward_scale: Forwarded to ``uniform_exploration`` and
            ``agent.update_memory``.
        **_kwargs: Accepted and ignored, so callers may pass a larger
            argument dict.

    Returns:
        Always ``True``; this function has no failure return path.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(env, state_dim, action_dim, net_dim)
    replay = MemoryArray(max_memo, state_dim, action_dim)
    recorder = Recorder(
        agent, max_step, max_action, target_reward, env_name,
        show_gap=64,  # 2 ** 6
    )

    # One exploration pass before the loop; `replay` is handed over to it
    # (presumably to be pre-filled with transitions -- confirm).
    uniform_exploration(
        env, max_step, max_action, gamma, reward_scale, replay, action_dim
    )

    # NOTE: AssertionError is deliberately not handled in this function.
    try:
        for _epoch in range(max_epoch):
            # Interaction with the env needs no autograd graph.
            with torch.no_grad():
                ep_rewards, ep_steps = agent.update_memory(
                    env, replay, max_step, max_action, reward_scale, gamma
                )

            a_loss, c_loss = agent.update_parameter(
                replay, max_step, batch_size
            )

            # Reporting/evaluation needs no autograd graph either.
            with torch.no_grad():
                recorder.show_reward(ep_rewards, ep_steps, a_loss, c_loss)
                solved = recorder.check_reward(cwd, a_loss, c_loss)

            if solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")

    # The recorder writes its results; neither the agent nor the memory is
    # saved explicitly here.
    elapsed = recorder.show_and_save(env_name, cwd)
    draw_plot_with_npy(cwd, elapsed)
    return True
def train_agent(
        agent_class,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,
        batch_size,
        update_gap,
        gamma,
        exp_noise,
        pol_noise,
        reward_scale,
        **_kwargs
):
    """Run the training loop for an agent backed by a ``Memories`` buffer.

    Builds the gym environment, the agent and the memory, calling
    ``save_or_load_model`` / ``save_or_load_memo`` with ``is_save=False`` on
    them before training. Each epoch then calls ``agent.inactive_in_env``,
    ``memo.refresh_indices`` and ``agent.update_parameter``, and reports to
    the ``Recorder``. The loop stops early as soon as
    ``recorder.check_reward`` returns a truthy value. A ``KeyboardInterrupt``
    inside the loop is caught and reported; results and memory are still
    saved afterwards.

    Args:
        agent_class: Callable invoked as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: Environment id handed to ``gym.make``; also forwarded to
            ``Recorder`` and ``recorder.show_and_save``.
        cwd: Forwarded to the model/memory save-or-load calls,
            ``recorder.check_reward``, ``recorder.show_and_save`` and
            ``draw_plot_with_npy`` (presumably the output directory
            -- confirm).
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to ``Recorder`` and ``agent.inactive_in_env``.
        max_memo: Forwarded to ``Memories`` (presumably its capacity
            -- confirm).
        max_epoch: Upper bound on the number of loop iterations.
        batch_size: Forwarded to ``agent.update_parameter``.
        update_gap: Forwarded to ``agent.update_parameter``.
        gamma: Forwarded to ``agent.update_parameter``.
        exp_noise: Forwarded to ``agent.inactive_in_env``.
        pol_noise: Forwarded to ``agent.update_parameter``.
        reward_scale: Passed as ``size`` to ``RewardNormalization``.
        **_kwargs: Accepted and ignored, so callers may pass a larger
            argument dict.

    Returns:
        ``True`` after results and memory have been saved. ``False`` if
        either loss is NaN or an ``AssertionError`` is raised in the loop;
        in those cases nothing is saved by this function.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    # A falsy max_action marks a discrete action space, for which one memory
    # column is reserved for the action.
    if max_action:
        action_width = action_dim
    else:
        action_width = 1

    # Row width: two scalar columns, two state-sized spans and the action.
    memo = Memories(max_memo, memo_dim=2 + 2 * state_dim + action_width)
    memo.save_or_load_memo(cwd, is_save=False)

    recorder = Recorder(agent, max_step, max_action, target_reward, env_name)
    r_norm = RewardNormalization(
        n_max=target_reward, n_min=recorder.reward_avg, size=reward_scale
    )

    try:
        for epoch in range(max_epoch):
            # Interaction with the env needs no autograd graph.
            with torch.no_grad():
                ep_rewards, ep_steps = agent.inactive_in_env(
                    env, memo, max_step, exp_noise, max_action, r_norm
                )
                memo.refresh_indices()

            # The total number of steps just taken is passed to the update.
            a_loss, c_loss = agent.update_parameter(
                memo, sum(ep_steps), batch_size, pol_noise, update_gap, gamma
            )

            # Abort without saving when training has diverged.
            if any(np.isnan(loss) for loss in (a_loss, c_loss)):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            # Reporting/evaluation needs no autograd graph either.
            with torch.no_grad():
                recorder.show_reward(epoch, ep_rewards, ep_steps, a_loss, c_loss)
                solved = recorder.check_reward(cwd, a_loss, c_loss)

            if solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    except AssertionError:
        # Workaround kept from 2020-03-03 for an assertion raised by the
        # BipedWalker environment.
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    # The recorder writes its results; the agent is not saved explicitly
    # here, but the memory is.
    elapsed = recorder.show_and_save(env_name, cwd)
    memo.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, elapsed)
    return True
def train_agent_ppo(
        agent_class,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,
        batch_size,
        gamma,
        **_kwargs
):
    """Run the training loop that uses the agent's ``*_ppo`` methods.

    Builds the gym environment and the agent (calling
    ``save_or_load_model`` with ``is_save=False``), plus an
    ``AutoNormalization`` object shared between the ``Recorder`` and the
    agent's environment interaction. No persistent memory object is created
    here: each epoch ``agent.inactive_in_env_ppo`` returns a ``memory`` that
    is handed straight to ``agent.update_parameter_ppo``. The loop stops
    early as soon as ``recorder.check_reward`` returns a truthy value. A
    ``KeyboardInterrupt`` inside the loop is caught and reported; results
    are still saved afterwards.

    Args:
        agent_class: Callable invoked as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: Environment id handed to ``gym.make``; also forwarded to
            ``Recorder`` and ``recorder.show_and_save``.
        cwd: Forwarded to ``agent.save_or_load_model``,
            ``recorder.check_reward``, ``recorder.show_and_save`` and
            ``draw_plot_with_npy`` (presumably the output directory
            -- confirm).
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to ``Recorder`` and
            ``agent.inactive_in_env_ppo``.
        max_memo: Forwarded to ``agent.inactive_in_env_ppo``.
        max_epoch: Upper bound on the number of loop iterations; also the
            denominator of the ``ep_ratio`` schedule.
        batch_size: Forwarded to ``agent.update_parameter_ppo``.
        gamma: Forwarded to ``agent.update_parameter_ppo``.
        **_kwargs: Accepted and ignored, so callers may pass a larger
            argument dict.

    Returns:
        ``True`` after results have been saved. ``False`` if either loss is
        NaN or an ``AssertionError`` is raised in the loop; in those cases
        nothing is saved by this function.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env)
    state_dim, action_dim, max_action, target_reward = env_info

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    # The same normalizer instance is given to the recorder and to the
    # agent's environment interaction.
    state_norm = AutoNormalization((state_dim,), clip=6.0)
    recorder = Recorder(
        agent, max_step, max_action, target_reward, env_name,
        state_norm=state_norm,
    )

    try:
        for epoch in range(max_epoch):
            # Interaction with the env needs no autograd graph.
            with torch.no_grad():
                ep_rewards, ep_steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, state_norm
                )

            # Goes from 1 at the first epoch towards 0 at the last.
            remaining = 1 - epoch / max_epoch
            total_loss, value_loss = agent.update_parameter_ppo(
                memory, batch_size, gamma, ep_ratio=remaining
            )

            # Abort without saving when training has diverged.
            if any(np.isnan(loss) for loss in (total_loss, value_loss)):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            # Reporting/evaluation needs no autograd graph either. Note that
            # the value loss is passed before the total loss.
            with torch.no_grad():
                recorder.show_reward(epoch, ep_rewards, ep_steps, value_loss, total_loss)
                solved = recorder.check_reward(cwd, value_loss, total_loss)

            if solved:
                # NOTE(review): looks like leftover debug output -- confirm
                # whether it is still wanted.
                print(';;;', solved)
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    except AssertionError:
        # Workaround kept from 2020-03-03 for an assertion raised by the
        # BipedWalker environment.
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    # The recorder writes its results; the agent is not saved explicitly
    # here.
    elapsed = recorder.show_and_save(env_name, cwd)
    draw_plot_with_npy(cwd, elapsed)
    return True