def train_agent_discrete(class_agent, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                         batch_size, gamma, update_gap, reward_scale, **_kwargs):  # 2020-05-20
    env = gym.make(env_name)

    '''init'''
    state_dim, action_dim, action_max, target_reward = get_env_info(env, is_print=True)
    assert isinstance(action_max, int)  # an int action_max means a Discrete action space

    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    buffer = BufferArray(max_memo, state_dim, action_dim=1)  # experience replay buffer
    recorder = Recorder(agent, max_step, action_max, target_reward, env_name, **_kwargs)

    '''loop'''
    with torch.no_grad():  # fill the replay buffer before training
        rewards, steps = initial_exploration(
            env, buffer, max_step, action_max, reward_scale, gamma, action_dim)
        recorder.show_reward(rewards, steps, 0, 0)
    try:
        for epoch in range(max_epoch):
            '''update replay buffer by interacting with the environment'''
            with torch.no_grad():  # to save GPU memory
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, action_max, reward_scale, gamma)

            '''update network parameters by randomly sampling the buffer for stochastic gradient descent'''
            loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, update_gap)

            '''show/check the reward; the Recorder saves the max-reward actor automatically'''
            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")
    # except AssertionError:  # for BipedalWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
    #     return False

    train_time = recorder.print_and_save_npy(env_name, cwd)
    # agent.save_or_load_model(cwd, is_save=True)  # the Recorder already saves the max-reward agent
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
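
# A minimal usage sketch of train_agent_discrete(), assuming a discrete-action agent class
# (hypothetically named AgentDQN here) defined elsewhere in this repo.
# The hyper-parameter values below are illustrative, not tuned recommendations.
def run__train_agent_discrete__sketch():
    train_agent_discrete(
        class_agent=AgentDQN,  # assumed discrete-action agent class from this repo
        env_name='CartPole-v0', cwd='./CartPole_DQN',
        net_dim=2 ** 7, max_step=2 ** 10, max_memo=2 ** 16, max_epoch=2 ** 10,
        batch_size=2 ** 6, gamma=0.99, update_gap=2 ** 7, reward_scale=1.0,
    )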
def train_agent_sac(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                    batch_size, gamma, reward_scale, **_kwargs):  # 2020-04-30
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env)

    agent = agent_class(env, state_dim, action_dim, net_dim)
    memo = MemoryArray(max_memo, state_dim, action_dim)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, show_gap=2 ** 6)

    uniform_exploration(env, max_step, max_action, gamma, reward_scale, memo, action_dim)
    try:
        for epoch in range(max_epoch):
            with torch.no_grad():  # to save GPU memory
                rewards, steps = agent.update_memory(
                    env, memo, max_step, max_action, reward_scale, gamma)

            loss_a, loss_c = agent.update_parameter(memo, max_step, batch_size)

            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")
    # except AssertionError:  # for BipedalWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
    #     return False

    train_time = recorder.show_and_save(env_name, cwd)
    # agent.save_or_load_model(cwd, is_save=True)  # the Recorder already saves the max-reward agent
    # memo.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
def train_agent__off_policy(class_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
                            env_name, max_step, max_memo, max_epoch, **_kwargs):  # 2020-06-01
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)

    '''init'''
    agent = class_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    buffer = BufferArray(max_memo, state_dim,
                         action_dim=1 if is_discrete else action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)  # **_kwargs is unnecessary here

    '''loop'''
    with torch.no_grad():  # fill the replay buffer before training
        # rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(
            env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
        recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    is_solved = False
    try:
        for epoch in range(max_epoch):
            # update replay buffer by interacting with the environment
            with torch.no_grad():  # to save GPU memory
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, max_action, reward_scale, gamma)

            # update network parameters by randomly sampling the buffer for gradient descent
            buffer.init_before_sample()
            loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

            # show/check the reward; the Recorder saves the max-reward actor automatically
            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("| raise KeyboardInterrupt and break training loop")
    # except AssertionError:  # for BipedalWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
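
# A minimal usage sketch of train_agent__off_policy(), assuming an off-policy continuous-control
# agent class (hypothetically AgentSAC) defined elsewhere in this repo. Values are illustrative.
def run__train_agent__off_policy__sketch():
    train_agent__off_policy(
        class_agent=AgentSAC,  # assumed off-policy agent class from this repo
        net_dim=2 ** 7, batch_size=2 ** 7, repeat_times=1,
        gamma=0.99, reward_scale=1.0,
        cwd='./LunarLander_SAC', env_name='LunarLanderContinuous-v2',
        max_step=2 ** 10, max_memo=2 ** 17, max_epoch=2 ** 10,
    )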
def train_agent(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                batch_size, update_gap, gamma, exp_noise, pol_noise, reward_scale,  # update
                **_kwargs):  # 2020-04-30
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env)

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    memo_action_dim = action_dim if max_action else 1  # max_action == 0 indicates a Discrete action space
    memo = Memories(max_memo, memo_dim=1 + 1 + state_dim + memo_action_dim + state_dim)
    memo.save_or_load_memo(cwd, is_save=False)

    recorder = Recorder(agent, max_step, max_action, target_reward, env_name)
    r_norm = RewardNormalization(n_max=target_reward, n_min=recorder.reward_avg, size=reward_scale)

    try:
        for epoch in range(max_epoch):
            with torch.no_grad():  # to save GPU memory
                rewards, steps = agent.inactive_in_env(
                    env, memo, max_step, exp_noise, max_action, r_norm)

            memo.refresh_indices()
            actor_loss, critic_loss = agent.update_parameter(
                memo, sum(steps), batch_size, pol_noise, update_gap, gamma)
            if np.isnan(actor_loss) or np.isnan(critic_loss):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            with torch.no_grad():  # to save GPU memory
                # is_solved = recorder.show_and_check_reward(
                #     epoch, epoch_reward, iter_num, actor_loss, critic_loss, cwd)
                recorder.show_reward(epoch, rewards, steps, actor_loss, critic_loss)
                is_solved = recorder.check_reward(cwd, actor_loss, critic_loss)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")
    except AssertionError:  # for BipedalWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    train_time = recorder.show_and_save(env_name, cwd)
    # agent.save_or_load_model(cwd, is_save=True)  # the Recorder already saves the max-reward agent
    memo.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
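
# A usage sketch of train_agent(), whose exp_noise/pol_noise/update_gap signature matches a
# DDPG/TD3-style deterministic-policy agent. The agent class name (AgentTD3) and all values
# below are assumptions for illustration.
def run__train_agent__sketch():
    train_agent(
        agent_class=AgentTD3,  # assumed deterministic-policy agent class from this repo
        env_name='Pendulum-v0', cwd='./Pendulum_TD3',
        net_dim=2 ** 7, max_step=2 ** 10, max_memo=2 ** 16, max_epoch=2 ** 10,
        batch_size=2 ** 6, update_gap=2 ** 7, gamma=0.99,
        exp_noise=0.1, pol_noise=0.2, reward_scale=1.0,
    )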
def train_agent_ppo(agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
                    batch_size, gamma, **_kwargs):  # 2020-04-30
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env)

    agent = agent_class(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    # memo_action_dim = action_dim if max_action else 1  # Discrete action space
    # memo = Memories(max_memo, memo_dim=1 + 1 + state_dim + memo_action_dim + state_dim)
    # memo.save_or_load_memo(cwd, is_save=False)

    state_norm = AutoNormalization((state_dim,), clip=6.0)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, state_norm=state_norm)
    # r_norm = RewardNorm(n_max=target_reward, n_min=recorder.reward_avg)

    try:
        for epoch in range(max_epoch):
            with torch.no_grad():  # to save GPU memory
                rewards, steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, state_norm)

            l_total, l_value = agent.update_parameter_ppo(
                memory, batch_size, gamma, ep_ratio=1 - epoch / max_epoch)
            if np.isnan(l_total) or np.isnan(l_value):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(epoch, rewards, steps, l_value, l_total)
                is_solved = recorder.check_reward(cwd, l_value, l_total)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")
    except AssertionError:  # for BipedalWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    train_time = recorder.show_and_save(env_name, cwd)
    # agent.save_or_load_model(cwd, is_save=True)  # the Recorder already saves the max-reward agent
    # memo.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
def train_agent_ppo(class_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
                    env_name, max_step, max_memo, max_epoch, **_kwargs):  # 2020-04-30
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)

    agent = class_agent(state_dim, action_dim, net_dim)
    agent.save_or_load_model(cwd, is_save=False)

    recorder = Recorder(agent, max_step, max_action, target_reward, env_name)
    # r_norm = RewardNorm(n_max=target_reward, n_min=recorder.reward_avg)
    # running_state = ZFilter((state_dim,), clip=5.0)

    try:
        for epoch in range(max_epoch):
            with torch.no_grad():  # to save GPU memory
                rewards, steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, reward_scale, gamma)

            loss_a, loss_c = agent.update_parameter_ppo(memory, batch_size, repeat_times)

            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")
    except AssertionError:  # for BipedalWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False

    train_time = recorder.print_and_save_npy(env_name, cwd)
    draw_plot_with_npy(cwd, train_time)
    return True
def process__workers(gpu_id, root_cwd, q_aggr, q_dist, args, **_kwargs):
    class_agent = args.class_agent
    env_name = args.env_name
    cwd = args.cwd
    net_dim = args.net_dim
    max_step = args.max_step
    # max_memo = args.max_memo
    max_epoch = args.max_epoch
    batch_size = int(args.batch_size * 1.5)  # int() keeps batch_size an integer after scaling
    gamma = args.gamma
    update_gap = args.update_gap
    reward_scale = args.reward_scale

    cwd = '{}/{}_{}'.format(root_cwd, cwd, gpu_id)
    os.makedirs(cwd, exist_ok=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    random_seed = 42 + gpu_id
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.set_default_dtype(torch.float32)
    torch.set_num_threads(8)

    env = gym.make(env_name)
    is_solved = False

    class BufferArrayMP(BufferArray):
        def init_before_sample(self):
            q_aggr.put((self.memories, is_solved))  # send memories to the aggregation queue
            # self.now_len = self.max_len if self.is_full else self.next_idx

        def random_sample(self, _batch_size, device=None):
            batch_arrays = q_dist.get()  # receive mini-batch arrays from the distribution queue
            '''convert arrays into torch.Tensor'''
            tensors = [torch.tensor(ary, device=device) for ary in batch_arrays]
            return tensors

    '''init'''
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=True)
    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    buffer = BufferArrayMP(max_step, state_dim, action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)

    '''loop'''
    # with torch.no_grad():  # fill the replay buffer before training
    #     # rewards, steps = agent.update_buffer(
    #     #     env, buffer, max_step, max_action, reward_scale, gamma)
    #     rewards, steps = initial_exploration(
    #         env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
    #     recorder.show_reward(rewards, steps, 0, 0)
    try:
        for epoch in range(max_epoch):
            '''update replay buffer by interacting with the environment'''
            with torch.no_grad():  # to save GPU memory
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, max_action, reward_scale, gamma)

            '''update network parameters by randomly sampling the buffer for stochastic gradient descent'''
            loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, update_gap)

            '''show/check the reward; the Recorder saves the max-reward actor automatically'''
            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")
    # except AssertionError:  # for BipedalWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
    #     return False

    train_time = recorder.print_and_save_npy(env_name, cwd)
    # agent.save_or_load_model(cwd, is_save=True)  # the Recorder already saves the max-reward agent
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
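
# A sketch, under assumptions, of how process__workers() could be launched: one worker per GPU,
# sharing two queues with a separate learner/sampler process that consumes q_aggr and fills q_dist.
# 'hyper_args' is assumed to be the same hyper-parameter container that process__workers() reads.
def run__process_workers__sketch(hyper_args, gpu_ids=(0, 1)):
    import torch.multiprocessing as mp
    q_aggr = mp.Queue(maxsize=8)  # workers push (memories, is_solved) tuples here
    q_dist = mp.Queue(maxsize=8)  # a separate sampler process is expected to push mini-batch arrays here
    workers = [mp.Process(target=process__workers,
                          args=(gpu_id, '.', q_aggr, q_dist, hyper_args))
               for gpu_id in gpu_ids]
    [p.start() for p in workers]
    [p.join() for p in workers]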
def train_agent_ppo(class_agent, batch_size, repeat_times, gamma, reward_scale, cwd,
                    env_name, max_step, net_dim, max_memo, max_epoch, **_kwargs):  # 2020-04-30
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)

    '''default hyper-parameters for the on-policy RL algorithm PPO
    max_memo = 2 ** 11
    repeat_times = 2 ** 3
    batch_size = 2 ** 8
    net_dim = 2 ** 7
    gamma = 0.99
    env_name = "LunarLanderContinuous-v2"
    env_name = "BipedalWalker-v3"
    '''

    '''init'''
    agent = class_agent(state_dim, action_dim, net_dim)
    buffer = BufferListPPO()  # on-policy algorithm; Generalized Advantage Estimation (GAE), ICLR 2016
    state_norm = AutoNormalization((state_dim,), clip=6.0)  # on-policy algorithms can normalize the state
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        state_norm=state_norm, **_kwargs)

    is_solved = False
    try:
        for epoch in range(max_epoch):
            # an on-policy algorithm refreshes its replay buffer before every parameter update
            buffer.storage = list()

            # update replay buffer by interacting with the environment
            with torch.no_grad():  # to save GPU memory
                rewards, steps = agent.update_buffer_ppo(
                    env, buffer, max_step, max_memo, max_action, reward_scale, gamma, state_norm)

            # update network parameters by randomly sampling the buffer for gradient descent
            loss_a, loss_c = agent.update_parameters_ppo(buffer, batch_size, repeat_times)

            # show/check the reward; the Recorder saves the max-reward actor automatically
            with torch.no_grad():  # to save GPU memory
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("KeyboardInterrupt raised while training.")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
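
# A usage sketch of train_agent_ppo() using the default hyper-parameters listed in its docstring.
# The agent class name (AgentPPO) is an assumption; substitute the PPO agent class defined in this repo.
def run__train_agent_ppo__sketch():
    train_agent_ppo(
        class_agent=AgentPPO,  # assumed PPO agent class from this repo
        batch_size=2 ** 8, repeat_times=2 ** 3, gamma=0.99, reward_scale=1.0,
        cwd='./BipedalWalker_PPO', env_name='BipedalWalker-v3',
        max_step=2 ** 10, net_dim=2 ** 7, max_memo=2 ** 11, max_epoch=2 ** 10,
    )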