def train_agent_ppo(
        agent_class, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
        batch_size, gamma,
        **_kwargs):  # 2020-0430
    """Train a PPO agent on a gym environment and save the training record.

    Each epoch collects a rollout with ``agent.inactive_in_env_ppo``, updates
    the networks with ``agent.update_parameter_ppo`` and reports the result to
    a ``Recorder``. Training stops early when the recorder reports the task as
    solved, when a loss becomes NaN, or on ``KeyboardInterrupt``.

    Args:
        agent_class: Agent constructor, called as
            ``agent_class(state_dim, action_dim, net_dim)``.
        env_name: Environment id passed to ``gym.make``.
        cwd: Directory handed to the agent, the recorder and the plot helper.
        net_dim: Forwarded to the agent constructor.
        max_step: Forwarded to the rollout call and to the ``Recorder``.
        max_memo: Forwarded to ``agent.inactive_in_env_ppo``.
        max_epoch: Maximum number of rollout/update iterations.
        batch_size: Forwarded to ``agent.update_parameter_ppo``.
        gamma: Forwarded to ``agent.update_parameter_ppo``.
        **_kwargs: Ignored; accepted so callers can pass a larger argument set.

    Returns:
        bool: ``False`` if a loss is NaN or an ``AssertionError`` is raised
        during training (nothing is saved or plotted in those cases);
        ``True`` otherwise, including after a ``KeyboardInterrupt``.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env)

    agent = agent_class(state_dim, action_dim, net_dim)
    # is_save=False: presumably loads an existing model from cwd -- TODO confirm.
    agent.save_or_load_model(cwd, is_save=False)

    state_norm = AutoNormalization((state_dim,), clip=5.0)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        state_norm=state_norm)

    try:
        for epoch in range(max_epoch):
            # Rollout needs no gradients; no_grad keeps GPU memory usage down.
            with torch.no_grad():
                rewards, steps, memory = agent.inactive_in_env_ppo(
                    env, max_step, max_memo, max_action, state_norm)

            # ep_ratio goes from 1 at the first epoch towards 0 at the last one.
            l_total, l_value = agent.update_parameter_ppo(
                memory, batch_size, gamma, ep_ratio=1 - epoch / max_epoch)

            if np.isnan(l_total) or np.isnan(l_value):
                print("ValueError: loss value should not be 'nan'. Please run again.")
                return False

            with torch.no_grad():  # for saving the GPU memory
                recorder.show_reward(epoch, rewards, steps, l_value, l_total)
                is_solved = recorder.check_reward(cwd, l_value, l_total)
            if is_solved:
                break
    except KeyboardInterrupt:
        # A manual stop still falls through to saving and plotting below.
        print("raise KeyboardInterrupt while training.")
    except AssertionError:  # for BipedWalker BUG 2020-03-03
        print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
        return False
    finally:
        # The environment is only used inside the training loop (the Recorder
        # is given env_name, not env), so release it on every exit path.
        env.close()

    train_time = recorder.show_and_save(env_name, cwd)

    # No explicit agent save here: per the original note, the Recorder saves
    # the agent with the max reward.
    draw_plot_with_npy(cwd, train_time)
    return True
def process__workers(gpu_id, root_cwd, q_aggr, q_dist, args, **_kwargs):
    """Run one training worker that is pinned to GPU ``gpu_id``.

    The worker interacts with its own gym environment, hands its collected
    memories to ``q_aggr`` and takes training batches from ``q_dist``.

    Args:
        gpu_id: Written to ``CUDA_VISIBLE_DEVICES``, added to the random seed
            and appended to the working-directory name.
        root_cwd: Parent directory of the worker's working directory.
        q_aggr: Queue receiving ``(memories, is_solved)`` tuples.
        q_dist: Queue from which sampled batch arrays are taken.
        args: Object providing ``class_agent``, ``env_name``, ``cwd``,
            ``net_dim``, ``max_step``, ``max_epoch``, ``batch_size``,
            ``gamma``, ``update_gap`` and ``reward_scale``.
        **_kwargs: Forwarded to the ``Recorder`` constructor.

    Returns:
        bool: Always ``True`` when the function returns normally.
    """
    # Read every hyper-parameter from ``args`` before touching the system.
    class_agent = args.class_agent
    env_name = args.env_name
    sub_dir = args.cwd
    net_dim = args.net_dim
    max_step = args.max_step
    max_epoch = args.max_epoch
    # NOTE(review): this is a float whenever args.batch_size is an int --
    # confirm that agent.update_parameters accepts a non-integer batch size.
    batch_size = args.batch_size * 1.5
    gamma = args.gamma
    update_gap = args.update_gap
    reward_scale = args.reward_scale

    # Per-worker directory: <root_cwd>/<args.cwd>_<gpu_id>
    cwd = '{}/{}_{}'.format(root_cwd, sub_dir, gpu_id)
    os.makedirs(cwd, exist_ok=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    # Each worker gets its own seed, derived from its GPU id.
    seed = 42 + gpu_id
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.set_default_dtype(torch.float32)
    torch.set_num_threads(8)

    env = gym.make(env_name)
    is_solved = False  # read by BufferArrayMP through the closure

    class BufferArrayMP(BufferArray):
        """BufferArray variant that samples through the two queues."""

        def init_before_sample(self):
            """Send this worker's memories and solved flag to ``q_aggr``."""
            q_aggr.put((self.memories, is_solved))

        def random_sample(self, _batch_size, device=None):
            """Take one batch from ``q_dist`` and return it as tensors.

            ``_batch_size`` is ignored: the batch is whatever arrives on the
            queue.
            """
            # convert array into torch.tensor
            return [torch.tensor(ary, device=device) for ary in q_dist.get()]

    # init
    (state_dim, action_dim, max_action,
     target_reward, _is_discrete) = get_env_info(env, is_print=True)
    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    buffer = BufferArrayMP(max_step, state_dim, action_dim)  # experiment replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)

    # loop
    try:
        for _ in range(max_epoch):
            # update replay buffer by interact with environment
            with torch.no_grad():  # for saving the GPU buffer
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, max_action, reward_scale, gamma)

            # update network parameters by random sampling buffer
            # for stochastic gradient descent
            loss_a, loss_c = agent.update_parameters(
                buffer, max_step, batch_size, update_gap)

            # show/check the reward, save the max reward actor
            # NOTICE! Recorder saves the agent with max reward automatically.
            with torch.no_grad():  # for saving the GPU buffer
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        # A manual stop still falls through to saving and plotting below.
        print("raise KeyboardInterrupt while training.")

    train_time = recorder.print_and_save_npy(env_name, cwd)
    draw_plot_with_npy(cwd, train_time)
    return True