def train_agent(rl_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
                env_name, max_memo, max_step, max_total_step, eval_times1, eval_times2,
                gpu_id, show_gap, if_stop, **_kwargs):  # 2020-06-01
    env, state_dim, action_dim, max_action, target_reward, is_discrete = build_gym_env(env_name, is_print=False)

    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)  # todo eva_size1
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    is_online_policy = bool(rl_agent.__name__ in {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'})
    if is_online_policy:  # on-policy agents collect fresh trajectories for every update
        buffer = BufferTupleOnline(max_memo)
    else:  # off-policy agents reuse a replay buffer, so pre-fill it with random exploration
        buffer = BufferArray(max_memo, state_dim, 1 if is_discrete else action_dim)
        with torch.no_grad():  # update replay buffer without gradient tracking
            rewards, steps = initial_exploration(env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

    '''loop'''
    if_train = True
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # exploration needs no gradients (saves GPU memory)
            rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)

        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)
        # if loss_c > 4:  # todo backtracking
        #     agent.save_or_load_model(cwd, if_save=False)

        '''save the agent with max reward'''
        with torch.no_grad():  # evaluation needs no gradients (saves GPU memory)
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act, max_step, max_action, agent.device, is_discrete)
            recorder.save_act(cwd, agent.act, gpu_id) if if_save else None

            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id, show_gap)

        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or recorder.total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop.mark'))

    recorder.save_npy__plot_png(cwd)
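# --- Illustrative sketch: the two buffer patterns train_agent branches on ---
# The `is_online_policy` branch above reflects two storage designs: an on-policy
# buffer is grow-then-clear storage that is consumed once per update, while an
# off-policy replay buffer is a fixed-size ring that is overwritten and sampled
# at random many times. The two minimal classes below only illustrate that
# contrast; they are assumptions for exposition, not the repo's actual
# BufferTupleOnline / BufferArray implementations.

import numpy as np


class _OnlineBufferSketch:
    """Grow-then-clear storage, as used by on-policy agents (PPO-style)."""

    def __init__(self, max_memo):
        self.max_memo = max_memo
        self.storage = list()  # whole transitions, cleared after every policy update

    def append(self, transition):
        self.storage.append(transition)

    def clear(self):
        self.storage.clear()


class _ReplayBufferSketch:
    """Fixed-size ring storage, as used by off-policy agents (DDPG/TD3/SAC-style)."""

    def __init__(self, max_memo, memo_dim):
        self.memories = np.empty((max_memo, memo_dim), dtype=np.float32)
        self.max_memo = max_memo
        self.next_idx = 0  # write pointer that wraps around
        self.now_len = 0  # number of valid rows

    def append(self, memo_item):
        self.memories[self.next_idx] = memo_item
        self.next_idx = (self.next_idx + 1) % self.max_memo  # ring overwrite
        self.now_len = min(self.now_len + 1, self.max_memo)

    def random_sample(self, batch_size):
        indices = np.random.randint(0, self.now_len, size=batch_size)
        return self.memories[indices]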
def train_agent(rl_agent, env_name, gpu_id, cwd,
                net_dim, max_memo, max_step, batch_size, repeat_times, reward_scale, gamma,
                break_step, if_break_early, show_gap, eval_times1, eval_times2, **_kwargs):  # 2020-09-18
    env, state_dim, action_dim, target_reward, if_discrete = build_gym_env(env_name, if_print=False)

    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    if_online_policy = bool(rl_agent.__name__ in {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'})
    if if_online_policy:  # on-policy agents collect fresh trajectories for every update
        buffer = BufferTupleOnline(max_memo)
    else:  # off-policy agents get a pre-filled replay buffer and a warm-started target network
        buffer = BufferArray(max_memo, state_dim, 1 if if_discrete else action_dim)

        with torch.no_grad():  # update replay buffer without gradient tracking
            rewards, steps = initial_exploration(env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

        '''pre-training and hard update before the training loop'''
        buffer.init_before_sample()
        agent.update_parameters(buffer, max_step, batch_size, repeat_times)
        agent.act_target.load_state_dict(agent.act.state_dict())

    '''loop'''
    if_train = True
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)

        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        '''save the agent with max reward'''
        with torch.no_grad():  # evaluation needs no gradients (saves GPU memory)
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act, max_step, agent.device, if_discrete)
            recorder.save_act(cwd, agent.act, gpu_id) if if_save else None

            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id, show_gap)

        '''break loop rules'''
        if_train = not ((if_break_early and if_solve)
                        or recorder.total_step > break_step
                        or os.path.exists(f'{cwd}/stop'))

    recorder.save_npy__plot_png(cwd)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)
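# --- Usage sketch for the 2020-09-18 train_agent above ---
# A hedged example of how this function could be called. The environment name
# and every hyperparameter value are illustrative assumptions; only the keyword
# names come from the signature above. Pass in any agent class from this
# module's agent zoo (e.g. a SAC or PPO variant).


def demo__train_agent(agent_class):
    train_agent(
        rl_agent=agent_class,
        env_name='LunarLanderContinuous-v2',
        gpu_id=0,
        cwd='./logs/LunarLanderContinuous-v2',
        net_dim=2 ** 8,  # width of the actor/critic networks
        max_memo=2 ** 17,  # replay buffer capacity
        max_step=2 ** 10,  # max environment steps per episode
        batch_size=2 ** 8,
        repeat_times=2 ** 0,  # how many times the buffer is reused per update round
        reward_scale=2 ** 0,
        gamma=0.99,  # discount factor
        break_step=2 ** 20,  # stop after this many environment steps
        if_break_early=True,  # stop as soon as target_reward is reached
        show_gap=2 ** 6,  # gap between printed evaluation logs
        eval_times1=2,
        eval_times2=4,  # number of evaluation episodes (rough pass, then refined pass)
    )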
def mp__update_params(args, q_i_eva, q_o_eva):  # 2020-11-11, update network parameters using the replay buffer
    rl_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    reward_scale = args.reward_scale
    if_stop = args.if_break_early
    gamma = args.gamma
    del args

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    '''build agent'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    '''send agent to q_i_eva'''
    from copy import deepcopy
    act_cpu = deepcopy(agent.act).to(torch.device("cpu"))  # CPU copy of the actor for the evaluator process
    act_cpu.eval()
    [setattr(param, 'requires_grad', False) for param in act_cpu.parameters()]
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    '''build replay buffer, init: total_step, reward_avg'''
    total_step = 0
    if bool(rl_agent.__name__ in {'AgentPPO', }):
        buffer = BufferTupleOnline(max_memo)
        with torch.no_grad():
            reward_avg = get_total_return(env, act_cpu, max_step, torch.device("cpu"), if_discrete)
    elif bool(rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}):
        buffer = BufferArrayGPU(max_memo + max_step, state_dim, action_dim, if_ppo=True)  # experience replay buffer
        with torch.no_grad():
            reward_avg = get_total_return(env, act_cpu, max_step, torch.device("cpu"), if_discrete)
    else:
        buffer = BufferArrayGPU(max_memo, state_dim, action_dim=1 if if_discrete else action_dim, if_ppo=False)

        '''initial exploration'''
        with torch.no_grad():  # update replay buffer without gradient tracking
            rewards, steps = initial_exploration(env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        reward_avg = np.average(rewards)
        step_sum = sum(steps)

        '''pre-training and hard update before the training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

        q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva n.
        total_step += step_sum

    '''training loop'''
    if_train = True
    if_solve = False
    while if_train:
        '''update replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)

        reward_avg = np.average(rewards) if len(rewards) else reward_avg
        step_sum = sum(steps)
        total_step += step_sum

        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.update_pointer_before_sample()
        loss_a_avg, loss_c_avg = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        '''send the latest actor to the evaluator, which saves the agent with max reward'''
        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.

        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop'))

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)  # print the state-normalization statistics

    q_i_eva.put('stop')  # tell the evaluator process to exit
    while q_i_eva.qsize() > 0:  # wait until the evaluator has drained the queue
        time.sleep(1)
    time.sleep(4)
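# --- Wiring sketch for mp__update_params above ---
# A hedged example of how the updater process could be connected to an
# evaluator process with two multiprocessing queues. The repo provides its own
# evaluator worker; `_stub_evaluator` here is only a placeholder that drains
# q_i_eva (first the actor net, then (act, reward_avg, step_sum, loss_a, loss_c)
# tuples, finally 'stop') and answers with `if_solve` flags on q_o_eva.
# `demo__mp_train` and its `config` argument are assumptions for illustration;
# `config` just needs the attributes mp__update_params reads (rl_agent, cwd, ...).

import multiprocessing as mp


def _stub_evaluator(q_i_eva, q_o_eva):
    while True:
        item = q_i_eva.get()  # blocks until the updater sends something
        if item == 'stop':
            break
        q_o_eva.put(False)  # this stub never reports "solved"


def demo__mp_train(config):
    q_i_eva = mp.Queue(maxsize=16)  # updater -> evaluator (actor nets and logs)
    q_o_eva = mp.Queue()  # evaluator -> updater (if_solve flags)
    workers = [mp.Process(target=mp__update_params, args=(config, q_i_eva, q_o_eva)),
               mp.Process(target=_stub_evaluator, args=(q_i_eva, q_o_eva))]
    [p.start() for p in workers]
    [p.join() for p in workers]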