def train_agent(
        rl_agent, env_name, gpu_id, cwd,
        net_dim, max_memo, max_step, batch_size, repeat_times, reward_scale, gamma,
        break_step, if_break_early, show_gap, eval_times1, eval_times2, **_kwargs):  # 2020-09-18
    """Train one agent on one environment until a stop rule is met.

    Every iteration collects experience with ``agent.update_buffer``, updates
    the networks with ``agent.update_parameters``, then lets the recorder
    evaluate the actor, save it when the recorder asks for it, and dump its
    curves to ``cwd``.

    The loop ends as soon as one of these holds:
      * ``if_break_early`` is truthy and the recorder reports the task solved;
      * ``recorder.total_step`` is greater than ``break_step``;
      * the file ``{cwd}/stop.mark`` exists.

    NOTE(review): this body was rebuilt from whitespace-collapsed source. The
    initial exploration and the pre-training step are assumed to belong to
    the off-policy branch only -- confirm against the upstream file.

    :param rl_agent: agent class, called as ``rl_agent(state_dim, action_dim, net_dim)``.
    :param env_name: forwarded to ``build_gym_env``.
    :param gpu_id: forwarded to ``recorder.save_act`` and ``recorder.check_is_solved``.
    :param cwd: directory used for saved actors, curves and the stop file.
    :param net_dim: forwarded to the agent constructor.
    :param max_memo: capacity argument of the replay buffer.
    :param max_step: forwarded to exploration, update and evaluation calls.
    :param batch_size: forwarded to ``agent.update_parameters``.
    :param repeat_times: forwarded to ``agent.update_parameters``.
    :param reward_scale: forwarded to the exploration calls.
    :param gamma: forwarded to the exploration calls.
    :param break_step: step budget compared against ``recorder.total_step``.
    :param if_break_early: whether a solved task ends the training.
    :param show_gap: forwarded to ``recorder.check_is_solved``.
    :param eval_times1: passed to ``Recorder`` as ``eval_size1``.
    :param eval_times2: passed to ``Recorder`` as ``eval_size2``.
    :param _kwargs: extra keyword arguments, ignored.
    """
    env, state_dim, action_dim, target_reward, if_discrete = build_gym_env(env_name, if_print=False)

    # init: recorder, agent, buffer
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)  # todo eva_size1
    agent = rl_agent(state_dim, action_dim, net_dim)
    agent.state = env.reset()

    online_agent_names = {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if rl_agent.__name__ in online_agent_names:
        buffer = BufferTupleOnline(max_memo)
    else:
        stored_action_dim = 1 if if_discrete else action_dim
        buffer = BufferArray(max_memo, state_dim, stored_action_dim)

        # fill the replay buffer before any update
        with torch.no_grad():
            rewards, steps = initial_exploration(
                env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

        # todo pre training and hard update before loop
        buffer.init_before_sample()
        agent.update_parameters(buffer, max_step, batch_size, repeat_times)
        agent.act_target.load_state_dict(agent.act.state_dict())

    # training loop
    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)

        # update network parameters by sampling the buffer
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        # record, evaluate and save the actor when the recorder asks for it
        with torch.no_grad():
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(
                env, agent.act, max_step, agent.device, if_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id, show_gap)

        # break loop rules
        if ((if_break_early and if_solve)
                or recorder.total_step > break_step
                or os.path.exists(f'{cwd}/stop.mark')):
            break

    recorder.save_npy__plot_png(cwd)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)  # todo norm para
def train_agent_discrete(
        class_agent, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
        batch_size, gamma, update_gap, reward_scale, **_kwargs):  # 2020-05-20
    """Train an agent on a discrete-action gym environment for up to ``max_epoch`` epochs.

    The environment is created with ``gym.make`` and must report an ``int``
    ``action_max`` (asserted below). Each epoch collects experience, updates
    the networks and lets the recorder print and check the reward. Training
    ends early when ``recorder.check_reward`` returns a truthy value or on
    ``KeyboardInterrupt``. Afterwards the recorder data is saved and plotted.

    :param class_agent: agent class, called as ``class_agent(env, state_dim, action_dim, net_dim)``.
    :param env_name: gym environment id.
    :param cwd: working directory handed to the recorder and the plot helper.
    :param net_dim: forwarded to the agent constructor.
    :param max_step: forwarded to exploration and update calls.
    :param max_memo: capacity argument of the replay buffer.
    :param max_epoch: maximum number of training epochs.
    :param batch_size: forwarded to ``agent.update_parameters``.
    :param gamma: forwarded to the exploration calls.
    :param update_gap: last argument of ``agent.update_parameters``.
    :param reward_scale: forwarded to the exploration calls.
    :param _kwargs: forwarded to ``Recorder``.
    :return: always ``True``.
    """
    env = gym.make(env_name)

    # init
    state_dim, action_dim, action_max, target_reward = get_env_info(env, is_print=True)
    assert isinstance(action_max, int)  # means Discrete action space

    agent = class_agent(env, state_dim, action_dim, net_dim)
    buffer = BufferArray(max_memo, state_dim, action_dim=1)  # experience replay buffer
    recorder = Recorder(agent, max_step, action_max, target_reward, env_name, **_kwargs)

    # fill the replay buffer before the first update
    with torch.no_grad():
        rewards, steps = initial_exploration(
            env, buffer, max_step, action_max, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, 0, 0)

    try:
        for _ in range(max_epoch):
            # update replay buffer by interacting with the environment
            with torch.no_grad():
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, action_max, reward_scale, gamma)

            # update network parameters by sampling the buffer
            loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, update_gap)

            # show/check the reward; per the original author's note, the
            # Recorder saves the agent with max reward automatically
            with torch.no_grad():
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                if recorder.check_reward(cwd, loss_a, loss_c):
                    break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    # Historical note (2020-03-03): a disabled handler used to catch an
    # AssertionError from gym ("r.LengthSquared() > 0.0f") and return False.

    train_time = recorder.print_and_save_npy(env_name, cwd)
    draw_plot_with_npy(cwd, train_time)
    return True
def train_agent__off_policy(
        class_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
        env_name, max_step, max_memo, max_epoch, **_kwargs):  # 2020-06-01
    """Train an off-policy agent for up to ``max_epoch`` epochs.

    Each epoch collects experience with ``agent.update_buffer``, updates the
    networks with ``agent.update_parameters`` and lets the recorder print and
    check the reward. Training ends early when ``recorder.check_reward``
    returns a truthy value or on ``KeyboardInterrupt``. The recorder data is
    then saved and plotted; the model is saved only if the task was solved.

    :param class_agent: agent class, called as ``class_agent(state_dim, action_dim, net_dim)``.
    :param net_dim: forwarded to the agent constructor.
    :param batch_size: forwarded to ``agent.update_parameters``.
    :param repeat_times: forwarded to ``agent.update_parameters``.
    :param gamma: forwarded to the exploration calls.
    :param reward_scale: forwarded to the exploration calls.
    :param cwd: working directory for the recorder, the model and the plot.
    :param env_name: gym environment id.
    :param max_step: forwarded to exploration and update calls.
    :param max_memo: capacity argument of the replay buffer.
    :param max_epoch: maximum number of training epochs.
    :param _kwargs: forwarded to ``Recorder``.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)

    # init
    agent = class_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    buffer = BufferArray(max_memo, state_dim,
                         action_dim=1 if is_discrete else action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)  # unnecessary

    # fill the replay buffer before the first update
    with torch.no_grad():
        rewards, steps = initial_exploration(
            env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    # Bound up front: with max_epoch == 0, or an interrupt before the first
    # check_reward call, the final `if is_solved` would otherwise raise
    # UnboundLocalError.
    is_solved = False
    try:
        for _ in range(max_epoch):
            # update replay buffer by interacting with the environment
            with torch.no_grad():
                rewards, steps = agent.update_buffer(
                    env, buffer, max_step, max_action, reward_scale, gamma)

            # update network parameters by sampling the buffer
            buffer.init_before_sample()
            loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

            # show/check the reward; per the original author's note, the
            # Recorder saves the agent with max reward automatically
            with torch.no_grad():
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("| raise KeyboardInterrupt and break training loop")
    # Historical note (2020-03-03): a disabled handler used to catch an
    # AssertionError from gym ("r.LengthSquared() > 0.0f").

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
def train_offline_policy(
        rl_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
        env_name, max_step, max_memo, max_total_step, **_kwargs):  # 2020-06-01
    """Train an agent on a continuous-action gym environment until solved or out of steps.

    The environment must not be discrete (asserted below). The loop runs
    until ``recorder.check_reward`` returns a truthy value or
    ``recorder.total_step`` exceeds ``max_total_step``. The recorder data is
    then saved and plotted; the model is saved only if the task was solved.

    :param rl_agent: agent class, called as ``rl_agent(state_dim, action_dim, net_dim)``.
    :param net_dim: forwarded to the agent constructor.
    :param batch_size: forwarded to ``agent.update_parameters``.
    :param repeat_times: forwarded to ``agent.update_parameters``.
    :param gamma: forwarded to the exploration calls.
    :param reward_scale: forwarded to the exploration calls.
    :param cwd: working directory for the recorder, the model and the plot.
    :param env_name: gym environment id.
    :param max_step: forwarded to exploration and update calls.
    :param max_memo: capacity argument of the replay buffer.
    :param max_total_step: step budget compared against ``recorder.total_step``.
    :param _kwargs: forwarded to ``Recorder``.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)
    assert not is_discrete

    # init: agent, buffer, recorder
    agent = rl_agent(state_dim, action_dim, net_dim)
    agent.state = env.reset()

    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)  # unnecessary

    # fill the replay buffer before the first update
    with torch.no_grad():
        rewards, steps = initial_exploration(
            env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():
            rewards, steps = agent.update_buffer(
                env, buffer, max_step, max_action, reward_scale, gamma)

        # update network parameters by sampling the buffer
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        # show/check the reward; per the original author's note, the
        # Recorder saves the agent with max reward automatically
        with torch.no_grad():
            recorder.show_reward(rewards, steps, loss_a, loss_c)
            is_solved = recorder.check_reward(cwd, loss_a, loss_c)

        if is_solved:
            print('Reach target_reward: ', target_reward, recorder.reward_max)
            break
        elif recorder.total_step > max_total_step:
            print('Reach target_step: ', max_total_step, recorder.total_step)
            break

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    draw_plot_with_npy(cwd, train_time)
def train_agent(
        rl_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
        env_name, max_step, max_memo, max_total_step,
        eva_size, gpu_id, show_gap, **_kwargs):  # 2020-06-01
    """Train one agent on one environment until a stop rule is met.

    Every iteration collects experience, updates the networks, then lets the
    recorder evaluate the actor and save it when the recorder asks for it.

    The loop ends as soon as one of these holds:
      * the recorder reports the task solved;
      * ``recorder.total_step`` is greater than ``max_total_step``;
      * the file ``{cwd}/stop.mark`` exists.

    NOTE(review): this body was rebuilt from whitespace-collapsed source. The
    initial exploration is assumed to belong to the off-policy branch only
    -- confirm against the upstream file.

    :param rl_agent: agent class, called as ``rl_agent(state_dim, action_dim, net_dim)``.
    :param net_dim: forwarded to the agent constructor.
    :param batch_size: forwarded to ``agent.update_parameters``.
    :param repeat_times: forwarded to ``agent.update_parameters``.
    :param gamma: forwarded to the exploration calls.
    :param reward_scale: forwarded to the exploration calls.
    :param cwd: directory used for saved actors, curves and the stop file.
    :param env_name: forwarded to ``build_gym_env``.
    :param max_step: forwarded to exploration, update and evaluation calls.
    :param max_memo: capacity argument of the replay buffer.
    :param max_total_step: step budget compared against ``recorder.total_step``.
    :param eva_size: forwarded to ``recorder.update__record_evaluate``.
    :param gpu_id: forwarded to ``recorder.save_act`` and ``recorder.check_is_solved``.
    :param show_gap: forwarded to ``recorder.check_is_solved``.
    :param _kwargs: extra keyword arguments, ignored.
    """
    env, state_dim, action_dim, max_action, target_reward, is_discrete = build_gym_env(
        env_name, is_print=False)

    # init: recorder, agent, buffer
    recorder = Recorder()
    agent = rl_agent(state_dim, action_dim, net_dim)
    agent.state = env.reset()

    online_agent_names = {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if rl_agent.__name__ in online_agent_names:
        buffer = BufferTupleOnline(max_memo)
    else:
        stored_action_dim = 1 if is_discrete else action_dim
        buffer = BufferArray(max_memo, state_dim, stored_action_dim)

        # fill the replay buffer before any update
        with torch.no_grad():
            rewards, steps = initial_exploration(
                env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

    # training loop
    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():
            rewards, steps = agent.update_buffer(
                env, buffer, max_step, max_action, reward_scale, gamma)

        # update network parameters by sampling the buffer
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        # record, evaluate and save the actor when the recorder asks for it
        with torch.no_grad():
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            is_saved = recorder.update__record_evaluate(
                env, agent.act, max_step, max_action, eva_size, agent.device, is_discrete)
            if is_saved:
                recorder.save_act(cwd, agent.act, gpu_id)

            is_solved = recorder.check_is_solved(target_reward, gpu_id, show_gap)

        # break loop rules
        if (is_solved
                or recorder.total_step > max_total_step
                or os.path.exists(f'{cwd}/stop.mark')):
            break

    recorder.save_npy__plot_png(cwd)
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    """Aggregate experience from workers and hand sampled batches back to them.

    Each round takes one ``(memo_array, is_solved)`` item from ``q_aggr`` per
    entry of ``qs_dist``, extends the replay buffer with it, then puts
    ``max_step * repeat_times`` sampled batches on every queue in
    ``qs_dist``. The round in which any item reports ``is_solved`` is the
    last one.

    NOTE(review): this body was rebuilt from whitespace-collapsed source; the
    loop nesting is the most plausible reading -- confirm against upstream.

    :param q_aggr: queue-like object providing ``(memo_array, is_solved)`` items via ``get()``.
    :param qs_dist: sequence of queue-like objects receiving batches via ``put()``.
    :param args: object with ``max_memo``, ``env_name``, ``max_step``,
        ``batch_size``, ``reward_scale`` and ``gamma`` attributes.
    :param _kwargs: extra keyword arguments, ignored.
    """
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    reward_scale = args.reward_scale
    gamma = args.gamma
    repeat_times = 2  # fixed here, not read from args

    # init
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(env, is_print=False)
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer

    workers_num = len(qs_dist)
    batches_per_round = max_step * repeat_times

    # fill the replay buffer before the first round; the results are unused
    with torch.no_grad():
        _rewards, _steps = initial_exploration(
            env, buffer, max_step, max_action, reward_scale, gamma, action_dim)

    while True:
        # gather one chunk of experience per worker
        any_solved = False
        for _ in range(workers_num):
            memo_array, is_solved = q_aggr.get()
            buffer.extend_memo(memo_array)
            if is_solved:
                any_solved = True

        # distribute freshly sampled batches, one per queue per step
        buffer.init_before_sample()
        for _ in range(batches_per_round):
            for q_dist in qs_dist:
                q_dist.put(buffer.random_sample(batch_size, device=None))  # todo slower

        if any_solved:
            break

    print('|| Exit: process__buffer')
def mp__update_params(args, q_i_eva, q_o_eva):  # 2020-11-11 update network parameters using replay buffer
    """Run the training loop and stream CPU actor snapshots to an evaluator.

    A CPU copy of the actor is put on ``q_i_eva`` first. After that, one
    tuple ``(act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg)`` is put
    per update, and finally the string ``'stop'``. ``q_o_eva`` is polled for
    the evaluator's solved flag.

    The loop ends as soon as one of these holds:
      * ``args.if_break_early`` is truthy and the solved flag is truthy;
      * the accumulated step count exceeds ``args.break_step``;
      * the file ``{cwd}/stop`` exists.

    NOTE(review): this body was rebuilt from whitespace-collapsed source. The
    exploration and pre-training steps are placed in the non-PPO branch
    because ``step_sum`` is only defined there -- confirm against upstream.

    :param args: object with ``rl_agent``, ``max_memo``, ``net_dim``,
        ``max_step``, ``break_step``, ``batch_size``, ``repeat_times``,
        ``cwd``, ``env_name``, ``reward_scale``, ``if_break_early`` and
        ``gamma`` attributes.
    :param q_i_eva: queue-like object receiving items (``put``/``qsize``).
    :param q_o_eva: queue-like object providing the solved flag (``get``/``qsize``).
    """
    from copy import deepcopy

    rl_agent = args.rl_agent
    max_memo = args.max_memo
    net_dim = args.net_dim
    max_step = args.max_step
    max_total_step = args.break_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    cwd = args.cwd
    env_name = args.env_name
    reward_scale = args.reward_scale
    if_stop = args.if_break_early
    gamma = args.gamma
    del args

    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    # build agent
    agent = rl_agent(state_dim, action_dim, net_dim)
    agent.state = env.reset()

    # send a CPU copy of the actor, in eval mode and without gradients
    cpu_device = torch.device("cpu")
    act_cpu = deepcopy(agent.act).to(cpu_device)
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False
    q_i_eva.put(act_cpu)  # q_i_eva 1.

    # build replay buffer, init: total_step, reward_avg
    total_step = 0
    if rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArrayGPU(max_memo + max_step, state_dim, action_dim, if_ppo=True)
        with torch.no_grad():
            reward_avg = get_episode_reward(env, act_cpu, max_step, torch.device("cpu"), if_discrete)
    else:
        buffer = BufferArrayGPU(max_memo, state_dim,
                                action_dim=1 if if_discrete else action_dim, if_ppo=False)

        # initial exploration
        with torch.no_grad():
            rewards, steps = initial_exploration(
                env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        reward_avg = np.average(rewards)
        step_sum = sum(steps)

        # pre training and hard update before training loop
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

        q_i_eva.put((act_cpu, reward_avg, step_sum, 0, 0))  # q_i_eva n.
        total_step += step_sum

    # training loop
    if_solve = False
    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)
        if len(rewards):
            # keep the previous average when no reward was reported
            reward_avg = np.average(rewards)
        step_sum = sum(steps)
        total_step += step_sum

        # update network parameters by sampling the buffer
        buffer.update_pointer_before_sample()
        loss_a_avg, loss_c_avg = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        # refresh the CPU actor and report this update
        act_cpu.load_state_dict(agent.act.state_dict())
        q_i_eva.put((act_cpu, reward_avg, step_sum, loss_a_avg, loss_c_avg))  # q_i_eva n.

        if q_o_eva.qsize() > 0:
            if_solve = q_o_eva.get()  # q_o_eva n.

        # break loop rules
        if ((if_stop and if_solve)
                or total_step > max_total_step
                or os.path.exists(f'{cwd}/stop')):
            break

    # the environment is rebuilt here before its normalisation values are read
    env, _, _, _, _ = build_env(env_name, if_print=False)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)

    # signal the end, then wait until the queue reports empty
    q_i_eva.put('stop')
    while q_i_eva.qsize() > 0:
        time.sleep(1)
    time.sleep(4)
def train_agent(  # 2020-11-11
        rl_agent, env_name, gpu_id, cwd,
        net_dim, max_memo, max_step, batch_size, repeat_times, reward_scale, gamma,
        break_step, if_break_early, show_gap, eval_times1, eval_times2, **_kwargs):  # 2020-09-18
    """Train one agent on one environment until a stop rule is met.

    Every iteration collects experience with ``agent.update_buffer``, updates
    the networks with ``agent.update_policy``, then lets the recorder
    evaluate the actor and save it when the recorder asks for it.

    The loop ends as soon as one of these holds:
      * ``if_break_early`` is truthy and the recorder reports the task solved;
      * ``recorder.total_step`` is greater than ``break_step``;
      * the file ``{cwd}/stop`` exists.

    NOTE(review): this body was rebuilt from whitespace-collapsed source. The
    initial exploration and pre-training are assumed to belong to the final
    (non-PPO) branch only -- confirm against the upstream file.

    :param rl_agent: agent class, called as ``rl_agent(state_dim, action_dim, net_dim)``.
    :param env_name: forwarded to ``build_env``.
    :param gpu_id: forwarded to ``recorder.save_act`` and ``recorder.check__if_solved``.
    :param cwd: directory used for saved actors, curves and the stop file.
    :param net_dim: forwarded to the agent constructor.
    :param max_memo: capacity argument of the replay buffer.
    :param max_step: forwarded to exploration, update and evaluation calls.
    :param batch_size: forwarded to ``agent.update_policy``.
    :param repeat_times: forwarded to ``agent.update_policy``.
    :param reward_scale: forwarded to the exploration calls.
    :param gamma: forwarded to the exploration calls.
    :param break_step: step budget compared against ``recorder.total_step``.
    :param if_break_early: whether a solved task ends the training.
    :param show_gap: forwarded to ``recorder.check__if_solved``.
    :param eval_times1: passed to ``Recorder`` as ``eval_size1``.
    :param eval_times2: passed to ``Recorder`` as ``eval_size2``.
    :param _kwargs: extra keyword arguments, ignored.
    """
    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    # init: recorder, agent, buffer
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)
    agent.state = env.reset()

    if rl_agent.__name__ in {'AgentPPO', }:
        buffer = BufferTupleOnline(max_memo)
    elif rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArray(max_memo + max_step, state_dim, action_dim, if_ppo=True)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             action_dim=1 if if_discrete else action_dim, if_ppo=False)

        # fill the replay buffer before any update
        with torch.no_grad():
            rewards, steps = initial_exploration(
                env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

        # pre training and hard update before training loop
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

    # training loop
    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)

        # update network parameters by sampling the buffer
        # NOTE(review): no buffer.update_pointer_before_sample() call here,
        # unlike the pre-training step above -- confirm update_policy does
        # not rely on it.
        loss_a, loss_c = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        # record, evaluate and save the actor when the recorder asks for it
        recorder.update__record_explore(steps, rewards, loss_a, loss_c)

        if_save = recorder.update__record_evaluate(
            env, agent.act, max_step, agent.device, if_discrete)
        if if_save:
            recorder.save_act(cwd, agent.act, gpu_id)

        with torch.no_grad():
            if_solve = recorder.check__if_solved(target_reward, gpu_id, show_gap, cwd)

        # break loop rules
        if ((if_break_early and if_solve)
                or recorder.total_step > break_step
                or os.path.exists(f'{cwd}/stop')):
            break

    recorder.save_npy__draw_plot(cwd)
    print(f'SavedDir: {cwd}\n'
          f'UsedTime: {time.time() - recorder.start_time:.0f}')

    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)