def train_agent__off_policy(class_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
                            env_name, max_step, max_memo, max_epoch, **_kwargs):  # 2020-06-01
    """Train an agent with a replay buffer for at most `max_epoch` epochs.

    Every epoch calls `agent.update_buffer` (environment interaction),
    `buffer.init_before_sample`, `agent.update_parameters` (gradient steps) and
    then lets the recorder print and check the reward.  Training ends when
    `recorder.check_reward` returns a truthy value, when `max_epoch` is
    exhausted, or when the user interrupts with Ctrl-C.

    Args:
        class_agent: Agent factory, called as `class_agent(state_dim, action_dim, net_dim)`.
        net_dim: Forwarded to `class_agent`.
        batch_size: Forwarded to `agent.update_parameters`.
        repeat_times: Forwarded to `agent.update_parameters`.
        gamma: Forwarded to `initial_exploration` and `agent.update_buffer`.
        reward_scale: Forwarded to `initial_exploration` and `agent.update_buffer`.
        cwd: Directory handed to the recorder, the model saver and the plotter.
        env_name: Name passed to `gym.make`.
        max_step: Forwarded to exploration, parameter updates and the recorder.
        max_memo: First argument of `BufferArray`.
        max_epoch: Upper bound on the number of training epochs.
        **_kwargs: Forwarded unchanged to `Recorder`.
    """
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)

    # init
    agent = class_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    # a discrete action is stored in a single buffer column
    buffer = BufferArray(max_memo, state_dim, action_dim=1 if is_discrete else action_dim)
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)

    # fill the replay buffer once before the training loop
    with torch.no_grad():
        rewards, steps = initial_exploration(env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    # Bound before the loop so the check after the try-block cannot raise
    # NameError when max_epoch == 0 or Ctrl-C arrives during the first epoch.
    is_solved = False
    try:
        for _ in range(max_epoch):
            # update replay buffer by interacting with the environment
            with torch.no_grad():  # no autograd bookkeeping while exploring
                rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)

            # update network parameters by sampling the buffer
            buffer.init_before_sample()
            loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

            # show/check the reward; the original comment notes that the
            # Recorder saves the max-reward agent automatically
            with torch.no_grad():
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("| raise KeyboardInterrupt and break training loop")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    draw_plot_with_npy(cwd, train_time)
def train_agent(  # 2020-11-11
        rl_agent, env_name, gpu_id, cwd,
        net_dim, max_memo, max_step, batch_size, repeat_times, reward_scale, gamma,
        break_step, if_break_early, show_gap, eval_times1, eval_times2, **_kwargs):  # 2020-09-18
    """Build env/agent/buffer/recorder and run the explore-update-evaluate loop.

    The buffer type is selected from `rl_agent.__name__`.  The loop ends when
    the task is solved (only if `if_break_early`), when
    `recorder.total_step > break_step`, or when the path `{cwd}/stop` exists.

    Args:
        rl_agent: Agent class, called as `rl_agent(state_dim, action_dim, net_dim)`.
        env_name: Forwarded to `build_env`.
        gpu_id: Forwarded to `recorder.save_act` and `recorder.check_is_solved`.
        cwd: Working directory used for saving and for the stop-file check.
        net_dim: Forwarded to `rl_agent`.
        max_memo: Buffer capacity argument.
        max_step: Forwarded to exploration, policy update and evaluation.
        batch_size: Forwarded to `agent.update_policy`.
        repeat_times: Forwarded to `agent.update_policy`.
        reward_scale: Forwarded to the exploration calls.
        gamma: Forwarded to the exploration calls.
        break_step: Step budget compared against `recorder.total_step`.
        if_break_early: Allow stopping as soon as the recorder reports solved.
        show_gap: Forwarded to `recorder.check_is_solved`.
        eval_times1: Passed to `Recorder` as `eval_size1`.
        eval_times2: Passed to `Recorder` as `eval_size2`.
        **_kwargs: Accepted and ignored.
    """
    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    # init: agent, buffer, recorder
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    agent_name = rl_agent.__name__
    if agent_name in {'AgentPPO', }:
        buffer = BufferTupleOnline(max_memo)
    elif agent_name in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArray(max_memo + max_step, state_dim, action_dim, if_ppo=True)
    else:
        stored_action_dim = 1 if if_discrete else action_dim
        buffer = BufferArray(max_memo, state_dim, action_dim=stored_action_dim, if_ppo=False)

        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; the warm-up below is assumed to belong to this branch
        # only (random exploration + pre-training) -- confirm against history.
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step, if_discrete,
                                                 reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

        # pre training and hard update before training loop
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

    # loop
    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)

        # update network parameters by sampling the buffer
        loss_a, loss_c = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        # save the agent with max reward
        recorder.update__record_explore(steps, rewards, loss_a, loss_c)
        if_save = recorder.update__record_evaluate(env, agent.act, max_step, agent.device, if_discrete)
        if if_save:
            recorder.save_act(cwd, agent.act, gpu_id)

        with torch.no_grad():
            if_solve = recorder.check_is_solved(target_reward, gpu_id, show_gap, cwd)

        # break loop rules (same evaluation order as one short-circuit `or` chain)
        if if_break_early and if_solve:
            break
        if recorder.total_step > break_step:
            break
        if os.path.exists(f'{cwd}/stop'):
            break

    recorder.save_npy__draw_plot(cwd)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)
def train_offline_policy(rl_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
                         env_name, max_step, max_memo, max_total_step, **_kwargs):  # 2020-06-01
    """Train an agent on a continuous-action env until solved or out of steps.

    Asserts that the environment is not discrete.  Each iteration explores
    with `agent.update_buffer`, updates the networks with
    `agent.update_parameters`, and asks the recorder whether the target reward
    was reached.  The loop also ends once
    `recorder.total_step > max_total_step`.  The model is saved only when the
    task was solved.

    Args:
        rl_agent: Agent class, called as `rl_agent(state_dim, action_dim, net_dim)`.
        net_dim: Forwarded to `rl_agent`.
        batch_size: Forwarded to `agent.update_parameters`.
        repeat_times: Forwarded to `agent.update_parameters`.
        gamma: Forwarded to the exploration calls.
        reward_scale: Forwarded to the exploration calls.
        cwd: Directory handed to the recorder, the model saver and the plotter.
        env_name: Name passed to `gym.make`.
        max_step: Forwarded to exploration, parameter updates and the recorder.
        max_memo: First argument of `BufferArray`.
        max_total_step: Step budget compared against `recorder.total_step`.
        **_kwargs: Forwarded unchanged to `Recorder`.
    """
    env = gym.make(env_name)
    env_info = get_env_info(env, is_print=False)
    state_dim, action_dim, max_action, target_reward, is_discrete = env_info
    assert not is_discrete

    # init: agent, buffer, recorder
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name, **_kwargs)

    # fill the replay buffer once before the training loop
    with torch.no_grad():
        rewards, steps = initial_exploration(env, buffer, max_step, max_action, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    is_solved = False
    while not is_solved:
        # update replay buffer by interacting with the environment
        with torch.no_grad():
            rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)

        # update network parameters by sampling the buffer
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        # show/check the reward; the original comment notes that the Recorder
        # saves the max-reward agent automatically
        with torch.no_grad():
            recorder.show_reward(rewards, steps, loss_a, loss_c)
            is_solved = recorder.check_reward(cwd, loss_a, loss_c)

        if is_solved:
            print('Reach target_reward: ', target_reward, recorder.reward_max)
        elif recorder.total_step > max_total_step:
            print('Reach target_step: ', max_total_step, recorder.total_step)
            break

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    draw_plot_with_npy(cwd, train_time)
def train_agent_discrete(
        class_agent, env_name, cwd, net_dim, max_step, max_memo, max_epoch,  # env
        batch_size, gamma, update_gap, reward_scale, **_kwargs):  # 2020-05-20
    """Train an agent on a discrete-action env for at most `max_epoch` epochs.

    Asserts that `action_max` reported by `get_env_info` is an `int` (the
    original comment says this means a Discrete action space).  Stops early
    when `recorder.check_reward` returns a truthy value or on Ctrl-C.

    Args:
        class_agent: Agent factory, called as `class_agent(env, state_dim, action_dim, net_dim)`.
        env_name: Name passed to `gym.make`.
        cwd: Directory handed to the recorder and the plotter.
        net_dim: Forwarded to `class_agent`.
        max_step: Forwarded to exploration, parameter updates and the recorder.
        max_memo: First argument of `BufferArray`.
        max_epoch: Upper bound on the number of training epochs.
        batch_size: Forwarded to `agent.update_parameters`.
        gamma: Forwarded to the exploration calls.
        update_gap: Last argument of `agent.update_parameters`.
        reward_scale: Forwarded to the exploration calls.
        **_kwargs: Forwarded unchanged to `Recorder`.

    Returns:
        Always `True`.
    """
    env = gym.make(env_name)

    # init
    env_info = get_env_info(env, is_print=True)
    state_dim, action_dim, action_max, target_reward = env_info
    assert isinstance(action_max, int)  # means Discrete action space

    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    buffer = BufferArray(max_memo, state_dim, action_dim=1)  # experience replay buffer
    recorder = Recorder(agent, max_step, action_max, target_reward, env_name, **_kwargs)

    # fill the replay buffer once before the training loop
    with torch.no_grad():
        rewards, steps = initial_exploration(env, buffer, max_step, action_max, reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, 0, 0)

    try:
        for _ in range(max_epoch):
            # update replay buffer by interacting with the environment
            with torch.no_grad():
                rewards, steps = agent.update_buffer(env, buffer, max_step, action_max, reward_scale, gamma)

            # update network parameters by sampling the buffer
            losses = agent.update_parameters(buffer, max_step, batch_size, update_gap)
            loss_a, loss_c = losses

            # show/check the reward; the original comment notes that the
            # Recorder saves the max-reward agent automatically
            with torch.no_grad():
                recorder.show_reward(rewards, steps, loss_a, loss_c)
                solved = recorder.check_reward(cwd, loss_a, loss_c)
            if solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")

    train_time = recorder.print_and_save_npy(env_name, cwd)
    draw_plot_with_npy(cwd, train_time)
    return True
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    """Aggregate worker transitions into one buffer and hand out batches.

    Each round reads one `(memo_array, is_solved)` item from `q_aggr` per
    entry of `qs_dist`, extends the buffer, then puts
    `max_step * repeat_times` random batches on every queue in `qs_dist`
    (`repeat_times` is fixed to 2).  After a round in which any item was
    flagged solved, the sampling pass still runs once and the function then
    prints an exit message and returns.

    Args:
        q_aggr: Queue-like object providing `get()`.
        qs_dist: Sequence of queue-like objects providing `put()`.
        args: Object with `max_memo`, `env_name`, `max_step`, `batch_size`.
        **_kwargs: Accepted and ignored.
    """
    max_memo, env_name = args.max_memo, args.env_name
    max_step, batch_size = args.max_step, args.batch_size
    repeat_times = 2

    # init
    env = gym.make(env_name)
    env_info = get_env_info(env, is_print=False)
    state_dim, action_dim, _max_action, _target_reward, _is_discrete = env_info
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer

    n_workers = len(qs_dist)

    # loop
    while True:
        any_solved = False
        for _ in range(n_workers):
            memo_array, is_solved = q_aggr.get()
            buffer.extend_memo(memo_array)
            if is_solved:
                any_solved = True

        buffer.init_before_sample()
        for _ in range(max_step * repeat_times):
            # one fresh sample per queue ("slower but better" per the original
            # note, versus sharing a single sample across all queues)
            for q_dist in qs_dist:
                q_dist.put(buffer.random_sample(batch_size, device=None))

        if any_solved:
            break

    print('|| Exit: process__buffer')
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    """Aggregate worker transitions into one buffer and hand out batches forever.

    The buffer is first filled by `initial_exploration`.  Then, in an endless
    loop, one item per entry of `qs_dist` is read from `q_aggr` and appended
    to the buffer, after which `max_step * repeat_times` random batches are
    put on every queue in `qs_dist` (`repeat_times` is fixed to 2).  The
    function has no exit condition of its own.

    Args:
        q_aggr: Queue-like object providing `get()`.
        qs_dist: Sequence of queue-like objects providing `put()`.
        args: Object with `max_memo`, `env_name`, `max_step`, `batch_size`,
            `reward_scale` and `gamma`.
        **_kwargs: Accepted and ignored.
    """
    max_memo, env_name = args.max_memo, args.env_name
    max_step, batch_size = args.max_step, args.batch_size
    repeat_times = 2
    reward_scale, gamma = args.reward_scale, args.gamma

    # init
    env = gym.make(env_name)
    env_info = get_env_info(env, be_quiet=False)
    state_dim, action_dim, max_action, _target_reward = env_info
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer

    n_workers = len(qs_dist)

    # fill the replay buffer once before serving batches; the returned
    # statistics are not used here
    with torch.no_grad():
        _rewards, _steps = initial_exploration(env, buffer, max_step, max_action,
                                               reward_scale, gamma, action_dim)

    # loop
    while True:
        for _ in range(n_workers):
            buffer.extend_memo(q_aggr.get())

        buffer.init_before_sample()
        for _ in range(max_step * repeat_times):
            for q_dist in qs_dist:
                q_dist.put(buffer.random_sample(batch_size, device=None))
def train_agent(rl_agent, net_dim, batch_size, repeat_times, gamma, reward_scale, cwd,
                env_name, max_memo, max_step, max_total_step,
                eval_times1, eval_times2, gpu_id, show_gap, if_stop, **_kwargs):  # 2020-06-01
    """Build env/agent/buffer/recorder and run the explore-update-evaluate loop.

    Agents whose class name is one of 'AgentPPO', 'AgentGAE', 'AgentInterGAE',
    'AgentDiscreteGAE' get a `BufferTupleOnline`; all others get a
    `BufferArray`.  The loop ends when the task is solved (only if `if_stop`),
    when `recorder.total_step > max_total_step`, or when the path
    `{cwd}/stop.mark` exists.

    Args:
        rl_agent: Agent class, called as `rl_agent(state_dim, action_dim, net_dim)`.
        net_dim: Forwarded to `rl_agent`.
        batch_size: Forwarded to `agent.update_parameters`.
        repeat_times: Forwarded to `agent.update_parameters`.
        gamma: Forwarded to the exploration calls.
        reward_scale: Forwarded to the exploration calls.
        cwd: Working directory used for saving and for the stop-file check.
        env_name: Forwarded to `build_gym_env`.
        max_memo: Buffer capacity argument.
        max_step: Forwarded to exploration, parameter update and evaluation.
        max_total_step: Step budget compared against `recorder.total_step`.
        eval_times1: Passed to `Recorder` as `eval_size1`.
        eval_times2: Passed to `Recorder` as `eval_size2`.
        gpu_id: Forwarded to `recorder.save_act` and `recorder.check_is_solved`.
        show_gap: Forwarded to `recorder.check_is_solved`.
        if_stop: Allow stopping as soon as the recorder reports solved.
        **_kwargs: Accepted and ignored.
    """
    env_info = build_gym_env(env_name, is_print=False)
    env, state_dim, action_dim, max_action, target_reward, is_discrete = env_info

    # init: agent, buffer, recorder
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)  # todo eva_size1
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    online_agent_names = {'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    is_online_policy = rl_agent.__name__ in online_agent_names
    if is_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim, 1 if is_discrete else action_dim)

        # NOTE(review): the source this was reconstructed from had lost its
        # indentation; the initial exploration is assumed to belong to this
        # branch only -- confirm against history.
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                                 reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

    # loop
    while True:
        # update replay buffer by interacting with the environment
        with torch.no_grad():
            rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)

        # update network parameters by sampling the buffer
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size, repeat_times)

        # save the agent with max reward
        with torch.no_grad():
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act, max_step, max_action,
                                                       agent.device, is_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id, show_gap)

        # break loop rules (same evaluation order as one short-circuit `or` chain)
        if if_stop and if_solve:
            break
        if recorder.total_step > max_total_step:
            break
        if os.path.exists(f'{cwd}/stop.mark'):
            break

    recorder.save_npy__plot_png(cwd)
def run__tutorial_discrete_action():
    """Train a simplified DQN on CartPole-v0 as a tutorial.

    The original notes estimate roughly one minute of training.  This
    simplified DQN does not work well on harder tasks; other RL algorithms
    do, but they are more complicated.  Exercise: change this code so that
    training finishes within (10 sec, 10k steps).
    """
    env_name = 'CartPole-v0'  # a tutorial RL env
    env = gym.make(env_name)  # an OpenAI standard env

    # Hard-coded environment description.  The original kept this alternative
    # commented out:
    #     from AgentRun import get_env_info
    #     state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=True)
    # with the expected printout
    #     | env_name: <CartPoleEnv<CartPole-v0>>, action space: Discrete
    #     | state_dim: 4, action_dim: 2, action_max: 1, target_reward: 195.0
    state_dim, action_dim = 4, 2
    action_max = int(1)
    target_reward = 195.0
    is_discrete = True  # DQN is for a discrete action space

    # hyper-parameters (per the original note, copied from AgentDQN)
    net_dim = 2 ** 7  # the dimension (or width) of the network
    learning_rate = 2e-4  # learning rate for the Adam optimizer
    max_buffer = 2 ** 12  # capacity of the replay buffer
    max_epoch = 2 ** 12  # number of explore/update rounds
    max_step = 2 ** 9  # env steps per round before the network is updated
    gamma = 0.99  # reward discount factor (must be less than 1.0)
    batch_size = 2 ** 6  # batch size for network training
    criterion = torch.nn.MSELoss()  # loss for the q-value estimate
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")  # GPU if available

    # DQN is not an actor-critic method: the same Q-network picks the action
    # with the largest q value (actor role) and outputs q values (critic role).
    act = QNet(state_dim, action_dim, net_dim).to(device)
    act.train()
    act_optim = torch.optim.Adam(act.parameters(), lr=learning_rate)

    act_target = QNet(state_dim, action_dim, net_dim).to(device)
    act_target.load_state_dict(act.state_dict())
    act_target.eval()

    # BufferList from AgentRun is described as simpler but slower
    from AgentZoo import BufferArray  # faster but a bit complicated
    buffer = BufferArray(max_buffer, state_dim, action_dim=1)  # a discrete action is one int

    # training loop
    state = env.reset()
    episode_steps = 0  # steps of the running episode
    episode_return = 0.0  # reward sum of the running episode (with exploration)
    total_step = 0  # total steps before training stops

    evaluator = EvaluateRewardSV(env)  # SV: Simplify Version for tutorial
    # best evaluation reward so far (no exploration)
    max_reward = evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)

    explore_rate = 0.1  # epsilon of the epsilon-greedy exploration
    update_times = max_step
    start_time = time.time()
    for _epoch in range(max_epoch):
        # ---- update_buffer ----
        episode_returns = []
        episode_lengths = []
        for _step in range(max_step):
            if rd.rand() < explore_rate:  # epsilon-greedy: random action
                action = rd.randint(action_dim)
            else:
                state_batch = torch.tensor((state,), dtype=torch.float32, device=device)
                greedy_actions = act_target(state_batch).argmax(dim=1)
                action = greedy_actions.cpu().data.numpy()[0]

            next_state, reward, done, _info = env.step(action)
            episode_return += reward
            episode_steps += 1

            if done:
                mask = 0.0
            else:
                mask = gamma
            buffer.add_memo((reward, mask, state, action, next_state))

            state = next_state
            if done:
                episode_returns.append(episode_return)
                episode_lengths.append(episode_steps)
                episode_return = 0.0
                episode_steps = 0
                state = env.reset()

        total_step += sum(episode_lengths)
        avg_reward = np.average(episode_returns)
        print(end=f'Reward:{avg_reward:6.1f} Step:{total_step:8} ')

        # ---- update_parameters ----
        loss_c_sum = 0.0
        buffer.init_before_sample()  # update the buffer.now_len
        for _update in range(update_times):
            with torch.no_grad():
                batch = buffer.random_sample(batch_size, device)
                b_reward, b_mask, b_state, b_action, b_next_state = batch

                next_q = act_target(b_next_state).max(dim=1, keepdim=True)[0]
                q_target = b_reward + b_mask * next_q

            act.train()
            action_index = b_action.type(torch.long)
            q_eval = act(b_state).gather(1, action_index)
            critic_loss = criterion(q_eval, q_target)
            loss_c_sum += critic_loss.item()

            act_optim.zero_grad()
            critic_loss.backward()
            act_optim.step()

            # The original notes that a smaller tau (e.g. 5e-3) can stabilize
            # training in harder envs, but this env is easy.
            soft_target_update(act_target, act, tau=5e-2)

        loss_c_avg = loss_c_sum / update_times
        print(end=f'Loss:{loss_c_avg:6.1f} ')

        # ---- evaluate the agent without exploration ----
        eva_reward_list = [
            evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)
            for _ in range(3)
        ]
        eva_reward = np.average(eva_reward_list)
        print(f'TrueRewward:{eva_reward:6.1f}')

        if eva_reward > max_reward:
            max_reward = eva_reward

        if max_reward > target_reward:
            print(f"|\tReach target_reward: {max_reward:6.1f} > {target_reward:6.1f}")
            break

    used_time = int(time.time() - start_time)
    print(f"|\tTraining UsedTime: {used_time}s")
def run__tutorial_continuous_action():
    """Train a simplified DDPG on Pendulum-v0 as a tutorial.

    The original notes estimate about 300 s of training.  No OU process is
    used for exploration (the author dislikes its many hyper-parameters).
    This simplified DDPG does not work well on harder tasks; other RL
    algorithms do, but they are more complicated.  Exercise: change this code
    so that training finishes in 100 s.
    """
    env_name = 'Pendulum-v0'  # a tutorial RL env
    env = gym.make(env_name)  # an OpenAI standard env

    # Hard-coded environment description.  The original kept this alternative
    # commented out:
    #     from AgentRun import get_env_info
    #     state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
    #         env, is_print=True, target_reward=-100.0)
    # with the expected printout
    #     | env_name: <PendulumEnv<Pendulum-v0>>, action space: Continuous
    #     | state_dim: 3, action_dim: 1, action_max: 2.0, target_reward: 0.0
    state_dim, action_dim = 3, 1
    action_max = 2.0
    target_reward = -100.0
    is_discrete = False  # this tutorial uses a continuous action space

    # hyper-parameters
    net_dim = 2 ** 5  # the dimension (or width) of the network
    learning_rate = 2e-4  # learning rate for the Adam optimizer
    max_buffer = 2 ** 14  # capacity of the replay buffer
    max_epoch = 2 ** 12  # number of explore/update rounds
    max_step = 2 ** 8  # env steps per round before the networks are updated
    gamma = 0.99  # reward discount factor (must be less than 1.0)
    batch_size = 2 ** 7  # batch size for network training
    update_freq = 2 ** 7  # hard target update every `update_freq` update steps
    criterion = torch.nn.SmoothL1Loss()  # loss for the critic's q-value estimate
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")  # GPU if available

    # actor and its target copy
    act_dim = net_dim
    act = Actor(state_dim, action_dim, act_dim).to(device)
    act.train()
    act_optim = torch.optim.Adam(act.parameters(), lr=learning_rate)

    act_target = Actor(state_dim, action_dim, act_dim).to(device)
    act_target.load_state_dict(act.state_dict())
    act_target.eval()

    # critic and its target copy
    cri_dim = int(net_dim * 1.25)
    cri = Critic(state_dim, action_dim, cri_dim).to(device)
    cri.train()
    cri_optim = torch.optim.Adam(cri.parameters(), lr=learning_rate)

    cri_target = Critic(state_dim, action_dim, cri_dim).to(device)
    cri_target.load_state_dict(cri.state_dict())
    cri_target.eval()

    # BufferList from AgentRun is described as simpler but slower
    from AgentZoo import BufferArray  # faster but a bit complicated
    buffer = BufferArray(max_buffer, state_dim, action_dim)  # experience replay buffer

    # training loop
    state = env.reset()
    episode_steps = 0  # steps of the running episode
    episode_return = 0.0  # reward sum of the running episode (with exploration)
    total_step = 0
    explore_noise = 0.05

    evaluator = EvaluateRewardSV(env)  # SV: Simplify Version for tutorial
    # best evaluation reward so far (no exploration)
    max_reward = evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)

    start_time = time.time()

    # collect uniformly random transitions before training
    while total_step < max_step:
        for _step in range(max_step):
            random_action = rd.uniform(-1, 1, size=action_dim)
            next_state, reward, done, _info = env.step(random_action * action_max)
            if done:
                mask = 0.0
            else:
                mask = gamma
            buffer.add_memo((reward, mask, state, random_action, next_state))
            total_step += 1
            if done:
                state = env.reset()
                break
            state = next_state

    explore_rate = 0.5  # probability of adding Gaussian noise to the action
    update_times = max_step
    for _epoch in range(max_epoch):
        # ---- update_buffer ----
        reward_list = []
        step_list = []
        for _step in range(max_step):
            state_batch = torch.tensor((state,), dtype=torch.float32, device=device)
            policy_actions = act_target(state_batch).cpu().data.numpy()  # continuous action
            action = policy_actions[0]
            if rd.rand() < explore_rate:
                action = rd.normal(action, explore_noise).clip(-1, +1)

            next_state, reward, done, _info = env.step(action * action_max)
            episode_return += reward
            episode_steps += 1

            if done:
                mask = 0.0
            else:
                mask = gamma
            buffer.add_memo((reward, mask, state, action, next_state))

            state = next_state
            if done:
                reward_list.append(episode_return)
                step_list.append(episode_steps)
                episode_return = 0.0
                episode_steps = 0
                state = env.reset()

        total_step += sum(step_list)
        avg_reward = np.average(reward_list)
        print(end=f'Reward:{avg_reward:8.1f} Step:{total_step:8} ')

        # ---- update_parameters ----
        loss_a_sum = 0.0
        loss_c_sum = 0.0
        buffer.init_before_sample()  # update the buffer.now_len
        for i in range(update_times):
            # Two Time-scale Update Rule (TTUR): two critic steps per actor step
            for _critic_step in range(2):
                with torch.no_grad():
                    batch = buffer.random_sample(batch_size, device)
                    b_reward, b_mask, b_state, b_action, b_next_state = batch

                    next_action = act_target(b_next_state)
                    next_q_target = cri_target(b_next_state, next_action)
                    q_target = b_reward + b_mask * next_q_target

                q_eval = cri(b_state, b_action)
                critic_loss = criterion(q_eval, q_target)
                loss_c_sum += critic_loss.item()

                cri_optim.zero_grad()
                critic_loss.backward()
                cri_optim.step()

            # policy gradient on the states of the last critic batch
            action_pg = act(b_state)
            actor_loss = -cri(b_state, action_pg).mean()
            loss_a_sum += actor_loss.item()

            act_optim.zero_grad()
            actor_loss.backward()
            act_optim.step()

            # hard target update (the soft update with tau=5e-3 was left
            # disabled in the original)
            if i % update_freq == 0:
                cri_target.load_state_dict(cri.state_dict())
                act_target.load_state_dict(act.state_dict())

        loss_c_avg = loss_c_sum / (update_times * 2)
        loss_a_avg = loss_a_sum / update_times
        print(end=f'LossC:{loss_c_avg:6.1f} LossA:{loss_a_avg:6.1f} ')

        # ---- evaluate the agent without exploration ----
        eva_reward_list = [
            evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)
            for _ in range(3)
        ]
        eva_reward = np.average(eva_reward_list)
        print(f'TrueRewward:{eva_reward:8.1f}')

        if eva_reward > max_reward:
            max_reward = eva_reward

        if max_reward > target_reward:
            print(f"|\tReach target_reward: {max_reward:6.1f} > {target_reward:6.1f}")
            break

    used_time = int(time.time() - start_time)
    print(f"|\tTraining UsedTime: {used_time}s")