def run__tutorial_discrete_action():
    """Train a simplified DQN agent on CartPole-v0 (tutorial).

    The function builds a Q-network and a target copy of it, then repeats
    the following for up to ``max_epoch`` epochs:

    1. Collect ``max_step`` environment steps with an epsilon-greedy policy
       and store the transitions in a replay buffer.
    2. Run ``max_step`` gradient steps on one-step TD targets computed with
       the target network, soft-updating the target network after each step.
    3. Evaluate the online network three times without exploration and print
       the averaged result.

    Training stops early once the best evaluation reward exceeds
    ``target_reward``. Progress and the elapsed wall-clock time are printed;
    nothing is returned.

    Notes from the original author: training should take roughly a minute.
    This simplified DQN does not work well on harder tasks; other RL
    algorithms do, but they are more complicated. As an exercise, change this
    code so that training finishes within about 10 seconds / 10k steps.
    """
    env_name = 'CartPole-v0'  # a tutorial RL env
    env = gym.make(env_name)  # an OpenAI standard env
    state_dim = 4
    action_dim = 2
    action_max = int(1)
    target_reward = 195.0
    is_discrete = True
    # The values above are hard-coded for the tutorial. They can also be
    # queried from the env:
    #   from AgentRun import get_env_info
    #   state_dim, action_dim, max_action, target_reward, is_discrete = \
    #       get_env_info(env, is_print=True)
    #   assert is_discrete is True  # DQN is for discrete action spaces.
    # which, per the original notes, prints:
    #   | env_name: <CartPoleEnv<CartPole-v0>>, action space: Discrete
    #   | state_dim: 4, action_dim: 2, action_max: 1, target_reward: 195.0

    # Hyper-parameters (copied from AgentDQN for the tutorial).
    net_dim = 2 ** 7  # the dimension (or width) of the network
    learning_rate = 2e-4  # learning rate for the Adam optimizer
    max_buffer = 2 ** 12  # the max number of transitions in the replay buffer
    max_epoch = 2 ** 12  # the max number of collect/update epochs
    max_step = 2 ** 9  # env steps collected (and gradient steps run) per epoch
    gamma = 0.99  # reward discount factor (must be less than 1.0)
    batch_size = 2 ** 6  # batch size for network training
    explore_rate = 0.1  # epsilon of the epsilon-greedy exploration policy
    criterion = torch.nn.MSELoss()  # loss for the q_value estimate
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Is QNet an actor or a critic? DQN is not an Actor-Critic method.
    # The agent picks the action with the largest q value, so the Q-network
    # acts as an actor; it also outputs q values, so it acts as a critic too.
    act = QNet(state_dim, action_dim, net_dim).to(device)
    act.train()
    act_optim = torch.optim.Adam(act.parameters(), lr=learning_rate)

    act_target = QNet(state_dim, action_dim, net_dim).to(device)
    act_target.load_state_dict(act.state_dict())
    act_target.eval()

    # from AgentRun import BufferList  # simpler but slower
    from AgentZoo import BufferArray  # faster but a bit complicated
    # Experience replay buffer; a discrete action is stored as a single int.
    buffer = BufferArray(max_buffer, state_dim, action_dim=1)

    start_time = None
    try:
        # Rollout state is kept across epochs so that an episode may span
        # two consecutive collection phases.
        self_state = env.reset()
        self_steps = 0  # steps of the current episode
        self_r_sum = 0.0  # sum of rewards of the current episode (with exploration)
        total_step = 0  # total steps of finished episodes so far

        evaluator = EvaluateRewardSV(env)  # SV: Simplify Version for tutorial
        # Best evaluation reward (without exploration) seen so far.
        max_reward = evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)

        start_time = time.time()
        for _epoch in range(max_epoch):
            # ---- update_buffer: collect transitions ----
            episode_rewards = []
            episode_steps = []
            for _ in range(max_step):
                if rd.rand() < explore_rate:  # epsilon-greedy: random action
                    action = rd.randint(action_dim)
                else:
                    states = torch.tensor((self_state,), dtype=torch.float32, device=device)
                    # Inference only: no autograd graph is needed here.
                    with torch.no_grad():
                        q_values = act_target(states)
                    action = q_values.argmax(dim=1).cpu().numpy()[0]

                next_state, reward, done, _ = env.step(action)
                self_r_sum += reward
                self_steps += 1

                # The mask folds the discount and the terminal flag together:
                # it is 0 at episode end so the TD target is the reward only.
                mask = 0.0 if done else gamma
                buffer.add_memo((reward, mask, self_state, action, next_state))

                self_state = next_state
                if done:
                    episode_rewards.append(self_r_sum)
                    self_r_sum = 0.0
                    episode_steps.append(self_steps)
                    self_steps = 0
                    self_state = env.reset()

            total_step += sum(episode_steps)
            # np.average([]) warns and yields nan; report nan without the warning.
            avg_reward = np.average(episode_rewards) if episode_rewards else float('nan')
            print(end=f'Reward:{avg_reward:6.1f} Step:{total_step:8} ')

            # ---- update_parameters: fit the Q-network ----
            loss_c_sum = 0.0
            update_times = max_step
            buffer.init_before_sample()  # update the buffer.now_len
            for _ in range(update_times):
                with torch.no_grad():
                    (batch_reward, batch_mask, batch_state,
                     batch_action, batch_next_state) = buffer.random_sample(batch_size, device)
                    next_q_target = act_target(batch_next_state).max(dim=1, keepdim=True)[0]
                    q_target = batch_reward + batch_mask * next_q_target

                act.train()
                batch_action = batch_action.type(torch.long)  # gather needs integer indices
                q_eval = act(batch_state).gather(1, batch_action)  # Q(s, a) of the stored actions
                critic_loss = criterion(q_eval, q_target)
                loss_c_sum += critic_loss.item()

                act_optim.zero_grad()
                critic_loss.backward()
                act_optim.step()

                # A smaller tau (e.g. 5e-3) can stabilize training in harder
                # envs, but this env is easy. Harder envs and other DRL
                # algorithms are available via run__xx() in AgentRun.py.
                soft_target_update(act_target, act, tau=5e-2)

            loss_c_avg = loss_c_sum / update_times
            print(end=f'Loss:{loss_c_avg:6.1f} ')

            # ---- evaluate the agent without exploration ----
            eva_reward_list = [
                evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)
                for _ in range(3)
            ]
            eva_reward = np.average(eva_reward_list)
            print(f'TrueRewward:{eva_reward:6.1f}')
            if eva_reward > max_reward:
                max_reward = eva_reward

            if max_reward > target_reward:
                print(
                    f"|\tReach target_reward: {max_reward:6.1f} > {target_reward:6.1f}"
                )
                break
    finally:
        env.close()  # release env resources even if training raised

    used_time = int(time.time() - start_time)
    print(f"|\tTraining UsedTime: {used_time}s")
def run__tutorial_continuous_action():
    """Train a simplified DDPG agent on Pendulum-v0 (tutorial).

    The function builds an actor and a critic (each with a target copy),
    pre-fills the replay buffer with uniformly random actions, and then
    repeats the following for up to ``max_epoch`` epochs:

    1. Collect ``max_step`` environment steps using the target actor; with
       probability ``explore_rate`` Gaussian noise is added to the action.
    2. Run ``max_step`` update rounds; each round does two critic updates
       followed by one actor update, and the target networks are hard-copied
       every ``update_freq`` rounds.
    3. Evaluate the online actor three times without exploration and print
       the averaged result.

    Training stops early once the best evaluation reward exceeds
    ``target_reward``. Progress and the elapsed wall-clock time are printed;
    nothing is returned.

    Notes from the original author: training should take about 300 s. No
    Ornstein-Uhlenbeck process is used for exploration because of its many
    hyper-parameters. This simplified DDPG does not work well on harder
    tasks; other RL algorithms do, but they are more complicated. As an
    exercise, change this code so that training finishes in about 100 s.
    """
    env_name = 'Pendulum-v0'  # a tutorial RL env
    env = gym.make(env_name)  # an OpenAI standard env
    state_dim = 3
    action_dim = 1
    action_max = 2.0
    target_reward = -100.0
    is_discrete = False
    # The values above are hard-coded for the tutorial. They can also be
    # queried from the env:
    #   from AgentRun import get_env_info
    #   state_dim, action_dim, max_action, target_reward, is_discrete = \
    #       get_env_info(env, is_print=True, target_reward=-100.0)
    #   assert is_discrete is False  # DDPG is for continuous action spaces.
    # The original notes show this output (target_reward differs from the
    # -100.0 used here -- TODO confirm against get_env_info):
    #   | env_name: <PendulumEnv<Pendulum-v0>>, action space: Continuous
    #   | state_dim: 3, action_dim: 1, action_max: 2.0, target_reward: 0.0

    # Hyper-parameters for the tutorial.
    net_dim = 2 ** 5  # the dimension (or width) of the network
    learning_rate = 2e-4  # learning rate for the Adam optimizer
    max_buffer = 2 ** 14  # the max number of transitions in the replay buffer
    max_epoch = 2 ** 12  # the max number of collect/update epochs
    max_step = 2 ** 8  # env steps collected (and update rounds run) per epoch
    gamma = 0.99  # reward discount factor (must be less than 1.0)
    batch_size = 2 ** 7  # batch size for network training
    update_freq = 2 ** 7  # update rounds between hard target-network copies
    explore_rate = 0.5  # probability of adding noise to an action in a rollout
    explore_noise = 0.05  # std of the Gaussian exploration noise
    criterion = torch.nn.SmoothL1Loss()  # loss for the critic's q_value estimate
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    act_dim = net_dim
    act = Actor(state_dim, action_dim, act_dim).to(device)
    act.train()
    act_optim = torch.optim.Adam(act.parameters(), lr=learning_rate)

    act_target = Actor(state_dim, action_dim, act_dim).to(device)
    act_target.load_state_dict(act.state_dict())
    act_target.eval()

    cri_dim = int(net_dim * 1.25)
    cri = Critic(state_dim, action_dim, cri_dim).to(device)
    cri.train()
    cri_optim = torch.optim.Adam(cri.parameters(), lr=learning_rate)

    cri_target = Critic(state_dim, action_dim, cri_dim).to(device)
    cri_target.load_state_dict(cri.state_dict())
    cri_target.eval()

    # from AgentRun import BufferList  # simpler but slower
    from AgentZoo import BufferArray  # faster but a bit complicated
    buffer = BufferArray(max_buffer, state_dim, action_dim)  # experience replay buffer

    start_time = None
    try:
        # Rollout state is kept across epochs so that an episode may span
        # two consecutive collection phases.
        self_state = env.reset()
        self_steps = 0  # the steps of the current episode
        self_r_sum = 0.0  # the sum of rewards of the current episode (with exploration)
        total_step = 0

        evaluator = EvaluateRewardSV(env)  # SV: Simplify Version for tutorial
        # Best evaluation reward (without exploration) seen so far.
        max_reward = evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)

        start_time = time.time()

        # ---- warm-up: fill the buffer with random actions before training ----
        while total_step < max_step:
            for _ in range(max_step):
                # Actions are stored normalized to [-1, 1] and scaled by
                # action_max only when passed to the env.
                action = rd.uniform(-1, 1, size=action_dim)
                next_state, reward, done, _ = env.step(action * action_max)
                mask = 0.0 if done else gamma
                buffer.add_memo((reward, mask, self_state, action, next_state))
                total_step += 1
                if done:
                    self_state = env.reset()
                    break
                self_state = next_state

        for _epoch in range(max_epoch):
            # ---- update_buffer: collect transitions ----
            reward_list = []
            step_list = []
            for _ in range(max_step):
                states = torch.tensor((self_state,), dtype=torch.float32, device=device)
                # Inference only: no autograd graph is needed here.
                with torch.no_grad():
                    action = act_target(states).cpu().numpy()[0]  # continuous action
                if rd.rand() < explore_rate:
                    action = rd.normal(action, explore_noise).clip(-1, +1)

                next_state, reward, done, _ = env.step(action * action_max)
                self_r_sum += reward
                self_steps += 1

                # The mask folds the discount and the terminal flag together:
                # it is 0 at episode end so the TD target is the reward only.
                mask = 0.0 if done else gamma
                buffer.add_memo((reward, mask, self_state, action, next_state))

                self_state = next_state
                if done:
                    reward_list.append(self_r_sum)
                    self_r_sum = 0.0
                    step_list.append(self_steps)
                    self_steps = 0
                    self_state = env.reset()

            total_step += sum(step_list)
            # np.average([]) warns and yields nan; report nan without the warning.
            avg_reward = np.average(reward_list) if reward_list else float('nan')
            print(end=f'Reward:{avg_reward:8.1f} Step:{total_step:8} ')

            # ---- update_parameters: fit critic and actor ----
            loss_a_sum = 0.0
            loss_c_sum = 0.0
            update_times = max_step
            buffer.init_before_sample()  # update the buffer.now_len
            for i in range(update_times):
                # Two Time-scale Update Rule (TTUR): two critic steps per actor step.
                for _ in range(2):
                    with torch.no_grad():
                        (batch_reward, batch_mask, batch_state,
                         batch_action, batch_next_state) = buffer.random_sample(batch_size, device)
                        next_action = act_target(batch_next_state)
                        next_q_target = cri_target(batch_next_state, next_action)
                        q_target = batch_reward + batch_mask * next_q_target

                    q_eval = cri(batch_state, batch_action)
                    critic_loss = criterion(q_eval, q_target)
                    loss_c_sum += critic_loss.item()

                    cri_optim.zero_grad()
                    critic_loss.backward()
                    cri_optim.step()

                # Policy gradient on the states of the most recent critic batch.
                action_pg = act(batch_state)
                actor_loss = -cri(batch_state, action_pg).mean()
                loss_a_sum += actor_loss.item()

                act_optim.zero_grad()
                actor_loss.backward()
                act_optim.step()

                # Soft target update (alternative, disabled):
                #   soft_target_update(cri_target, cri, tau=5e-3)
                #   soft_target_update(act_target, act, tau=5e-3)
                # Hard target update:
                if i % update_freq == 0:
                    cri_target.load_state_dict(cri.state_dict())
                    act_target.load_state_dict(act.state_dict())

            loss_c_avg = loss_c_sum / (update_times * 2)  # two critic steps per round
            loss_a_avg = loss_a_sum / update_times
            print(end=f'LossC:{loss_c_avg:6.1f} LossA:{loss_a_avg:6.1f} ')

            # ---- evaluate the agent without exploration ----
            eva_reward_list = [
                evaluator.get_eva_reward__sv(act, max_step, action_max, is_discrete)
                for _ in range(3)
            ]
            eva_reward = np.average(eva_reward_list)
            print(f'TrueRewward:{eva_reward:8.1f}')
            if eva_reward > max_reward:
                max_reward = eva_reward

            if max_reward > target_reward:
                print(
                    f"|\tReach target_reward: {max_reward:6.1f} > {target_reward:6.1f}"
                )
                break
    finally:
        env.close()  # release env resources even if training raised

    used_time = int(time.time() - start_time)
    print(f"|\tTraining UsedTime: {used_time}s")