def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. ###### # YOUR CODE HERE Q = q_func(input_arg, num_actions) Q_target = q_func(input_arg, num_actions) ###### # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. 
# At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### idx = replay_buffer.store_frame(last_obs) encoded_obs = replay_buffer.encode_recent_observation() if (t > learning_starts): action = select_epilson_greedy_action(Q, encoded_obs, t) else: action = random.randrange(num_actions) obs, reward, done, _ = env.step(action) replay_buffer.store_effect(idx, action, reward, done) if (done): last_obs = env.reset() else: last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. 
# you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE # # Alpha (learning rate) from the q function update isn't present in our code -- its in OptimizerSpec in main. # Move to GPU if possible # done flag in loop ---- SKIPPED IF DONE IS TRUE # clipping the error between -1 and 1 -- OK # backward the error meaning? # Suggestion for changing parameters - change exploration scehdule (main) # # Q.cuda() obs_batch, act_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size=batch_size) states = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0) actions = Variable(torch.from_numpy(act_batch).long()) rewards = Variable(torch.from_numpy(reward_batch).float()) next_states = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_dones = Variable(torch.from_numpy(1 - done_mask).type(dtype)) if USE_CUDA: states = states.cuda() actions = actions.cuda() rewards = rewards.cuda() next_states = next_states.cuda() Q.train() Q_target.eval() predicted_rewards = Q(states).gather(1, actions.unsqueeze(1)) #Q(s,a) next_max_Q = Q_target(next_states).detach().max(1)[ 0] #.unsqueeze(1) #Q_target(s,a) next_Q_values = not_dones * next_max_Q target_Q_values = rewards + (gamma * next_Q_values) #r + Q_target bellman_error = target_Q_values - predicted_rewards.squeeze(1) clipped_bellman_error = bellman_error.clamp(-1, 1) * (-1.0) optimizer.zero_grad() predicted_rewards.backward(clipped_bellman_error.data.unsqueeze(1)) optimizer.step() num_param_updates += +1 if (num_param_updates % target_update_freq == 0): Q_target.load_state_dict(Q.state_dict()) # for obs,act,reward,next_obs,done in zip(obs_batch,act_batch,reward_batch,next_obs_batch,done_mask): # if(done == 1.0): # continue # obs = Variable(torch.from_numpy(obs, ).type(dtype).unsqueeze(0) / 255.0, requires_grad=True) # next_obs = Variable(torch.from_numpy(next_obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=False) # current_Q = Q(obs) # predicted_reward = Variable(current_Q[0][act].unsqueeze(0), requires_grad=True) # target_reward = Q_target(next_obs).data.max(1)[0] # loss = loss_fn(reward + gamma * target_reward, predicted_reward).clamp(-1, 1) * (-1.0) # optimizer.zero_grad() # # should be current.backward(d_error.data.unsqueeze(1)) # # but it crashes on misfitting dims # predicted_reward.backward(loss.data.unsqueeze(1)) # optimizer.step() ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
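# Quick, self-contained sanity check (not part of the implementation above):
# clipping the Bellman error and injecting it as a gradient via
# predicted_rewards.backward(clipped_error), as done above, yields the same
# gradient as autograd on a summed Huber (smooth L1) loss -- which is why
# later variants in this collection simply call F.smooth_l1_loss.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
pred = torch.randn(32, requires_grad=True)   # stands in for Q(s, a)
target = torch.randn(32)                     # stands in for r + gamma * max_a' Q_target(s', a')

# Route 1: gradient injection, as in the code above.
d_error = (target - pred).clamp(-1, 1) * -1.0      # = clamp(pred - target, -1, 1)
pred.backward(d_error.detach())
grad_injected = pred.grad.clone()

# Route 2: autograd on a summed Huber loss.
pred.grad = None
loss = F.smooth_l1_loss(pred, target, reduction='sum')
loss.backward()

print(torch.allclose(grad_injected, pred.grad))    # True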
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. 
# Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)) # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = rew_batch + (gamma * next_Q_values) # Compute Bellman error bellman_error = target_Q_values - current_Q_values # clip the bellman error between [-1 , 1] clipped_bellman_error = bellman_error.clamp(-1, 1) # Note: clipped_bellman_delta * -1 will be right gradient d_error = clipped_bellman_error * -1.0 # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass current_Q_values.backward(d_error.data.unsqueeze(1)) # Perfom the update optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
def stopping_criterion(env):
    # notice that here t is the number of steps of the wrapped env,
    # which is different from the number of steps in the underlying env
    return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
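# get_wrapper_by_name comes from the course utilities and is not shown here.
# A minimal sketch consistent with how it is used above (walk gym's wrapper
# chain via .env until a wrapper whose class name matches) might look like
# this; the project's own helper should be preferred.
import gym

def get_wrapper_by_name(env, classname):
    current = env
    while True:
        if classname in current.__class__.__name__:
            return current
        elif isinstance(current, gym.Wrapper):
            current = current.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)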
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 return model(Variable(obs, volatile=True)).data.max(1)[1].view(1, 1) else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 rewards = [] episodes = [] reward_list = [] for t in count(): ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. 
recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: #action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] action = select_epilson_greedy_action(Q, recent_observations, t)[0][0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) # clip rewards between -1 and 1 reward = max(-1.0, min(reward, 1.0)) if t == 20000000: pylab.plot(episodes, reward_list, 'b') pylab.savefig("./save_graph/breakout_dqn.png") # Store other info in replay memory replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. # current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)) current_Q_values = Q(obs_batch).gather( 1, act_batch.unsqueeze(1) ).squeeze( ) # squeeze the [batch_size x 1] Tensor to have a shape of batch_size # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = rew_batch + (gamma * next_Q_values) # # Compute Bellman error # bellman_error = target_Q_values - current_Q_values # # clip the bellman error between [-1 , 1] # clipped_bellman_error = bellman_error.clamp(-1, 1) # # Note: clipped_bellman_delta * -1 will be right gradient # d_error = clipped_bellman_error * -1.0 loss = F.smooth_l1_loss(current_Q_values, target_Q_values) # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass # current_Q_values.backward(d_error.data.unsqueeze(1)) loss.backward() # Clip the gradients to lie between -1 and +1 for params in Q.parameters(): params.grad.data.clamp_(-1, 1) # Perfom the update optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ### 4. 
Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() reward_list.append(mean_episode_reward) episodes.append(episode_rewards) # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        statistics_file_name=run.statistics_file_name,
    )


if __name__ == '__main__':
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed)

    # do not record videos:
    get_wrapper_by_name(env, "Monitor").video_callable = lambda episode_id: False

    main(env, task.max_timesteps)
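# The call above passes an optimizer_spec built elsewhere in main.py. Its
# definition is not shown; judging from how it is consumed
# (optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) and
# optimizer_spec.lr_schedule.value(t)), a plausible sketch is a namedtuple
# like the one below. The RMSprop hyperparameters are illustrative only.
from collections import namedtuple
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
    lr_schedule=None,  # the real code passes a Schedule object here
)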
def dqn_learing(env, q_func, optimizer_spec, exploration=LinearSchedule(1000000, 0.1), stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env, t) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network grad_norm_clipping: float or None If not None gradients' norms are clipped to this value. """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. 
don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function # optimizer_func = construct_optimizer_func(Q, optimizer_spec) optimizer = torch.optim.Adam(Q.parameters()) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition # Store lastest observation in replay memory and last_idx can be used to store action, reward, done last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # recent_observations: shape(img_h, img_w, frame_history_len) are input to to the model recent_observations = replay_buffer.encode_recent_observation( ).transpose(2, 0, 1) # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] else: action = random.randrange(num_actions) # Advance one step obs, reward, done, _ = env.step(action) replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) # Convert numpy nd_array to torch variables for calculation obs_batch = Variable( torch.from_numpy(obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch.transpose( 0, 3, 1, 2)).type(dtype) / 255.0) done_mask = torch.from_numpy(done_mask) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() done_mask = done_mask.cuda() # Compute current Q value, q_func takes only state and output value for every state-action pair # We choose Q based on action taken. 
            # Compute the current Q values; squeeze from [batch_size x 1] to [batch_size]
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze(1)
            # Compute next Q value, based on which action gives max Q values.
            # Detach from the current graph since we don't want gradients of the next Q to propagate,
            # and mask out terminal transitions so only the reward contributes to their target.
            not_done_mask = Variable((1 - done_mask).type(dtype))
            next_max_Q_values = target_Q(next_obs_batch).detach().max(1)[0] * not_done_mask
            # Compute Bellman error, use Huber loss to mitigate outlier impact
            target_Q_values = rew_batch + (gamma * next_max_Q_values)
            bellman_error = F.smooth_l1_loss(current_Q_values, target_Q_values)
            # Clear previous gradients (the optimizer was constructed once, above)
            optimizer.zero_grad()
            # run backward pass and clip the gradient
            bellman_error.backward()
            nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()
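# This variant defaults to exploration=LinearSchedule(1000000, 0.1).
# LinearSchedule is defined in the course's schedule utilities; a minimal
# sketch consistent with the exploration.value(t) calls above (epsilon decays
# linearly from initial_p to final_p, then stays constant) would be:
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. LinearSchedule(1000000, 0.1).value(500000) -> 0.55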
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, statistics_file_name="statistics.pkl"): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network statistics_file_name: str Where to store the statistics file """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete print("STATISTICS_FILE_NAME: {}".format(statistics_file_name)) ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type( torch_types.FloatTensor).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].cpu() else: return random.randrange(num_actions) # Initialize target q function and q function, i.e. build the model. ###### # YOUR CODE HERE policy_net = q_func(input_arg, num_actions).to(device).type( torch_types.FloatTensor) # Q target_net = q_func(input_arg, num_actions).to(device).type( torch_types.FloatTensor) # Q target target_net.load_state_dict( policy_net.state_dict()) # copies the state of policy Q into target ###### # Construct policy_net network optimizer function optimizer = optimizer_spec.constructor(policy_net.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1. 
Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### # YOUR CODE HERE stored_frame_idx = replay_buffer.store_frame(last_obs) last_obs_encoded = replay_buffer.encode_recent_observation() action = select_epilson_greedy_action(policy_net, last_obs_encoded, t) obs, reward, done, info = env.step(action) replay_buffer.store_effect(stored_frame_idx, action, reward, done) if done: obs = env.reset() last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. 
# Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE sample = replay_buffer.sample(batch_size) obs_batch, actions_batch, rewards_batch, next_obs_batch, done_mask = sample # convert batches to pytorch tensors: obs_batch = torch.from_numpy(obs_batch).to(device).type( torch_types.FloatTensor) / 255.0 next_obs_batch = torch.from_numpy(next_obs_batch).to(device).type( torch_types.FloatTensor) / 255.0 actions_batch = torch.from_numpy(actions_batch).to(device).type( torch_types.LongTensor) rewards_batch = torch.from_numpy(rewards_batch).to(device).type( torch_types.FloatTensor) non_final_mask = 1 - torch.from_numpy(done_mask).to(device).type( torch_types.FloatTensor) # inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html: # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = policy_net(obs_batch).gather( 1, actions_batch.unsqueeze(1)).squeeze(1) # Compute V(s_{t+1}) for all next states. next_state_values = target_net(next_obs_batch).max( 1)[0].detach() * non_final_mask # Compute the expected Q values expected_state_action_values = (next_state_values * gamma) + rewards_batch # Compute loss d_error = state_action_values - expected_state_action_values # = -bellman_error d_error.clamp_(-1, 1) # Optimize the model optimizer.zero_grad() state_action_values.backward(d_error) optimizer.step() num_param_updates += 1 # Periodically update target network: if num_param_updates % target_update_freq == 0: target_net.load_state_dict(policy_net.state_dict()) ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t >= learning_starts: print("Timestep %d" % (t, )) print(" mean reward (100 episodes) %f" % mean_episode_reward) print(" best mean reward %f" % best_mean_episode_reward) print(" episodes %d" % len(episode_rewards)) print(" exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open(statistics_file_name, 'wb') as f: pickle.dump(Statistic, f)
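# The target computation above is the standard DQN backup
#     y = r + gamma * (1 - done) * max_a' Q_target(s', a').
# Pulled out as a tiny self-contained helper (names are illustrative), the
# masking of terminal transitions is easy to check in isolation:
import torch

def dqn_targets(rewards, next_q_max, done_mask, gamma=0.99):
    not_done = 1.0 - done_mask
    return rewards + gamma * not_done * next_q_max

# a terminal transition (done = 1) contributes only its reward
print(dqn_targets(torch.tensor([1.0, 0.5]),
                  torch.tensor([2.0, 3.0]),
                  torch.tensor([0.0, 1.0])))   # tensor([2.9800, 0.5000])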
def train(self): num_param_updates = 0 loss_acc_since_last_log = 0.0 param_updates_since_last_log = 0 num_episodes = 0 state = self.env.reset()[..., np.newaxis] for t in tqdm(range(self.total_timesteps)): last_idx = self.memory.store_frame(state) recent_observations = self.memory.encode_recent_observation() # Choose random action if learning hasn't started yet if t > self.learning_start: action = self.select_epsilon_greedy_action( recent_observations, t).item() else: action = random.randrange(self.num_actions) # Advance a step next_state, reward, done, _ = self.env.step(action) next_state = next_state[..., np.newaxis] # Store result in memory self.memory.store_effect(last_idx, action, reward, done) # Reset if done (life lost, due to atari wrapper) if done: next_state = self.env.reset() next_state = next_state[..., np.newaxis] state = next_state # Train network using experience replay when # memory is sufficiently large. if (t > self.learning_start and t % self.learning_freq == 0 and self.memory.can_sample(self.batch_size)): # Sample from replay buffer ( state_batch, act_batch, r_batch, next_state_batch, done_mask, ) = self.memory.sample(self.batch_size) state_batch = torch.from_numpy(state_batch).type( self.dtype) / 255.0 act_batch = torch.from_numpy(act_batch).long().to(self.device) r_batch = torch.from_numpy(r_batch).to(self.device) next_state_batch = ( torch.from_numpy(next_state_batch).type(self.dtype) / 255.0) not_done_mask = torch.from_numpy(1 - done_mask).type( self.dtype) # Calculate current Q value current_Q_vals = self.Q(state_batch).gather( 1, act_batch.unsqueeze(1)) # Calculate next Q value based on action that gives max Q vals next_max_Q = self.target_Q(next_state_batch).detach().max( dim=1)[0] next_Q_vals = not_done_mask * next_max_Q # Calculate target of current Q values target_Q_vals = r_batch + (self.gamma * next_Q_vals) # Calculate loss and backprop loss = F.smooth_l1_loss(current_Q_vals.squeeze(), target_Q_vals) self.optimizer.zero_grad() loss.backward() for param in self.Q.parameters(): param.grad.data.clamp_(-1, 1) # Update weights self.optimizer.step() num_param_updates += 1 # Store stats loss_acc_since_last_log += loss.item() param_updates_since_last_log += 1 # Update target network periodically if num_param_updates % self.target_update_freq == 0: self.target_Q.load_state_dict(self.Q.state_dict()) # Save model checkpoint if num_param_updates % self.checkpoint_frequency == 0: save_model_checkpoint( self.Q, self.optimizer, t, f"{self.out_dir}/checkpoints/{self.model_name}_{num_param_updates}", ) # Log progress if (num_param_updates % (self.log_freq // 2) == 0 and param_updates_since_last_log > 0): self.writer.add_scalar( "Mean Loss per Update (Updates)", loss_acc_since_last_log / param_updates_since_last_log, num_param_updates, ) loss_acc_since_last_log = 0.0 param_updates_since_last_log = 0 if num_param_updates % self.log_freq == 0: wrapper = get_wrapper_by_name(self.env, "Monitor") episode_rewards = wrapper.get_episode_rewards() mean_reward = round(np.mean(episode_rewards[-101:-1]), 2) sum_reward = np.sum(episode_rewards[-101:-1]) episode_lengths = wrapper.get_episode_lengths() mean_duration = round(np.mean(episode_lengths[-101:-1]), 2) sum_duration = np.sum(episode_lengths[-101:-1]) self.writer.add_scalar( f"Mean Reward (epoch = {self.log_freq} updates)", mean_reward, num_param_updates // self.log_freq, ) self.writer.add_scalar( f"Mean Duration (epoch = {self.log_freq} updates)", mean_duration, num_param_updates // self.log_freq, ) self.writer.add_scalar( f"Mean 
Reward per Timestep (epoch = {self.log_freq} updates)", round(sum_reward / sum_duration, 2), num_param_updates // self.log_freq, ) if done: num_episodes += 1 # Save model save_model(self.Q, f"{self.out_dir}/{self.model_name}.model") self.env.close() print(f"Number of Episodes: {num_episodes}") return self.Q
def should_stop(self):
    return (get_wrapper_by_name(self.env, "Monitor").get_total_steps()
            >= self.max_steps)
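# The trainer above calls save_model_checkpoint and save_model, whose
# definitions are not shown. Plausible minimal bodies using the standard
# torch.save / state_dict pattern (the real helpers may store more metadata):
import torch

def save_model_checkpoint(model, optimizer, step, path):
    torch.save({
        "step": step,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }, path)

def save_model(model, path):
    torch.save(model.state_dict(), path)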
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): print("running new version") """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. 
""" ---------------------------- OUR CODE ---------------------------- """ Q = q_func(input_arg, num_actions) # The parameters are random Qtag = q_func(input_arg, num_actions) if (USE_CUDA): Q.cuda() Qtag.cuda() Qtag.load_state_dict(Q.state_dict()) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) """ ------------------------------------------------------------------ """ ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() reward = None done = None info = None LOG_EVERY_N_STEPS = 10000 startTime = time.time() for t in count(): """ Tsuf: ---- Stuff for debigging times for various places --- """ T1 = 0 t1Tmp = 0 T2 = 0 t2Tmp = 0 T3 = 0 t3Tmp = 0 T4 = 0 t4Tmp = 0 T5 = 0 t5Tmp = 0 T6 = 0 t6Tmp = 0 T7 = 0 t7Tmp = 0 T8 = 0 t8Tmp = 0 """ ----------------------------------------------------------- """ ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break #if (t>1000000): # break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) 
""" -------------------------- OUR CODE -------------------------- """ #store last_obs, and get latest obs's as the input for the n.n t1Tmp = time.time() cur_idx = replay_buffer.store_frame(last_obs) next_input = replay_buffer.encode_recent_observation() T1 += time.time() - t1Tmp #take random action or use the net t2Tmp = time.time() action = select_epilson_greedy_action( Q, next_input, t) #the returned action is on the CPU T2 += time.time() - t2Tmp #see what happens after we take that action t3Tmp = time.time() last_obs, reward, done, info = env.step( action) #the returned parameters are on the CPU T3 += time.time() - t3Tmp # print(t) # env.render() #store the results on the replay buffer replay_buffer.store_effect(cur_idx, action, reward, done) #on the CPU #if the simulation is done, reset the environment if (done): last_obs = env.reset() """ -------------------------------------------------------------- """ # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. 
# you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) """ ------------------------ OUR CODE ------------------------ """ #sample a batch of history samples t4Tmp = time.time() obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) #on CPU obs_batch = torch.from_numpy(obs_batch).type( dtype) / 255.0 # When available, move the samples batch to GPU next_obs_batch = torch.from_numpy(next_obs_batch).type( dtype) / 255.0 #GPU T4 += time.time() - t4Tmp #see which Q values the current network gives, for all obs's t5Tmp = time.time() inter_Qs = Q( Variable(obs_batch)) #input is on GPU, output is on GPU inter_Qs_chosen = Variable( torch.zeros(batch_size).type(dtype)) #GPU #take the action that was chosen before for i in range(batch_size): inter_Qs_chosen[i] = inter_Qs[i, act_batch[i]] #take only the intermediate (non-terminal) obs's inter_idx = np.where(done_mask == False)[0] #CPU inter_next_obs_batch = next_obs_batch[inter_idx, :, :, :] T5 += time.time() - t5Tmp #see what the "target" (backuped) network says for the intermediate ones t6Tmp = time.time() inter_next_Qs = Qtag( Variable(inter_next_obs_batch, volatile=True)).data.max(1)[0] #All on GPU T6 += time.time() - t6Tmp #calculate the bellman errors t7Tmp = time.time() #for final obs's, the target is just the reward targets = torch.from_numpy(rew_batch).type( dtype) #Moved rew_batch to GPU (as 'targets') for (i, idx) in enumerate(inter_idx): targets[idx] += gamma * inter_next_Qs[i] #The bellman item # errors = -(inter_Qs_chosen.data - targets)**2 #EQUATION COULD BE WRONG!! [on GPU] # for i in range(len(errors)): # if errors[i]<-1: # errors[i] = -1 # elif errors[i]>1: # errors[i] = 1 errors = inter_Qs_chosen.data - targets errors.clamp(-1, 1) T7 += time.time() - t7Tmp #train the network! (: t8Tmp = time.time() optimizer.zero_grad() inter_Qs_chosen.backward( errors) #COULD BE WRONG WAY!! [Everything is on GPU (: ] optimizer.step() T8 += time.time() - t8Tmp num_param_updates += 1 if (num_param_updates % target_update_freq == 0): Qtag.load_state_dict(Q.state_dict()) """ ---------------------------------------------------------- """ ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) Statistic["running_times"].append(int(time.time() - startTime)) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: if (PRINT_TIMES): print("-----------------------") print(T1) print(T2) print(T3) print(T4) print(T5) print(T6) print(T7) print(T8) print("-----------------------") print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
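# Note on the error clipping above: torch.Tensor.clamp is out-of-place and
# returns a new tensor, so `errors.clamp(-1, 1)` on its own leaves `errors`
# unclipped. A quick demonstration, plus the two working alternatives:
import torch

errors = torch.tensor([-3.0, 0.2, 5.0])
errors.clamp(-1, 1)            # result discarded; errors is unchanged
print(errors)                  # tensor([-3.0000,  0.2000,  5.0000])

errors = errors.clamp(-1, 1)   # reassign the returned tensor ...
# errors.clamp_(-1, 1)         # ... or clamp in place
print(errors)                  # tensor([-1.0000,  0.2000,  1.0000])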
def dqn_learing(
        env,
        q_func,
        optimizer_spec,
        exploration,
        feature_tested,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    feature_tested: str
        Tag appended to the checkpoint and statistics file names for this run.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    # added bool_flag for alternating ("double") saves of the statistics file
    bool_flag = False
    STATS_FILE_NAME = 'statistics ' + feature_tested + '.pkl'

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():  # inference only, no need to save the history
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######
    # YOUR CODE HERE
    if USE_CUDA:
        Q = q_func(num_actions=num_actions).cuda()
        Q_target = q_func(num_actions=num_actions).cuda()
    else:
        Q = q_func(num_actions=num_actions)
        Q_target = q_func(num_actions=num_actions)
    Q_target.load_state_dict(Q.state_dict())

    # Check for & load a pretrained model
    if os.path.isfile('Q_params' + feature_tested + '.pkl'):
        print('Load Q parameters ...')
        Q.load_state_dict(torch.load('Q_params' + feature_tested + '.pkl'))
    if os.path.isfile('target_Q_params' + feature_tested + '.pkl'):
        print('Load target Q parameters ...')
        Q_target.load_state_dict(
            torch.load('target_Q_params' + feature_tested + '.pkl'))
    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    Statistic = {
        "starting_Q_values": [],
        "mean_episode_rewards": [],
        "best_mean_episode_rewards": []
    }

    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')

    # Load previous statistics (resume an interrupted run)
    start = 0
    if os.path.isfile(STATS_FILE_NAME):
        with open(STATS_FILE_NAME, 'rb') as f:
            Statistic = pickle.load(f)
            mean_episode_reward = Statistic["mean_episode_rewards"][-1]
            best_mean_episode_reward = Statistic["best_mean_episode_rewards"][-1]
            start = len(Statistic["mean_episode_rewards"])
            print('Load %s ...' % STATS_FILE_NAME)

    # `done` is read before the first env.step() below, so initialize it here
    done = False

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 50000  # 10000

    for t in count(start):
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            return Statistic
        # the provided stopping_criterion was unreliable in our setup, so also
        # hard-cap the number of environment steps:
        if t > 4500000:
            return Statistic

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        #     this steps the environment forward one step
        #     obs = env.reset()
        #     this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        # YOUR CODE HERE
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        action = select_epilson_greedy_action(Q, encoded_obs, t)

        # if a new game just started, log the Q-value of its starting state
        ######
        # `done` still holds the value from the previous step (it is initialized
        # to False before the loop), so this fires once per new episode.
        if t > learning_starts and done:
            # a fairly expensive statistic, so it is only computed once per episode
            with torch.no_grad():
                obs = torch.from_numpy(encoded_obs).type(dtype).unsqueeze(0) / 255.0
                item = torch.max(Q(Variable(obs))).item()
                Statistic["starting_Q_values"].append(item)
        ######

        # this steps the environment forward one step
        last_obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if done:
            last_obs = env.reset()
        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            # mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q values and d_error is the clipped Bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            # optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            # target_Q network. see state_dict() and load_state_dict() methods.
            # you should update every target_update_freq steps, and you may find the
            # variable num_param_updates useful for this (it was initialized to 0)
            #####
            # YOUR CODE HERE
            # 3.a sample a batch of transitions
            sample = replay_buffer.sample(batch_size)
            obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = sample
            # move variables to GPU if available
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype)) / 255.0
            action_batch = Variable(
                torch.from_numpy(action_batch).type(dtype).long().view(-1, 1))
            reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype)) / 255.0
            done_mask = Variable(torch.from_numpy(done_mask).type(dtype))

            # 3.b compute the Bellman error
            # evaluate the current and next Q-values
            state_action_values = Q(obs_batch).gather(1, action_batch)
            next_state_values = Q_target(next_obs_batch).detach()
            # mask out post-terminal Q-values
            masked_next_state_values = next_state_values.max(1)[0] * (1 - done_mask)
            # construct the corresponding error
            expected_state_action_values = (masked_next_state_values * gamma) + reward_batch
            bellman_error = expected_state_action_values.unsqueeze(1) - state_action_values
            # clip the error between [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)

            optimizer.zero_grad()
            # multiply by -1 (since pytorch minimizes): the injected gradient is
            # clip(Q(s, a) - target, -1, 1), i.e. the derivative of the clipped squared error
            state_action_values.backward(-clipped_bellman_error)
            # 3.c: train the model
            optimizer.step()

            # 3.d periodically update the target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                Q_target.load_state_dict(Q.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Save the trained model
            torch.save(Q.state_dict(), 'Q_params' + feature_tested + '.pkl')
            torch.save(Q_target.state_dict(),
                       'target_Q_params' + feature_tested + '.pkl')

            # Dump statistics to pickle, alternating between two files ("double save")
            # so an interrupted write never corrupts the only copy
            if bool_flag:
                bool_flag = False
                with open(STATS_FILE_NAME, 'wb') as f:
                    pickle.dump(Statistic, f)
                print("Saved to %s" % STATS_FILE_NAME)
            else:
                bool_flag = True
                with open('copy_' + STATS_FILE_NAME, 'wb') as f:
                    pickle.dump(Statistic, f)
                print("Saved to %s" % ('copy_' + STATS_FILE_NAME))

            # Plot the Q-value of episode starting states
            plt.clf()
            plt.xlabel('Num of Games')
            plt.ylabel('Q-values on starting state')
            plt.plot(range(len(Statistic["starting_Q_values"])),
                     Statistic["starting_Q_values"],
                     label='Q-values')
            plt.legend()
            plt.title(feature_tested)
            plt.savefig('Q-value-Performance' + feature_tested + '.png')

            # Plot the mean / best-mean episode rewards
            plt.clf()
            plt.xlabel('Timesteps')
            plt.ylabel('Mean Reward (past 100 episodes)')
            num_items = len(Statistic["mean_episode_rewards"])
            plt.plot(range(num_items),
                     Statistic["mean_episode_rewards"],
                     label='mean reward')
            plt.plot(range(num_items),
                     Statistic["best_mean_episode_rewards"],
                     label='best mean rewards')
            plt.legend()
            plt.title(feature_tested)
            plt.savefig('DeepQ-Performance' + feature_tested + '.png')
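

# --- Aside (not part of the original solutions): a small, hypothetical helper for inspecting the
# statistics pickles written above offline, e.g. to compare several feature_tested runs. It only
# assumes the file-naming convention and dict keys used in the function above; the helper name and
# its arguments are made up for illustration.
import pickle

import matplotlib.pyplot as plt


def plot_saved_statistics(feature_tested, out_png=None):
    """Reload a saved Statistic dict and redraw the mean-reward curves."""
    with open('statistics ' + feature_tested + '.pkl', 'rb') as f:
        stats = pickle.load(f)
    num_items = len(stats["mean_episode_rewards"])
    plt.figure()
    plt.plot(range(num_items), stats["mean_episode_rewards"], label='mean reward')
    plt.plot(range(num_items), stats["best_mean_episode_rewards"], label='best mean reward')
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    plt.legend()
    plt.title(feature_tested)
    if out_png is not None:
        plt.savefig(out_png)
    return stats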
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    # record the hyperparameters of this run alongside the statistics
    Statistic['parameters'] = {
        'replay_buffer_size': replay_buffer_size,
        'batch_size': batch_size,
        'gamma': gamma,
        'frame_history_len': frame_history_len,
        'learning_starts': learning_starts,
        'learning_freq': learning_freq,
        'target_update_freq': target_update_freq,
        'name': env.env.unwrapped.spec.id
    }

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():  # inference only, no need to save the history
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)
    if USE_CUDA:
        Q = Q.cuda()
        target_Q = target_Q.cuda()
    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    filename = 'statistics.pkl'

    # Google Drive: when running inside Colab, authenticate so the statistics
    # pickle can be uploaded periodically
    try:
        import google.colab
        IN_COLAB = True
    except ImportError:
        IN_COLAB = False
    if IN_COLAB:
        run_in_colab_message()
        try:
            from google.colab import auth
            import logging
            from pydrive.auth import GoogleAuth
            from pydrive.drive import GoogleDrive
            from oauth2client.client import GoogleCredentials
            logging.getLogger('googleapiclient.discovery_cache').setLevel(
                logging.ERROR)
            auth.authenticate_user()
            gauth = GoogleAuth()
            gauth.credentials = GoogleCredentials.get_application_default()
            drive = GoogleDrive(gauth)
        except Exception:
            pass

    iter_time = time()
    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        #     this steps the environment forward one step
        #     obs = env.reset()
        #     this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        idx = replay_buffer.store_frame(last_obs)
        enc_obs = replay_buffer.encode_recent_observation()
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, enc_obs, t)
        else:
            # before learning starts, just act randomly
            action = torch.IntTensor([[random.randrange(num_actions)]])
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
        replay_buffer.store_effect(idx, action, reward, done)
        last_obs = obs
        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            # mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q values and d_error is the clipped Bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            # optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            # target_Q network. see state_dict() and load_state_dict() methods.
            # you should update every target_update_freq steps, and you may find the
            # variable num_param_updates useful for this (it was initialized to 0)
            #####
            # 3.a sample a batch of transitions and move it to the GPU if available
            # (no requires_grad is needed on the inputs; gradients flow through the network parameters)
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.)
            act_batch = Variable(torch.from_numpy(act_batch).type(torch.int64))
            rew_batch = Variable(torch.from_numpy(rew_batch).type(dtype))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.)
            done_mask = Variable(torch.from_numpy(done_mask).type(torch.int64))
            if USE_CUDA:
                obs_batch = obs_batch.cuda()
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                next_obs_batch = next_obs_batch.cuda()
                done_mask = done_mask.cuda()

            # Q network: Q-values of the actions that were actually taken
            val = Q(obs_batch).gather(dim=1, index=act_batch.unsqueeze(1))
            # Q target network: bootstrapped targets, with post-terminal values masked out
            with torch.no_grad():
                tar_val_t = target_Q(next_obs_batch).max(1)[0]
                tar_val = rew_batch + gamma * (1 - done_mask.type(dtype)) * tar_val_t

            # 3.b error calculation: the injected gradient is clip(Q - target, -1, 1)
            d_error = (tar_val - val.squeeze()).clamp_(-1, 1) * -1.

            # 3.c train the Q network
            optimizer.zero_grad()
            val.backward(d_error.data.unsqueeze(1))
            optimizer.step()

            # 3.d update the target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print(f"Iteration time: {time() - iter_time:.2f}")
            iter_time = time()
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle (in Colab, keep one file per log step)
            filename = (f"{t}" + 'statistics.pkl') if IN_COLAB else 'statistics.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % filename)

            if IN_COLAB and t % (LOG_EVERY_N_STEPS * 10) == 0:
                try:
                    stat_pkl = drive.CreateFile()
                    stat_pkl.SetContentFile(filename)
                    stat_pkl.Upload()
                    print("Uploaded to drive")
                except Exception:
                    print("Exception during upload to drive")
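

# --- Aside (not part of the original solutions): a hypothetical sketch of how the pickles uploaded
# above could be pulled back from Google Drive with the same pydrive objects, e.g. to resume the
# analysis outside Colab. It assumes an authenticated GoogleDrive instance constructed exactly as in
# the Colab branch above; the function name and the Drive query string are illustrative only.
def download_latest_statistics(drive, local_path='statistics.pkl'):
    """Download the most recently modified '*statistics.pkl' file from Drive."""
    query = {'q': "title contains 'statistics.pkl' and trashed=false"}
    candidates = drive.ListFile(query).GetList()
    if not candidates:
        return None
    # Drive API v2 reports 'modifiedDate' as an ISO-8601 string, so the lexicographic
    # maximum is also the most recently modified file.
    newest = max(candidates, key=lambda f: f['modifiedDate'])
    newest.GetContentFile(local_path)
    return local_path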