def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Run the model in inference mode only, i.e. don't save the history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the target Q function and the Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used
        # to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        # Clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # updated and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of
            # an episode, in which case there is no Q-value at the next state; at
            # the end of an episode, only the current state's reward contributes
            # to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values: q_func takes only the state and
            # outputs a value for every state-action pair.
            # We select the Q value for the action actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))  # shape [batch_size, 1]
            # Compute the next Q value from the action that gives the max Q value.
            # Detach from the current graph since we don't want gradients for the
            # next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute the Bellman error; unsqueeze the targets to [batch_size, 1]
            # so the shapes match and no accidental broadcasting occurs
            bellman_error = target_Q_values.unsqueeze(1) - current_Q_values
            # Clip the Bellman error to [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the correct upstream gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before the backward pass
            optimizer.zero_grad()
            # Run the backward pass
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
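
# The loop above backpropagates the clipped, negated Bellman error directly
# through current_Q_values.backward(d_error.data) instead of calling
# .backward() on a scalar loss. A minimal self-contained sketch of why that
# works (illustrative tensors only, not part of the training code): passing a
# gradient tensor to backward() treats it as dLoss/dQ, and the clamped error
# is exactly the gradient of a Huber-style loss.
import torch

q = torch.randn(4, requires_grad=True)
target = torch.randn(4)
d_error = (target - q).clamp(-1, 1) * -1.0  # negated clipped Bellman error
q.backward(d_error.detach())                # inject dLoss/dq directly
# The injected gradient equals the Huber-loss gradient clamp(q - target, -1, 1).
assert torch.allclose(q.grad, (q - target).detach().clamp(-1, 1))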
def dqn_learing(
        env,
        q_func,
        optimizer_spec,
        exploration,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        num_actions1=31,
        num_actions2=27):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as the learning rate
        schedule for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    num_actions1: int
        Size of the first discrete action head.
    num_actions2: int
        Size of the second discrete action head.
    """
    ###############
    # BUILD MODEL #
    ###############
    img_h, img_w, img_c = 32, 120, 1
    input_arg = frame_history_len * img_c

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0)
            # Inference only; the model returns one head per action dimension
            out1, out2 = model(Variable(obs))
            out1 = out1.max(1)[1].data.cpu().numpy()[0]
            out2 = out2.max(1)[1].data.cpu().numpy()[0]
            return out1, out2
        else:
            return random.randrange(num_actions1), random.randrange(num_actions2)

    # Initialize the target Q function and the Q function
    Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype)
    target_Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    epoch_reward = []

    for t in count():
        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used
        # to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            action1, action2 = select_epilson_greedy_action(Q, recent_observations, t)
        else:
            action1, action2 = random.randrange(num_actions1), random.randrange(num_actions2)
        # Advance one step
        obs, reward, done = env.step(action1, action2)
        epoch_reward.append(reward)
        if done:
            env.render()
        # clip rewards between -1 and 1
        # reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action1, action2, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
            print(np.mean(epoch_reward))
            epoch_reward = []
            torch.save(Q, '../../weights/Q' + str(num_actions1) + '.pt')
            torch.save(target_Q, '../../weights/target_Q' + str(num_actions1) + '.pt')
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # updated and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of
            # an episode, in which case there is no Q-value at the next state; at
            # the end of an episode, only the current state's reward contributes
            # to the target
            obs_batch, act1_batch, act2_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype))
            act1_batch = Variable(torch.from_numpy(act1_batch).long())
            act2_batch = Variable(torch.from_numpy(act2_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype))
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act1_batch = act1_batch.cuda()
                act2_batch = act2_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values: q_func takes only the state and
            # outputs a value for every state-action pair.
            # We select the Q values for the actions actually taken.
            q1, q2 = Q(obs_batch)
            current_Q1_values = q1.gather(1, act1_batch.unsqueeze(1))
            current_Q2_values = q2.gather(1, act2_batch.unsqueeze(1))
            # Compute the next Q values from the actions that give the max Q values.
            # Detach from the current graph since we don't want gradients for the
            # next Q to propagate
            tq1, tq2 = target_Q(next_obs_batch)
            next_max_q1 = tq1.detach().max(1)[0]
            next_max_q2 = tq2.detach().max(1)[0]
            next_Q1_values = not_done_mask * next_max_q1
            next_Q2_values = not_done_mask * next_max_q2
            # Compute the targets of the current Q values
            target_Q1_values = rew_batch + (gamma * next_Q1_values)
            target_Q2_values = rew_batch + (gamma * next_Q2_values)
            # Compute the Bellman error for each head
            bellman_error1 = target_Q1_values.unsqueeze(1) - current_Q1_values
            bellman_error2 = target_Q2_values.unsqueeze(1) - current_Q2_values
            bellman_error = bellman_error1 + bellman_error2
            # Clip the Bellman error to [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the correct upstream gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before the backward pass
            optimizer.zero_grad()
            # Run the backward pass through the summed head outputs
            current_Q_values = current_Q1_values + current_Q2_values
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
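
# The variant above assumes q_func(num_actions1, num_actions2) builds a
# network returning a pair (q1, q2) of per-action value heads. A hypothetical
# module with that contract (layer sizes are illustrative, not taken from
# this file):
import torch.nn as nn
import torch.nn.functional as F

class TwoHeadDQN(nn.Module):
    def __init__(self, num_actions1, num_actions2, in_channels=4):
        super(TwoHeadDQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4)),  # fixed-size features for any input
        )
        self.fc = nn.Linear(64 * 4 * 4, 512)
        self.head1 = nn.Linear(512, num_actions1)  # first action dimension
        self.head2 = nn.Linear(512, num_actions2)  # second action dimension

    def forward(self, x):
        x = self.conv(x).view(x.size(0), -1)
        x = F.relu(self.fc(x))
        return self.head1(x), self.head2(x)  # (q1, q2), as the loop expects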
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as the learning rate
        schedule for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Run the model in inference mode only, i.e. don't save the history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the target Q function and the Q function, i.e. build the model.
    ######
    # YOUR CODE HERE
    Q = q_func(input_arg, num_actions)
    Q_target = q_func(input_arg, num_actions)
    ######

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition. Specifically, last_obs must point to the new latest
        # observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        #         this steps the environment forward one step
        #     obs = env.reset()
        #         this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, encoded_obs, t)
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if done:
            last_obs = env.reset()
        else:
            last_obs = obs
        #####
        # At this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            #      replay buffer code for the function definition; each batch that
            #      you sample should consist of current observations, current
            #      actions, rewards, next observations, and done indicators).
            #      Note: move the variables to the GPU if available.
            # 3.b: fill in your own code to compute the Bellman error. This
            #      requires evaluating the current and next Q-values and
            #      constructing the corresponding error.
            #      Note: don't forget to clip the error to [-1, 1], multiply it
            #      by -1 (since pytorch minimizes) and mask out post-terminal
            #      Q-values (see the ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you
            #      calculated previously. Pytorch will differentiate this error
            #      for you; to backward the error use the following API:
            #          current.backward(d_error.data.unsqueeze(1))
            #      where "current" is the variable holding the current Q values
            #      and d_error is the clipped Bellman error.
            #      Your code should produce one scalar-valued tensor.
            #      Note: don't forget to call optimizer.zero_grad() before the
            #      backward call and optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current
            #      Q network weights into the target_Q network; see the
            #      state_dict() and load_state_dict() methods. You should update
            #      every target_update_freq steps, and you may find the variable
            #      num_param_updates useful for this (it was initialized to 0).
            #####
            # YOUR CODE HERE
            #
            # Alpha (the learning rate) from the Q-function update isn't present
            # in our code -- it's in OptimizerSpec in main.
            # Move to GPU if possible.
            # done flag in loop ---- SKIPPED IF DONE IS TRUE
            # Clipping the error between -1 and 1 -- OK
            # backward the error meaning?
            # Suggestion for changing parameters: change the exploration
            # schedule (in main).
            #
            # Q.cuda()
            obs_batch, act_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size=batch_size)
            states = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            actions = Variable(torch.from_numpy(act_batch).long())
            rewards = Variable(torch.from_numpy(reward_batch).float())
            next_states = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_dones = Variable(torch.from_numpy(1 - done_mask).type(dtype))

            if USE_CUDA:
                states = states.cuda()
                actions = actions.cuda()
                rewards = rewards.cuda()
                next_states = next_states.cuda()

            Q.train()
            Q_target.eval()
            predicted_rewards = Q(states).gather(1, actions.unsqueeze(1))  # Q(s, a)
            next_max_Q = Q_target(next_states).detach().max(1)[0]  # .unsqueeze(1)  # max_a' Q_target(s', a')
            next_Q_values = not_dones * next_max_Q
            target_Q_values = rewards + (gamma * next_Q_values)  # r + gamma * max Q_target
            bellman_error = target_Q_values - predicted_rewards.squeeze(1)
            clipped_bellman_error = bellman_error.clamp(-1, 1) * (-1.0)
            optimizer.zero_grad()
            predicted_rewards.backward(clipped_bellman_error.data.unsqueeze(1))
            optimizer.step()
            num_param_updates += 1

            if num_param_updates % target_update_freq == 0:
                Q_target.load_state_dict(Q.state_dict())

            # for obs, act, reward, next_obs, done in zip(obs_batch, act_batch, reward_batch, next_obs_batch, done_mask):
            #     if done == 1.0:
            #         continue
            #     obs = Variable(torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=True)
            #     next_obs = Variable(torch.from_numpy(next_obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=False)
            #     current_Q = Q(obs)
            #     predicted_reward = Variable(current_Q[0][act].unsqueeze(0), requires_grad=True)
            #     target_reward = Q_target(next_obs).data.max(1)[0]
            #     loss = loss_fn(reward + gamma * target_reward, predicted_reward).clamp(-1, 1) * (-1.0)
            #     optimizer.zero_grad()
            #     # should be current.backward(d_error.data.unsqueeze(1))
            #     # but it crashes on misfitting dims
            #     predicted_reward.backward(loss.data.unsqueeze(1))
            #     optimizer.step()
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
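
# Every variant in this file unpacks optimizer_spec.constructor and
# optimizer_spec.kwargs. A minimal sketch of that container (an assumption;
# the real spec may carry extra fields such as a learning-rate schedule):
from collections import namedtuple
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

# Example: RMSprop with DQN-style hyperparameters (illustrative values).
optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
)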
def dqn_learing(
        env,
        q_func,
        checkpoint_path,
        optimizer_spec,
        exploration,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    checkpoint_path: str
        Optional path to a checkpoint to resume from.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as the learning rate
        schedule for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Run the model in inference mode only, i.e. don't save the history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].view(1, 1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the target Q function and the Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # Optionally resume from a checkpoint
    if checkpoint_path:
        if os.path.isfile(checkpoint_path):
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path)
            Q.load_state_dict(checkpoint['model_state_dict'])
            target_Q.load_state_dict(checkpoint['target_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}'".format(checkpoint_path))
        else:
            print("=> no checkpoint found at '{}'".format(checkpoint_path))

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 1000
    episode_reward = 0
    episode_rewards = []

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used
        # to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0][0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        print("reward: %f" % reward)
        # Clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            episode_reward = 0
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # updated and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of
            # an episode, in which case there is no Q-value at the next state; at
            # the end of an episode, only the current state's reward contributes
            # to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values: q_func takes only the state and
            # outputs a value for every state-action pair.
            # We select the Q value for the action actually taken, and squeeze
            # the [batch_size x 1] tensor down to shape [batch_size]
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            # Compute the next Q value from the action that gives the max Q value.
            # Detach from the current graph since we don't want gradients for the
            # next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)

            # # Compute the Bellman error
            # bellman_error = target_Q_values - current_Q_values
            # # Clip the Bellman error to [-1, 1]
            # clipped_bellman_error = bellman_error.clamp(-1, 1)
            # # Note: clipped_bellman_error * -1 is the correct upstream gradient
            # d_error = clipped_bellman_error * -1.0

            # Compute the Huber loss instead. Why not MSE? Huber loss is more
            # robust to noisy Q estimates than plain MSE.
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            # Clear previous gradients before the backward pass
            optimizer.zero_grad()
            # Run the backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            loss.backward()
            # Clip the gradients to lie between -1 and +1
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_reward += reward
        # episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        episode_rewards.append(episode_reward)
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')

        ### 5. Save a checkpoint
        if t % SAVE_EVERY_N_STEPS == 0 and t > learning_starts:
            save_checkpoint({
                'epoch': t + 1,
                'model_state_dict': Q.state_dict(),
                'target_state_dict': target_Q.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, "checkpoints/checkpoint.%d.tar" % t)
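
# save_checkpoint is called above but not defined in this file; a minimal
# torch.save wrapper along these lines would satisfy the call (the dict keys
# are exactly the ones the resume code at the top of this variant loads):
import os
import torch

def save_checkpoint(state, filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    torch.save(state, filename)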
def dqn_learning(
        env,
        method,
        game,
        q_func,
        optimizer_spec,
        exploration,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        double=False,
        dueling=False,
        logdir=None,
        svrl=False,
        me_type=None,
        maskp=None,
        maskstep=None,
        maskscheduler=True):
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############
    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].view(1, 1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_MODEL_EVERY_N_STEPS = 1000000
    mask_scheduler_step = (1 - maskp) / maskstep

    for t in count():
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ################
        # STEP THE ENV #
        ################
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, recent_observations, t)[0][0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs

        ################
        #   TRAINING   #
        ################
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Mask scheduler: anneal the mask probability toward 1
            if maskscheduler:
                maskp = min(maskp + mask_scheduler_step, 1)

            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            target_q_mat = target_Q(next_obs_batch).detach()

            # SV-RL scheme: matrix-estimate the target Q matrix with mask
            # probability maskp
            if svrl:
                target_q_mat = globals()[me_type](target_q_mat, target_q_mat.size(0), target_q_mat.size(1), maskp)

            if not double:
                next_max_q = target_q_mat.max(1)[0]
            else:
                # Double DQN: the online network picks the action, the target
                # network evaluates it
                q_temp = Q(next_obs_batch).detach()
                act_temp = np.argmax(q_temp.cpu(), axis=1)
                next_max_q = torch.sum(torch.from_numpy(np.eye(num_actions)[act_temp]).type(dtype) * target_q_mat.type(dtype), dim=1)

            next_Q_values = not_done_mask * next_max_q.type(dtype)
            target_Q_values = rew_batch + (gamma * next_Q_values)
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

            optimizer.zero_grad()
            loss.backward()
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)
            optimizer.step()
            num_param_updates += 1

            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ################
        # LOG PROGRESS #
        ################
        # Save the model
        if t % SAVE_MODEL_EVERY_N_STEPS == 0:
            if not os.path.exists("models"):
                os.makedirs("models")
            add_str = 'single'
            if double:
                add_str = 'double'
            if dueling:
                add_str = 'dueling'
            model_save_path = 'models/%s_%s_%s.ckpt' % (str(game[:-14]), add_str, method)
            torch.save(Q.state_dict(), model_save_path)

        # Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            logz.log_tabular('Timestep', t)
            logz.log_tabular('MeanReward100Episodes', mean_episode_reward)
            logz.log_tabular('BestMeanReward', best_mean_episode_reward)
            logz.log_tabular('Episodes', len(episode_rewards))
            logz.log_tabular('Exploration', exploration.value(t))
            logz.dump_tabular()
            sys.stdout.flush()
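
# The double-DQN branch above builds next_max_q with a one-hot matrix and a
# sum; torch.gather expresses the same bootstrap more directly. A sketch with
# assumed [batch, num_actions] inputs (not a drop-in for the SV-RL masking
# path, which reshapes the target Q matrix first):
import torch

def double_dqn_bootstrap(q_next_online, q_next_target):
    # Online network selects the greedy action; target network evaluates it.
    best_actions = q_next_online.argmax(dim=1, keepdim=True)
    return q_next_target.gather(1, best_actions).squeeze(1)

# e.g. next_max_q = double_dqn_bootstrap(Q(next_obs_batch).detach(),
#                                        target_Q(next_obs_batch).detach())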
def dqn_learing(
        env,
        q_func,
        optimizer_spec,
        exploration,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as the learning rate
        schedule for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    ###############
    # BUILD MODEL #
    ###############
    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.size

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Run the model in inference mode only, i.e. don't save the history
            return torch.IntTensor([[model(Variable(obs)).data.max(1)[1].cpu()]])
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the target Q function and the Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    writer = SummaryWriter()

    for t in count():
        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used
        # to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # updated and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of
            # an episode, in which case there is no Q-value at the next state; at
            # the end of an episode, only the current state's reward contributes
            # to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values: q_func takes only the state and
            # outputs a value for every state-action pair.
            # We select the Q value for the action actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute the next Q value from the action that gives the max Q value.
            # Detach from the current graph since we don't want gradients for the
            # next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)

            # Huber loss between the predicted and target Q values
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1], len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward, len(episode_rewards))
        if len(episode_rewards) > 100:
            writer.add_scalar('data/DQN/best_mean_score', best_mean_episode_reward, len(episode_rewards))

        # Save logs
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')

    writer.close()
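
# This variant trains with F.smooth_l1_loss while several others in this file
# backpropagate a clipped Bellman error by hand. The two coincide: the Huber
# loss gradient is the error clamped to [-1, 1]. A self-contained check
# (illustrative tensors only):
import torch
import torch.nn.functional as F

pred = torch.randn(8, requires_grad=True)
target = torch.randn(8)
F.smooth_l1_loss(pred, target, reduction='sum').backward()
clipped = (pred.detach() - target).clamp(-1, 1)
assert torch.allclose(pred.grad, clipped)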
def dqn_learing(
        # env,
        q_func,
        optimizer_spec,
        exploration,
        # stopping_criterion=None,
        replay_buffer_size=1000,
        batch_size=32,
        gamma=0.99,
        learning_starts=1,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000):
    # Our own code: grab a frame from the robot's depth camera
    read_image()
    rgb_data = depth_data.reshape(640, 480, 1)
    input_arg = rgb_data  # input for the algorithm
    num_actions = 5
    last_obs = rgb_data

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Run the model in inference mode only, i.e. don't save the history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the target Q function and the Q function
    Q = q_func(1, num_actions).type(dtype)
    target_Q = q_func(1, num_actions).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(1000, 1)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0

    for t in count():
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        control_robot(action + 1)
        rgb_data = depth_data.reshape(640, 480, 1)
        obs = rgb_data

        ## Evaluate the action from the depth image: reward depends on the
        ## minimum valid distance reading
        dis_data = np.array(depth_data)
        dis_data[np.isnan(dis_data)] = 999999999999
        dis_data[dis_data == 0] = 999999999999
        dis = np.min(dis_data)
        print("MIN DISTANCE:" + str(dis) + "-------------")
        reward = 1 if dis < 500 else -1
        print("REWARD:" + str(reward) + "--------------")
        # Clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, False)
        # Reset the environment when reaching an episode boundary.
        # if done:
        #     obs = env.reset()
        last_obs = obs

        if (t > 1 and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            print("Training")
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values: q_func takes only the state and
            # outputs a value for every state-action pair.
            # We select the Q value for the action actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            # Compute the next Q value from the action that gives the max Q value.
            # Detach from the current graph since we don't want gradients for the
            # next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            print("next:", next_Q_values.shape)
            print("current:", current_Q_values.squeeze().shape)
            # Compute the Bellman error
            bellman_error = target_Q_values - current_Q_values
            # Clipping is disabled here; the other variants clamp to [-1, 1]
            clipped_bellman_error = bellman_error  # .clamp(-1, 1)
            # print(clipped_bellman_error)
            # Note: clipped_bellman_error * -1 is the correct upstream gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before the backward pass
            # print(d_error.data)
            optimizer.zero_grad()
            # Run the backward pass
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
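
# Interface of the ReplayBuffer that every training loop in this file relies
# on. This stub is a reference sketch of the assumed dqn_utils API, not the
# actual implementation (the two-head variant above additionally passes two
# actions to store_effect):
class ReplayBufferInterface(object):
    def store_frame(self, frame):
        """Store a single frame; returns the index to pass to store_effect."""

    def encode_recent_observation(self):
        """Stack the last frame_history_len frames into one Q-network input."""

    def store_effect(self, idx, action, reward, done):
        """Attach action/reward/done to the frame stored at idx."""

    def can_sample(self, batch_size):
        """True once at least batch_size complete transitions are stored."""

    def sample(self, batch_size):
        """Return (obs, act, rew, next_obs, done_mask) numpy batches."""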
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run the Deep Q-learning algorithm.

    You can specify your own conv-net using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as the learning rate
        schedule for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        print(env.observation_space.shape)
        img_h, img_w, img_c = env.observation_space.shape
        # input_arg = frame_history_len * img_c
        input_arg = frame_history_len * 1  # frames are converted to grayscale below
    num_actions = env.action_space.n
    print(env.action_space)
    print(f"({input_arg}): ({img_h}X{img_w}X{img_c})")

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Run the model in inference mode only, i.e. don't save the history
            with torch.no_grad():
                values = model(Variable(obs))
            return values.data.max(1)[1].cpu().unsqueeze(dim=1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize the target Q function and the Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    obs = cv.cvtColor(last_obs, cv.COLOR_BGR2GRAY)
    obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2))
    last_obs = obs[..., np.newaxis]

    print("Q model:")
    summary(Q, input_size=(input_arg, last_obs.shape[0], last_obs.shape[1]))
    print("Q-TARGET model:")
    summary(target_Q, input_size=(input_arg, last_obs.shape[0], last_obs.shape[1]))

    LOG_EVERY_N_STEPS = 10000
    rewards = 0.
    out_count = 0

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        # Simple text progress bar: one character per 1e3 steps
        if t % 1e3 == 0:
            if out_count == 0:
                stdout.write("|")
                out_count += 1
            elif out_count % 10 == 0:
                stdout.write(f"{out_count}|")
                out_count += 1
            elif out_count >= 50:
                stdout.write("=> \n")
                out_count = 0
            else:
                stdout.write(".")
                out_count += 1
            stdout.flush()

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used
        # to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            values = select_epilson_greedy_action(Q, recent_observations, t)
            action = values[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        rewards += reward
        # Clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
            print(len(episode_rewards), episode_rewards, rewards)
            rewards = 0.
        # print(obs.shape)
        # cv.imshow('now_color', obs)
        # cv.waitKey(1)
        obs = cv.cvtColor(obs, cv.COLOR_BGR2GRAY)
        obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2))
        obs = obs[..., np.newaxis]
        # cv.imshow('now', obs)
        # cv.waitKey(1)
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # updated and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of
            # an episode, in which case there is no Q-value at the next state; at
            # the end of an episode, only the current state's reward contributes
            # to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values: q_func takes only the state and
            # outputs a value for every state-action pair.
            # We select the Q value for the action actually taken.
            values = Q(obs_batch)
            current_Q_values = values.gather(1, act_batch.unsqueeze(1)).squeeze()
            # Compute the next Q value from the action that gives the max Q value.
            # Detach from the current graph since we don't want gradients for the
            # next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute the Bellman error
            bellman_error = target_Q_values - current_Q_values
            # Clip the Bellman error to [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the correct upstream gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before the backward pass
            optimizer.zero_grad()
            # Run the backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
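
# The grayscale-and-halve preprocessing in the loop above is inlined twice
# (after reset and after each step). A small helper, assuming cv2 is imported
# as cv as in that loop, would keep both call sites consistent:
import numpy as np
import cv2 as cv

def to_gray_half(frame):
    gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    gray = cv.resize(gray, dsize=(gray.shape[1] // 2, gray.shape[0] // 2))
    return gray[..., np.newaxis]  # keep a channel axis for the replay buffer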
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, statistics_file_name="statistics.pkl"): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channels of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of choosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network statistics_file_name: str Where to store the statistics file """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete print("STATISTICS_FILE_NAME: {}".format(statistics_file_name)) ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type( torch_types.FloatTensor).unsqueeze(0) / 255.0 # torch.no_grad() replaces the old volatile=True flag: no history is saved in inference mode with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].cpu().item() # plain int, matching the random branch else: return random.randrange(num_actions) # Initialize target q function and q function, i.e. build the model. ###### # YOUR CODE HERE policy_net = q_func(input_arg, num_actions).to(device).type( torch_types.FloatTensor) # Q target_net = q_func(input_arg, num_actions).to(device).type( torch_types.FloatTensor) # Q target target_net.load_state_dict( policy_net.state_dict()) # copies the state of policy Q into target ###### # Construct policy_net network optimizer function optimizer = optimizer_spec.constructor(policy_net.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in count(): ### 1.
Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### # YOUR CODE HERE stored_frame_idx = replay_buffer.store_frame(last_obs) last_obs_encoded = replay_buffer.encode_recent_observation() action = select_epilson_greedy_action(policy_net, last_obs_encoded, t) obs, reward, done, info = env.step(action) replay_buffer.store_effect(stored_frame_idx, action, reward, done) if done: obs = env.reset() last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. 
# Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE sample = replay_buffer.sample(batch_size) obs_batch, actions_batch, rewards_batch, next_obs_batch, done_mask = sample # convert batches to pytorch tensors: obs_batch = torch.from_numpy(obs_batch).to(device).type( torch_types.FloatTensor) / 255.0 next_obs_batch = torch.from_numpy(next_obs_batch).to(device).type( torch_types.FloatTensor) / 255.0 actions_batch = torch.from_numpy(actions_batch).to(device).type( torch_types.LongTensor) rewards_batch = torch.from_numpy(rewards_batch).to(device).type( torch_types.FloatTensor) non_final_mask = 1 - torch.from_numpy(done_mask).to(device).type( torch_types.FloatTensor) # inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html: # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = policy_net(obs_batch).gather( 1, actions_batch.unsqueeze(1)).squeeze(1) # Compute V(s_{t+1}) for all next states. next_state_values = target_net(next_obs_batch).max( 1)[0].detach() * non_final_mask # Compute the expected Q values expected_state_action_values = (next_state_values * gamma) + rewards_batch # Compute loss d_error = state_action_values - expected_state_action_values # = -bellman_error d_error.clamp_(-1, 1) # Optimize the model optimizer.zero_grad() state_action_values.backward(d_error) optimizer.step() num_param_updates += 1 # Periodically update target network: if num_param_updates % target_update_freq == 0: target_net.load_state_dict(policy_net.state_dict()) ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t >= learning_starts: print("Timestep %d" % (t, )) print(" mean reward (100 episodes) %f" % mean_episode_reward) print(" best mean reward %f" % best_mean_episode_reward) print(" episodes %d" % len(episode_rewards)) print(" exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open(statistics_file_name, 'wb') as f: pickle.dump(Statistic, f)
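# Backpropagating the clipped TD error through the chosen Q-values, as the
# block above does, yields the same gradients as minimizing the Huber (smooth
# L1) loss between current and target Q-values, since smooth_l1_loss is
# quadratic for |error| < 1 and linear beyond. A minimal sketch of that
# equivalent formulation, reusing the names defined above (`policy_net`,
# `obs_batch`, `actions_batch`, `expected_state_action_values`, `optimizer`);
# the helper name is illustrative only.
import torch.nn.functional as F

def huber_update(policy_net, optimizer, obs_batch, actions_batch,
                 expected_state_action_values):
    state_action_values = policy_net(obs_batch).gather(
        1, actions_batch.unsqueeze(1)).squeeze(1)
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()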
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete if not os.path.isdir("./models"): os.mkdir("./models") if len(env.observation_space.shape) == 1: input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 with torch.no_grad(): ret = model(obs).data.max(1)[1].cpu() return ret else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') save_best_mean_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 20000 SAVE_EVERY_N_STEPS = 2000000 AL_ALPHA = 0.7 for t in count(): if stopping_criterion is not None and stopping_criterion(env): break ### Step the env and store the transition last_idx = replay_buffer.store_frame(last_obs) recent_observations = replay_buffer.encode_recent_observation() # Choose random action if not yet start learning if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0] else: action = random.randrange(num_actions) obs, reward, done, _ = env.step(action) reward = max(-1.0, min(reward, 1.0)) replay_buffer.store_effect(last_idx, action, reward, done) if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. 
if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() cur_all_Q_values = Q(obs_batch) action_gap = cur_all_Q_values.max( dim=1)[0] * cur_all_Q_values.size(1) - cur_all_Q_values.sum( dim=1) Statistic["mean_action_gap"].append(action_gap.mean().item()) current_Q_values = cur_all_Q_values.gather( 1, act_batch.unsqueeze(1)).squeeze() next_target_Q_values = target_Q(next_obs_batch).detach() next_max_q = next_target_Q_values.max(1)[0] next_Q_values = not_done_mask * next_max_q target_Q_values = rew_batch + (gamma * next_Q_values) bellman_error = target_Q_values - current_Q_values cur_target_Q_values = target_Q(obs_batch).detach() cur_advantage = cur_target_Q_values.max( dim=1)[0] - cur_target_Q_values.gather( 1, act_batch.unsqueeze(1)).squeeze() next_advantage = next_target_Q_values.max( dim=1)[0] - next_target_Q_values.gather( 1, act_batch.unsqueeze(1)).squeeze() # Set up the error according to the operator you want al_error = bellman_error - AL_ALPHA * cur_advantage persistent_error = bellman_error - AL_ALPHA * next_advantage pal_error = torch.max(al_error, persistent_error) error = pal_error # use whichever you want clipped_bellman_error = error.clamp(-1, 1) d_error = clipped_bellman_error * -1.0 optimizer.zero_grad() current_Q_values.backward(d_error.data) optimizer.step() num_param_updates += 1 if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) ## Log Progress episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl') if save_best_mean_reward < best_mean_episode_reward: save_best_mean_reward = best_mean_episode_reward torch.save(Q.state_dict(), './models/best_model.pth') if t % SAVE_EVERY_N_STEPS == 0: torch.save(Q.state_dict(), './models/n_steps_%d.pth' % t)
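# The AL / PAL errors in the block above implement the advantage-learning and
# persistent advantage-learning operators of Bellemare et al., "Increasing the
# Action Gap" (2016). A compact restatement on plain tensors, assuming the
# same shapes as above (1-D batches except the (batch, actions) Q-value
# matrices); the function name is for illustration only.
import torch

def operator_errors(bellman_error, cur_target_q, next_target_q, act_batch, alpha=0.7):
    # advantage of the taken action under the target network at s_t and s_{t+1}
    cur_adv = cur_target_q.max(dim=1)[0] - cur_target_q.gather(
        1, act_batch.unsqueeze(1)).squeeze()
    next_adv = next_target_q.max(dim=1)[0] - next_target_q.gather(
        1, act_batch.unsqueeze(1)).squeeze()
    al_error = bellman_error - alpha * cur_adv            # advantage learning
    persistent_error = bellman_error - alpha * next_adv   # persistent operator
    return al_error, torch.max(al_error, persistent_error)  # (AL, PAL)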
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channels of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of choosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ ############### # BUILD MODEL # ############### # Set input_arg for the Q function according to the observation size if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c # Get the number of actions from the simulator num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule # Compare a random sample against the exploration schedule and return the greedy or the random action accordingly def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Inference only here, so no history needs to be saved return torch.IntTensor( [[model(Variable(obs)).data.max(1)[1].cpu().item()]]) else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function # Create the Q function and the target Q function Q = q_func(input_arg, num_actions).type(dtype) target_Q = q_func(input_arg, num_actions).type(dtype) # Construct Q network optimizer function # Build the optimizer from optimizer_spec optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer # Create replay_buffer using ReplayBuffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### # Initialize loop variables num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 # Monitor training with TensorboardX writer = SummaryWriter() # t starts at 0 and grows by one per loop iteration,
so it counts how many iterations have run for t in count(): ### Step the env and store the transition # Store latest observation in replay memory and last_idx can be used to store action, reward, done # The most recent frame goes into replay_buffer; last_idx marks where its action, reward, and done flag will be stored last_idx = replay_buffer.store_frame(last_obs) # encode_recent_observation will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Fetch the most recent frames from replay_buffer and build the Q network input from them recent_observations = replay_buffer.encode_recent_observation() # Choose a random action until learning starts # Once t exceeds learning_starts, i.e. enough iterations have passed, the action comes from the learned policy instead of at random if t > learning_starts: action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0] else: action = random.randrange(num_actions) # Advance one step # Take the action, record the resulting frame (obs), reward, and done flag, and push them into replay_buffer obs, reward, done, _ = env.step(action) replay_buffer.store_effect(last_idx, action, reward, done) # Resets the environment when reaching an episode boundary. # If the episode ended, reset the training environment as well if done: obs = env.reset() last_obs = obs ### Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken ## Learning runs once t exceeds learning_starts, ## t matches the learning_freq period, ## and the buffer holds enough samples for a batch if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Use the replay buffer to sample a batch of transitions # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target # Sample a batch-sized set of transitions from replay_buffer obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) # Convert numpy ndarrays to torch variables for calculation # Convert from numpy arrays to torch tensors to match the model input obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype) / 255.0) act_batch = Variable(torch.from_numpy(act_batch).long()) rew_batch = Variable(torch.from_numpy(rew_batch)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype) / 255.0) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: act_batch = act_batch.cuda() rew_batch = rew_batch.cuda() # Compute current Q value; q_func takes only the state and outputs a value for every state-action pair # We choose Q based on action taken.
# Compute the current Q values current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)) # Compute next Q value based on which action gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagate # Set the next Q values according to which action yields the max Q value next_max_q = target_Q(next_obs_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values # Compute the target Q values and train via the smooth L1 loss and a backward pass target_Q_values = rew_batch + (gamma * next_Q_values) loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1)) optimizer.zero_grad() loss.backward() # Perform the update # After the update, also increment the update counter optimizer.step() num_param_updates += 1 # Periodically update the target network by Q network to target Q network # Copy the Q network into the target network every target_update_freq updates if num_param_updates % target_update_freq == 0: target_Q.load_state_dict(Q.state_dict()) # ### 4. Log progress and keep track of statistics # Print the episode reward; past 100 episodes, also track the best mean alongside the mean episode_rewards = env.get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) # Log to Tensorboard if len(episode_rewards) > 0: writer.add_scalar('data/DQN/score', episode_rewards[-1], len(episode_rewards)) writer.add_scalar('data/DQN/mean_score', mean_episode_reward, len(episode_rewards)) if len(episode_rewards) > 100: writer.add_scalar('data/DQN/best_mean_score', best_mean_episode_reward, len(episode_rewards)) # Print learning progress if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() torch.save(Q, 'DQN_net1029.pt') # save the network to a file writer.close()
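# This variant logs through tensorboardX instead of pickling a Statistic dict.
# A minimal sketch of the logging pattern it relies on, with stand-in scores;
# the scalar tags are the ones used above.
from tensorboardX import SummaryWriter

writer = SummaryWriter()  # writes event files under ./runs/ by default
for episode, score in enumerate([1.0, 2.0, 3.0]):  # stand-in episode scores
    writer.add_scalar('data/DQN/score', score, episode)
writer.close()
# Inspect the curves with: tensorboard --logdir runs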
def dqn_learn(env, q_func, optimizer_spec, exploration, stopping_criterion, replay_buffer_size, batch_size, gamma, learning_starts, learning_freq, frame_history_len, target_update_freq, grad_norm_clipping, double_q): """Implements DQN training Parameters ---------- env : gym.Env OpenAI gym environment q_func : torch.nn.Module DQN that computes q-values for each action: (state) -> (q-value, action) optimizer_spec : OptimizerSpec parameters for the optimizer exploration : Schedule schedule for epsilon-greedy exploration stopping_criterion : func when to stop training: (env, num_timesteps) -> bool replay_buffer_size : int experience replay memory size batch_size : int batch size to sample from replay memory gamma : float discount factor learning_starts : int number of environment steps before starting the training process learning_freq : int number of environment steps between updating DQN weights frame_history_len : int number of previous frames to include as DQN input target_update_freq : int number of experience replay steps to update the target network grad_norm_clipping : float maximum size of gradients to clip to double_q : bool enable double DQN learning """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete def select_action(dqn, obs, t): """Implements epsilon-greedy exploration Parameters ---------- dqn : torch.nn.Module DQN model obs : np.ndarray Stacked input frames to evaluate t : int Current time step Returns ------- torch.Tensor with shape (1,) action to take """ threshold = exploration.value(t) if random.random() > threshold: # take optimal action obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # DQN returns (q-value, action) q_values = dqn(obs) # returns (max, argmax) of q-values (max q-value, action which produces max q-value) _, action = q_values.data.max(1) else: # take a random action action = torch.IntTensor([random.randrange(num_actions)]) return action # get input sizes and num actions img_h, img_w, img_c = env.observation_space.shape in_channels = frame_history_len * img_c input_shape = (img_h, img_w, in_channels) num_actions = env.action_space.n # construct online and target DQNs, typed like the input batches so model and data live on the same device online_DQN = q_func(in_channels=in_channels, num_actions=num_actions).type(dtype) target_DQN = q_func(in_channels=in_channels, num_actions=num_actions).type(dtype) # construct optimizer optimizer = optimizer_spec.constructor(online_DQN.parameters(), **optimizer_spec.kwargs) # construct replay memory replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) # initialize main loop variables num_param_updates = 0 avg_episode_reward = float('-inf') best_avg_episode_reward = float('-inf') cumulative_avg_episode_reward = float('-inf') prev_obs = env.reset() # main training loop for t in count(): # check stopping criterion if stopping_criterion is not None and stopping_criterion(env, t): break # store transition and concatenate last frames last_idx = replay_buffer.store_frame(prev_obs) # stack previous frames into a tensor to give to DQN stacked_obs = replay_buffer.encode_recent_observation() # take random actions until we've officially started training if t > learning_starts: # select action according to epsilon-greedy action = select_action(online_DQN, stacked_obs, t)[0] else: # take a random action action = random.randrange(num_actions) # step environment obs, reward, done, _ = env.step(action) # clip reward to [-1, 1] reward = max(-1.0, min(reward, 1.0)) # store effect of taking action in prev_obs into replay memory replay_buffer.store_effect(last_idx, action, reward, done) # if game is finished, reset environment if done: obs = env.reset() prev_obs = obs # experience replay if t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample( batch_size): # sample batches obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) obs_batch = torch.from_numpy(obs_batch).type(dtype) / 255.0 action_batch = torch.from_numpy(action_batch).long() reward_batch = torch.from_numpy(reward_batch) next_obs_batch = torch.from_numpy(next_obs_batch).type( dtype) / 255.0 not_done_mask = torch.from_numpy(1 - done_mask).type(dtype) if torch.cuda.is_available(): action_batch = action_batch.cuda() reward_batch = reward_batch.cuda() # Compute current q-values: Q(s, a) # Select q-values based on actions we would have taken for each state # shape: (BATCH_SIZE, 1) current_q_values = online_DQN(obs_batch).gather( 1, action_batch.unsqueeze(1)) # double DQN or vanilla DQN if double_q: # compute which actions to take according to online network: argmax_a Q(s', a) greedy_actions = online_DQN(next_obs_batch).detach().max(1)[1] # compute q-values of those actions using target network: Q_hat(s', argmax_a Q(s', a)) # detach so no gradients flow into the target network next_q_values = target_DQN(next_obs_batch).gather( 1, greedy_actions.unsqueeze(1)).detach() else: # Compute next q-values using target network next_q_values = target_DQN(next_obs_batch).detach().max(1)[0] next_q_values = next_q_values.unsqueeze(1) # apply mask to retain q-values next_q_values = not_done_mask.unsqueeze(1) * next_q_values """ Compute the target q-values (BATCH_SIZE, 1) y_j = r_j + gamma * max_a' Q(s', a') for vanilla DQN y_j = r_j + gamma * Q_hat(s', argmax_a Q(s', a)) for double DQN """ # unsqueeze the (BATCH,) rewards so they broadcast against the (BATCH, 1) next values instead of exploding to (BATCH, BATCH) target_q_values = reward_batch.unsqueeze(1) + (gamma * next_q_values) """ Use the Huber loss instead of clipping the TD error. The Huber loss is quadratic for small errors and linear for large ones, so large TD errors contribute gradients of bounded magnitude -- the same effect as clipping the error to [-1, 1]. """ loss = F.smooth_l1_loss(current_q_values, target_q_values) # Clear previous gradients before backward pass optimizer.zero_grad() # run backward pass loss.backward() # clip gradients nn.utils.clip_grad_norm_(online_DQN.parameters(), grad_norm_clipping) # update weights of dqn optimizer.step() num_param_updates += 1 # update target network weights if num_param_updates % target_update_freq == 0: target_DQN.load_state_dict(online_DQN.state_dict()) # end experience replay # log progress so far by averaging last 100 episodes episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: avg_episode_reward = np.mean(episode_rewards[-100:]) cumulative_avg_episode_reward = np.mean(episode_rewards) if len(episode_rewards) > 100: best_avg_episode_reward = max(best_avg_episode_reward, avg_episode_reward) if t % LOG_FREQ == 0 and t > learning_starts: print('-' * 64) print('Timestep {}'.format(t)) print( 'Average reward (100 episodes): {}'.format(avg_episode_reward)) print('Best average reward: {}'.format(best_avg_episode_reward)) print('Cumulative average reward: {}'.format( cumulative_avg_episode_reward)) print('Episode {}'.format(len(episode_rewards))) print('Exploration {}'.format(exploration.value(t))) print('\n') sys.stdout.flush()
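# Every variant in this file receives an `optimizer_spec` exposing
# `.constructor` and `.kwargs`. A minimal sketch of how such a spec can be
# built; the namedtuple layout is an assumption inferred from how the spec is
# used here, and the hyperparameter values are placeholders.
from collections import namedtuple
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
)
# dqn_learn would then instantiate it as:
# optimizer = optimizer_spec.constructor(model.parameters(), **optimizer_spec.kwargs)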
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): print("running new version") """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. 
""" ---------------------------- OUR CODE ---------------------------- """ Q = q_func(input_arg, num_actions) # The parameters are random Qtag = q_func(input_arg, num_actions) if (USE_CUDA): Q.cuda() Qtag.cuda() Qtag.load_state_dict(Q.state_dict()) # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) """ ------------------------------------------------------------------ """ ############### # RUN ENV # ############### num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() reward = None done = None info = None LOG_EVERY_N_STEPS = 10000 startTime = time.time() for t in count(): """ Tsuf: ---- Stuff for debigging times for various places --- """ T1 = 0 t1Tmp = 0 T2 = 0 t2Tmp = 0 T3 = 0 t3Tmp = 0 T4 = 0 t4Tmp = 0 T5 = 0 t5Tmp = 0 T6 = 0 t6Tmp = 0 T7 = 0 t7Tmp = 0 T8 = 0 t8Tmp = 0 """ ----------------------------------------------------------- """ ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): break #if (t>1000000): # break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) 
""" -------------------------- OUR CODE -------------------------- """ #store last_obs, and get latest obs's as the input for the n.n t1Tmp = time.time() cur_idx = replay_buffer.store_frame(last_obs) next_input = replay_buffer.encode_recent_observation() T1 += time.time() - t1Tmp #take random action or use the net t2Tmp = time.time() action = select_epilson_greedy_action( Q, next_input, t) #the returned action is on the CPU T2 += time.time() - t2Tmp #see what happens after we take that action t3Tmp = time.time() last_obs, reward, done, info = env.step( action) #the returned parameters are on the CPU T3 += time.time() - t3Tmp # print(t) # env.render() #store the results on the replay buffer replay_buffer.store_effect(cur_idx, action, reward, done) #on the CPU #if the simulation is done, reset the environment if (done): last_obs = env.reset() """ -------------------------------------------------------------- """ # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. 
# you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) """ ------------------------ OUR CODE ------------------------ """ #sample a batch of history samples t4Tmp = time.time() obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample( batch_size) #on CPU obs_batch = torch.from_numpy(obs_batch).type( dtype) / 255.0 # When available, move the samples batch to GPU next_obs_batch = torch.from_numpy(next_obs_batch).type( dtype) / 255.0 #GPU T4 += time.time() - t4Tmp #see which Q values the current network gives, for all obs's t5Tmp = time.time() inter_Qs = Q( Variable(obs_batch)) #input is on GPU, output is on GPU inter_Qs_chosen = Variable( torch.zeros(batch_size).type(dtype)) #GPU #take the action that was chosen before for i in range(batch_size): inter_Qs_chosen[i] = inter_Qs[i, act_batch[i]] #take only the intermediate (non-terminal) obs's inter_idx = np.where(done_mask == False)[0] #CPU inter_next_obs_batch = next_obs_batch[inter_idx, :, :, :] T5 += time.time() - t5Tmp #see what the "target" (backed-up) network says for the intermediate ones t6Tmp = time.time() inter_next_Qs = Qtag( Variable(inter_next_obs_batch, volatile=True)).data.max(1)[0] #All on GPU T6 += time.time() - t6Tmp #calculate the bellman errors t7Tmp = time.time() #for final obs's, the target is just the reward targets = torch.from_numpy(rew_batch).type( dtype) #Moved rew_batch to GPU (as 'targets') for (i, idx) in enumerate(inter_idx): targets[idx] += gamma * inter_next_Qs[i] #The bellman item errors = inter_Qs_chosen.data - targets errors = errors.clamp(-1, 1) #clamp returns a new tensor, so assign it back for the clipping to take effect T7 += time.time() - t7Tmp #train the network! (: t8Tmp = time.time() optimizer.zero_grad() inter_Qs_chosen.backward( errors) #the gradient of 0.5*(Q - target)^2 w.r.t. Q is exactly `errors`, so backward-ing it minimizes the squared TD error [Everything is on GPU (: ] optimizer.step() T8 += time.time() - t8Tmp num_param_updates += 1 if (num_param_updates % target_update_freq == 0): Qtag.load_state_dict(Q.state_dict()) """ ---------------------------------------------------------- """ ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) Statistic["running_times"].append(int(time.time() - startTime)) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: if (PRINT_TIMES): print("-----------------------") print(T1) print(T2) print(T3) print(T4) print(T5) print(T6) print(T7) print(T8) print("-----------------------") print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Dump statistics to pickle with open('statistics.pkl', 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'statistics.pkl')
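# The T1..T8 accumulators above hand-roll per-section timing. A small context
# manager gives the same measurements with less bookkeeping; this is an
# illustrative alternative, not part of the original code.
import time
from collections import defaultdict
from contextlib import contextmanager

section_times = defaultdict(float)

@contextmanager
def timed(name):
    start = time.time()
    try:
        yield
    finally:
        section_times[name] += time.time() - start

# Usage inside the training loop, e.g.:
# with timed("sample_batch"):
#     obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)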
def dqn_learing( env, q_func, optimizer_spec, exploration, feature_tested, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, ): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete # added bool_flag for double save statistic bool_flag = False STATS_FILE_NAME = 'statistics ' + feature_tested + '.pkl' ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 with torch.no_grad(): # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history return model(Variable(obs)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. 
###### # YOUR CODE HERE if USE_CUDA: Q = q_func(num_actions=num_actions).cuda() Q_target = q_func(num_actions=num_actions).cuda() else: Q = q_func(num_actions=num_actions) Q_target = q_func(num_actions=num_actions) Q_target.load_state_dict(Q.state_dict()) # Check & load pretrained model if os.path.isfile('Q_params' + feature_tested + '.pkl'): print('Load Q parameters ...') Q.load_state_dict(torch.load('Q_params' + feature_tested + '.pkl')) if os.path.isfile('target_Q_params' + feature_tested + '.pkl'): print('Load target Q parameters ...') Q_target.load_state_dict( torch.load('target_Q_params' + feature_tested + '.pkl')) ###### # Construct Q network optimizer function optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs) # Construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) Statistic = { "starting_Q_values": [], "mean_episode_rewards": [], "best_mean_episode_rewards": [] } mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') # load prev Stats start = 0 if os.path.isfile(STATS_FILE_NAME): with open(STATS_FILE_NAME, 'rb') as f: Statistic = pickle.load(f) mean_episode_reward = Statistic["mean_episode_rewards"][-1] best_mean_episode_reward = Statistic["best_mean_episode_rewards"][ -1] start = len(Statistic["mean_episode_rewards"]) print('Load %s ...' % STATS_FILE_NAME) done = False ############### # RUN ENV # ############### num_param_updates = 0 last_obs = env.reset() LOG_EVERY_N_STEPS = 50000 #10000 for t in count(start): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env): return Statistic # fallback in case stopping_criterion never triggers: if t > 4500000: return Statistic ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...)
##### # YOUR CODE HERE idx = replay_buffer.store_frame(last_obs) encoded_obs = replay_buffer.encode_recent_observation() action = select_epilson_greedy_action(Q, encoded_obs, t) # if started a new game - store Q-func ###### # done not initialized since last time if t > learning_starts and done: # a very expensive statistic - so don't log frequently with torch.no_grad(): obs = torch.from_numpy(encoded_obs).type(dtype).unsqueeze( 0) / 255.0 item = torch.max(Q(Variable(obs))).item() Statistic["starting_Q_values"].append(item) ###### # this steps the environment forward one step last_obs, reward, done, info = env.step(action) replay_buffer.store_effect(idx, action, reward, done) if done: last_obs = env.reset() ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # Note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # Note: Move the variables to the GPU if avialable # 3.b: fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # Note: don't forget to clip the error between [-1,1], multiply is by -1 (since pytorch minimizes) and # maskout post terminal status Q-values (see ReplayBuffer code). # 3.c: train the model. To do this, use the bellman error you calculated perviously. # Pytorch will differentiate this error for you, to backward the error use the following API: # current.backward(d_error.data.unsqueeze(1)) # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error. # Your code should produce one scalar-valued tensor. # Note: don't forget to call optimizer.zero_grad() before the backward call and # optimizer.step() after the backward call. # 3.d: periodically update the target network by loading the current Q network weights into the # target_Q network. see state_dict() and load_state_dict() methods. 
# you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE # 3.a sample a batch of transitions sample = replay_buffer.sample(batch_size) obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = sample # move variables to GPU if available obs_batch = Variable( torch.from_numpy(obs_batch).type(dtype)) / 255.0 action_batch = Variable( torch.from_numpy(action_batch).type(dtype).long().view(-1, 1)) reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype)) next_obs_batch = Variable( torch.from_numpy(next_obs_batch).type(dtype)) / 255.0 done_mask = Variable(torch.from_numpy(done_mask).type(dtype)) # 3.b compute the Bellman error # evaluating the current and next Q-values state_action_values = Q(obs_batch).gather(1, action_batch) next_state_values = Q_target(next_obs_batch).detach() # maskout post terminal status Q-values masked_next_state_values = next_state_values.max(1)[0] * ( 1 - done_mask) # constructing the corresponding error expected_state_action_values = (masked_next_state_values * gamma) + reward_batch bellman_error = expected_state_action_values.unsqueeze( 1) - state_action_values # clip the error between [-1,1] clipped_bellman_error = bellman_error.clamp(-1, 1) optimizer.zero_grad() # multiply by -1 (since pytorch minimizes) state_action_values.backward(-clipped_bellman_error) # 3.c: train the model optimizer.step() # 3.d periodically update the target network num_param_updates += 1 if num_param_updates % target_update_freq == 0: Q_target.load_state_dict(Q.state_dict()) ##### ### 4. Log progress and keep track of statistics episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) Statistic["mean_episode_rewards"].append(mean_episode_reward) Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts: print("Timestep %d" % (t, )) print("mean reward (100 episodes) %f" % mean_episode_reward) print("best mean reward %f" % best_mean_episode_reward) print("episodes %d" % len(episode_rewards)) print("exploration %f" % exploration.value(t)) sys.stdout.flush() # Save the trained model torch.save(Q.state_dict(), 'Q_params' + feature_tested + '.pkl') torch.save(Q_target.state_dict(), 'target_Q_params' + feature_tested + '.pkl') # Dump statistics to pickle #double save if bool_flag: bool_flag = False with open(STATS_FILE_NAME, 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % STATS_FILE_NAME) else: bool_flag = True with open('copy_' + STATS_FILE_NAME, 'wb') as f: pickle.dump(Statistic, f) print("Saved to %s" % 'copy_' + STATS_FILE_NAME) plt.clf() plt.xlabel('Num of Games') plt.ylabel('Q-values on starting state') plt.plot(range(len(Statistic["starting_Q_values"])), Statistic["starting_Q_values"], label='Q-values') plt.legend() plt.title(feature_tested) plt.savefig('Q-value-Performance' + feature_tested + '.png') plt.clf() plt.xlabel('Timesteps') plt.ylabel('Mean Reward (past 100 episodes)') num_items = len(Statistic["mean_episode_rewards"]) plt.plot(range(num_items), Statistic["mean_episode_rewards"], label='mean reward') plt.plot(range(num_items), Statistic["best_mean_episode_rewards"], label='best mean rewards') plt.legend() plt.title(feature_tested) plt.savefig('DeepQ-Performance' + 
feature_tested + '.png')
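# All variants in this file drive epsilon-greedy exploration through
# `exploration.value(t)`. A minimal linear-decay schedule matching that
# interface; this is a sketch of the common pattern, and the project's actual
# Schedule class may differ.
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # fraction of the decay completed, capped at 1.0
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. LinearSchedule(1000000, 0.1).value(500000) == 0.55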
def dqn_learing(env, q_func, optimizer_spec, exploration, stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: input_channel: int number of channel of input. num_actions: int number of actions optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer exploration: Schedule (defined in utils.schedule) schedule for probability of chosing random action. stopping_criterion: (env) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete Statistic['parameters'] = { 'replay_buffer_size': replay_buffer_size, 'batch_size': batch_size, 'gamma': gamma, 'frame_history_len': frame_history_len, 'learning_starts': learning_starts, 'learning_freq': learning_freq, 'target_update_freq': target_update_freq, 'name': env.env.unwrapped.spec.id } ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_arg = env.observation_space.shape[0] else: img_h, img_w, img_c = env.observation_space.shape input_arg = frame_history_len * img_c num_actions = env.action_space.n # Construct an epilson greedy policy with given exploration schedule def select_epilson_greedy_action(model, obs, t): sample = random.random() eps_threshold = exploration.value(t) if sample > eps_threshold: obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0 # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return model(Variable(obs)).data.max(1)[1].cpu() else: return torch.IntTensor([[random.randrange(num_actions)]]) # Initialize target q function and q function, i.e. build the model. 
    # Initialize the target Q function and the Q function, i.e. build the model.
    ######
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)
    if USE_CUDA:
        Q = Q.cuda()
        target_Q = target_Q.cuda()
    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    filename = 'statistics.pkl'

    # Google Drive
    try:
        import google.colab
        IN_COLAB = True
    except ImportError:
        IN_COLAB = False
    if IN_COLAB:
        run_in_colab_message()
        try:
            from google.colab import auth
            import logging
            from pydrive.auth import GoogleAuth
            from pydrive.drive import GoogleDrive
            from oauth2client.client import GoogleCredentials

            logging.getLogger('googleapiclient.discovery_cache').setLevel(
                logging.ERROR)
            auth.authenticate_user()
            gauth = GoogleAuth()
            gauth.credentials = GoogleCredentials.get_application_default()
            drive = GoogleDrive(gauth)
        except Exception:
            pass

    iter_time = time()
    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition. Specifically, last_obs must point to the new latest
        # observation.
        # Useful functions you'll need to call:
        #     obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        #     obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input into your
        # network, since it needs to be processed to include context from
        # previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        idx = replay_buffer.store_frame(last_obs)
        enc_obs = replay_buffer.encode_recent_observation()
        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, enc_obs, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
        replay_buffer.store_effect(idx, action, reward, done)
        last_obs = obs
        #####
        # at this point, the environment should have been advanced one step
        # (and reset if done was true), and last_obs should point to the new
        # latest observation
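
        # Shape note (an assumption based on the standard dqn_utils
        # ReplayBuffer this assignment ships with): for image observations,
        # encode_recent_observation() returns an array of shape
        # (img_h, img_w, frame_history_len * img_c) -- the most recent
        # frame_history_len frames concatenated along the channel axis, with
        # zero-padding at episode starts -- which is why
        # input_arg = frame_history_len * img_c is the network's input
        # channel count above.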
        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the
            # corresponding error.
            # Note: don't forget to clip the error between [-1, 1], multiply it
            # by -1 (since pytorch minimizes) and mask out post-terminal
            # Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you
            # calculated previously. Pytorch will differentiate this error for
            # you; to backward the error use the following API:
            #     current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q values and
            # d_error is the clipped bellman error. Your code should produce
            # one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the
            # backward call and optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current
            # Q network weights into the target_Q network. See state_dict() and
            # load_state_dict() methods. You should update every
            # target_update_freq steps, and you may find the variable
            # num_param_updates useful for this (it was initialized to 0)
            #####
            # 3.a sample a batch of transitions
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = \
                replay_buffer.sample(batch_size)
            # None of the inputs need gradient tracking; only the Q network's
            # parameters do
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.)
            act_batch = Variable(torch.from_numpy(act_batch).type(torch.int64))
            rew_batch = Variable(torch.from_numpy(rew_batch).type(dtype))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.)
            done_mask = Variable(torch.from_numpy(done_mask).type(torch.int64))
            if USE_CUDA:
                obs_batch = obs_batch.cuda()
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                next_obs_batch = next_obs_batch.cuda()
                done_mask = done_mask.cuda()

            # Q network: Q-value of the action actually taken in each transition
            val = Q(obs_batch).gather(dim=1, index=act_batch.unsqueeze(1))

            # Q target network: bootstrap target r + gamma * (1 - done) * max_a Q'
            with torch.no_grad():
                tar_val_t = target_Q(next_obs_batch).max(1)[0]
                tar_val = rew_batch + gamma * (1 - done_mask.type(dtype)) * tar_val_t

            # 3.b error calculation
            d_error = (tar_val - val.squeeze()).clamp_(-1, 1) * -1.
            # d_error = torch.pow((tar_val - val.squeeze()).clamp_(-1, 1), 2) * -1.

            # 3.c train the Q network
            optimizer.zero_grad()
            val.backward(d_error.data.unsqueeze(1))
            optimizer.step()

            # 3.d update the target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
            #####
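
        # Cadence note: num_param_updates increments once per gradient step,
        # i.e. once every learning_freq environment steps, so with the default
        # arguments (learning_freq=4, target_update_freq=10000) the target
        # network is refreshed every 40000 environment steps.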
        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print(f"Iteration time: {time() - iter_time:.2f}")
            iter_time = time()
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle; on Colab, timestamp the filename so
            # each upload is kept as a separate file
            filename = f"{t}statistics.pkl" if IN_COLAB else 'statistics.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(Statistic, f)
            print("Saved to %s" % filename)

            if IN_COLAB and t % (LOG_EVERY_N_STEPS * 10) == 0:
                try:
                    stat_pkl = drive.CreateFile()
                    stat_pkl.SetContentFile(filename)
                    stat_pkl.Upload()
                    print("Uploaded to drive")
                except Exception:
                    print("Exception during upload to drive")
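
# A minimal usage sketch, not part of the assignment code: one plausible way
# to invoke dqn_learing. The module paths for DQN, get_env, and LinearSchedule
# below are assumptions (adjust them to your local files), and OptimizerSpec
# is assumed to be the namedtuple referenced in the docstring above.
if __name__ == '__main__':
    import torch.optim as optim
    from dqn_model import DQN                  # assumed: the convnet q_func
    from utils.gym import get_env              # assumed: env wrapper helper
    from utils.schedule import LinearSchedule  # assumed: exploration schedule

    env = get_env('PongNoFrameskip-v4', seed=0)
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
    )
    # Anneal epsilon linearly from 1.0 to 0.1 over the first million steps
    exploration = LinearSchedule(1000000, 0.1)
    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration,
        stopping_criterion=None,
    )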