class ExplorationOrExploitationAgent(DQNAgent):
    exploration_model: BaseExplorationModel

    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        if agent_params['use_cbe']:
            self.exploration_model = CountBasedModel(agent_params['cbe_coefficient'], env)
        else:
            self.exploration_model = RNDModel(agent_params, self.optimizer_spec)

        self.explore_weight_schedule: Schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule: Schedule = agent_params['exploit_weight_schedule']

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}

        if self.t > self.num_exploration_steps:
            # After exploration is over, set the actor to optimize the extrinsic critic
            # HINT: Look at method ArgMaxPolicy.set_critic
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Get Reward Weights #
            # Get the current explore and exploit reward weights using the
            # schedules passed in (see __init__).
            # COMMENT: Until part 3, explore_weight = 1 and exploit_weight = 0
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Run Exploration Model #
            # Evaluate the exploration model on s' to get the exploration bonus.
            # HINT: Normalize the exploration bonus, as RND values vary highly in magnitude
            expl_bonus = self.exploration_model.forward_np(next_ob_no)
            expl_bonus = normalize(expl_bonus, expl_bonus.mean(), expl_bonus.std())

            # Reward Calculations #
            # Calculate the mixed reward, which is passed to the exploration critic
            # (see the assignment doc for the definition of mixed_reward)
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n

            # Calculate the environment reward: 're_n' shifted by self.exploit_rew_shift
            # and scaled by self.exploit_rew_scale (for part 1 this is just 're_n')
            env_reward = (re_n + self.exploit_rew_shift) * self.exploit_rew_scale

            # Update Critics And Exploration Model #
            # 1) Update the exploration model (based on s')
            expl_model_loss = self.exploration_model.update(next_ob_no)
            # 2) Update the exploration critic (based on mixed_reward)
            exploration_critic_loss = self.exploration_critic.update(
                ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
            # 3) Update the exploitation critic (based on env_reward)
            exploitation_critic_loss = self.exploitation_critic.update(
                ob_no, ac_na, next_ob_no, env_reward, terminal_n)

            # Target Networks #
            if self.num_param_updates % self.target_update_freq == 0:
                self.exploration_critic.update_target_network()
                self.exploitation_critic.update_target_network()

            # Logging #
            log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss']
            log['Exploration Model Loss'] = expl_model_loss

            # These entries require cql_critic.py to be completed
            log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        perform_random_action = np.random.random() < self.eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            processed = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(processed)

        next_obs, reward, done, info = self.env.step(int(action))
        self.last_obs = next_obs.copy()

        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()
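# The `normalize` helper used by the exploration-bonus code above is defined
# elsewhere (e.g. in the course utils), not in this file. A minimal sketch that
# is consistent with how it is called here -- normalize(data, mean, std) --
# could look like the following; the `eps` guard against a zero standard
# deviation is an assumption for illustration, not part of the original signature.
def normalize(data, mean, std, eps=1e-8):
    """Standardize `data` to roughly zero mean and unit variance using the given stats."""
    return (data - mean) / (std + eps)

# Example use, mirroring train() above:
#   expl_bonus = normalize(expl_bonus, expl_bonus.mean(), expl_bonus.std())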
class DQNAgent(object):

    def __init__(self, env, agent_params):
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        if 'topk' in agent_params['policy']:
            self.actor = TopkPolicy(self.critic, agent_params['topk_policy'])
        else:
            self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        # Store the latest observation ("frame") into the replay buffer
        # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer` in dqn_utils.py
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)
        last_obs = self.replay_buffer.encode_recent_observation()
        eps = self.exploration.value(self.t)

        # Use epsilon-greedy exploration when selecting the action:
        # take a random action with probability eps (see np.random.random()),
        # or if the current step number (see self.t) is less than self.learning_starts
        perform_random_action = np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            # The actor takes in multiple previous observations ("frames") to deal with
            # the partial observability of the environment. The most recent
            # `frame_history_len` observations were stitched together above via the
            # replay buffer's encode_recent_observation and are used as the actor's input.
            action = self.actor.get_action(last_obs)

        # Take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest observation
        # HINT2: obs, reward, done, info = env.step(action)
        obs, reward, done, info = self.env.step(action)
        self.last_obs = obs

        # Store the result of taking this action in the replay buffer
        # HINT1: see the replay buffer's `store_effect` function
        # HINT2: one of the arguments to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        # If taking this step resulted in done, reset the env (and the latest observation)
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Update the critic using the sampled batch
            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

            # Update the target network periodically
            # HINT: the critic already has this functionality implemented
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
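# step_env() above reads the exploration rate from agent_params['exploration_schedule'],
# whose implementation lives in dqn_utils.py. For reference, a minimal linear-decay
# schedule with the same .value(t) interface looks roughly like the sketch below;
# the class name LinearSchedule and its constructor arguments are shown as an
# illustrative assumption, not this repo's exact definition.
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the decay elapsed so far, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. eps decays from 1.0 to 0.02 over the first 100k steps:
#   exploration = LinearSchedule(100000, 0.02)
#   eps = exploration.value(t)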
class ExplorationOrExploitationAgent(DQNAgent):
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
        self.l2_info = agent_params['l2_info']

    def dist2(self, x, c):
        """
        Calculate the squared distance between two sets of points.

        D = dist2(X, C) takes two matrices of vectors and calculates the
        squared Euclidean distances between them. Both matrices must have
        the same column dimension. If X has M rows and N columns, and C has
        L rows and N columns, then the result has M rows and L columns. The
        (I, J)th entry is the squared distance from the Ith row of X to the
        Jth row of C.

        Adapted from code by Christopher M Bishop and Ian T Nabney.
        """
        ndata, dimx = x.shape
        ncenters, dimc = c.shape
        return (np.ones((ncenters, 1)) * np.sum((x**2).T, axis=0)).T + \
               np.ones((ndata, 1)) * np.sum((c**2).T, axis=0) - \
               2 * np.inner(x, c)

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}

        if self.t > self.num_exploration_steps:
            # After exploration is over, set the actor to optimize the extrinsic critic
            # HINT: Look at method ArgMaxPolicy.set_critic
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Get Reward Weights #
            # Get the current explore and exploit reward weights using the
            # schedules passed in (see __init__).
            # COMMENT: Until part 3, explore_weight = 1 and exploit_weight = 0
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Run Exploration Model #
            # Evaluate the exploration model on s' to get the exploration bonus.
            # HINT: Normalize the exploration bonus, as RND values vary highly in magnitude
            if self.l2_info:
                # Use the aggregate squared distance of s' to the rest of the batch
                dist = self.dist2(next_ob_no, ob_no)
                expl_bonus = np.sum(dist, axis=1)
            else:
                expl_bonus = self.exploration_model.forward_np(next_ob_no)
            expl_bonus = (expl_bonus - np.mean(expl_bonus)) / np.std(expl_bonus)

            # Reward Calculations #
            # Calculate the mixed reward, which is passed to the exploration critic
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n

            # Calculate the environment reward: 're_n' shifted by self.exploit_rew_shift
            # and scaled by self.exploit_rew_scale (for part 1 this is just 're_n')
            env_reward = (re_n + self.exploit_rew_shift) * self.exploit_rew_scale

            # Update Critics And Exploration Model #
            # 1) Update the exploration model (based on s')
            # 2) Update the exploration critic (based on mixed_reward)
            # 3) Update the exploitation critic (based on env_reward)
            expl_model_loss = self.exploration_model.update(ptu.from_numpy(next_ob_no))
            exploration_critic_loss = self.exploration_critic.update(
                ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
            exploitation_critic_loss = self.exploitation_critic.update(
                ob_no, ac_na, next_ob_no, env_reward, terminal_n)

            # Target Networks #
            if self.num_param_updates % self.target_update_freq == 0:
                # Update the exploitation and exploration target networks
                self.exploration_critic.update_target_network()
                self.exploitation_critic.update_target_network()

            # Logging #
            log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss']
            log['Exploration Model Loss'] = expl_model_loss

            # These entries require cql_critic.py to be completed
            log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        perform_random_action = np.random.random() < self.eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            processed = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(processed)

        next_obs, reward, done, info = self.env.step(action)
        self.last_obs = next_obs.copy()

        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()
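# Quick illustration of the dist2 helper above: for two small matrices, the (i, j)
# entry is the squared Euclidean distance between row i of x and row j of c, and
# summing each row gives the aggregate distance of a state to the rest of the
# batch -- the quantity used as the l2_info exploration bonus. The inputs below
# are made up for illustration.
import numpy as np

x = np.array([[0.0, 0.0], [1.0, 0.0]])
c = np.array([[0.0, 0.0], [0.0, 2.0], [3.0, 0.0]])

# Pairwise squared distances, shape (2, 3), computed exactly as in dist2()
d = (np.ones((c.shape[0], 1)) * np.sum((x ** 2).T, axis=0)).T \
    + np.ones((x.shape[0], 1)) * np.sum((c ** 2).T, axis=0) \
    - 2 * np.inner(x, c)
# d == [[0., 4., 9.], [1., 5., 4.]]

# A state far from the rest of the batch gets a larger (more "novel") bonus
bonus = np.sum(d, axis=1)   # [13., 10.]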
class DQNAgent(object):

    def __init__(self, env, agent_params):
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        # Store the latest observation ("frame") into the replay buffer
        # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer` in dqn_utils.py
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)
        eps = self.exploration.value(self.t)

        # Use epsilon-greedy exploration when selecting the action:
        # take a random action with probability eps (see np.random.random()),
        # or if the current step number (see self.t) is less than self.learning_starts
        # (start off taking random actions before the policy has been trained)
        perform_random_action = np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            # Feed in multiple previous observations ("frames") to deal with
            # the partial observability of the environment
            frames = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(frames)

        # Take a step in the environment using the action from the policy
        self.last_obs, reward, done, info = self.env.step(action)

        # Store the result of this action for this observation in the replay buffer
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Update the critic using the sampled batch
            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

            # Update the target network periodically
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
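# All of the step_env() implementations above rely on the same MemoryOptimizedReplayBuffer
# contract from dqn_utils.py: store_frame() returns the index of the stored observation,
# encode_recent_observation() stitches the most recent `frame_history_len` frames together
# as the Q-network input, and store_effect() attaches (action, reward, done) to that index.
# A minimal (non-memory-optimized, no frame stacking) sketch of that interface, for
# illustration only:
class SimpleReplayBuffer:
    def __init__(self, size):
        self.size = size
        self.obs, self.acs, self.rews, self.dones = [], [], [], []

    def store_frame(self, frame):
        if len(self.obs) >= self.size:
            # Drop the oldest transition when full
            for buf in (self.obs, self.acs, self.rews, self.dones):
                buf.pop(0)
        self.obs.append(frame)
        self.acs.append(None)
        self.rews.append(None)
        self.dones.append(None)
        return len(self.obs) - 1            # index later passed to store_effect

    def encode_recent_observation(self):
        return self.obs[-1]                 # real buffer stacks previous frames here

    def store_effect(self, idx, action, reward, done):
        self.acs[idx], self.rews[idx], self.dones[idx] = action, reward, done

    def can_sample(self, batch_size):
        return len(self.obs) > batch_size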
class DQNAgent(object):

    def __init__(self, env, agent_params):
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic, device=agent_params['device'])

        lander = agent_params['env_name'] == 'LunarLander-v2'
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        # Store the latest observation into the replay buffer
        # HINT: see the replay buffer's store_frame function
        # Make the observation channel-first before storing it
        self.last_obs = np.transpose(self.last_obs, (2, 0, 1))
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # Use epsilon-greedy exploration when selecting the action:
        # take a random action with probability eps (see np.random.random()),
        # or if the current step number (see self.t) is less than self.learning_starts
        perform_random_action = np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            # Query the policy to select an action.
            # HINT: "self.last_obs" cannot be used directly as input to the network,
            # since it needs to be processed to include context from previous frames.
            # The replay buffer's encode_recent_observation takes the latest observation
            # pushed into the buffer and computes the corresponding Q-network input by
            # appending some previous frames.
            enc_last_obs = self.replay_buffer.encode_recent_observation()
            enc_last_obs = enc_last_obs[None, :]

            # Query the policy with enc_last_obs to select the action
            action = self.actor.get_action(enc_last_obs.astype(np.float32))
            action = action[0]

        # Take a step in the environment using the action from the policy
        # HINT1: remember that self.last_obs must always point to the newest observation
        # HINT2: obs, reward, done, info = env.step(action)
        self.last_obs, reward, done, info = self.env.step(action)

        # Store the result of taking this action into the replay buffer
        # HINT1: see the replay buffer's store_effect function
        # HINT2: one of the arguments to pass in is self.replay_buffer_idx from above
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        # If taking this step resulted in done, reset the env (and the latest observation)
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
        Train the DQN agent. This consists of training the critic, as well as
        periodically updating the target network.
        """
        loss = 0.0
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Assemble the tensors needed to compute the critic's total_error
            feed_dict = {
                'lr': self.optimizer_spec.lr_schedule.value(self.t),
                'ob_no': ob_no.astype(np.float32),
                'act_t_ph': ac_na.astype(np.long),
                're_n': re_n,
                'next_ob_no': next_ob_no.astype(np.float32),
                'terminal_n': terminal_n,
            }

            # The critic's total_error is the Bellman error over the batch, and its
            # update function performs a gradient step to reduce that error
            loss = self.critic.update(**feed_dict)

            # Periodically update the critic's target network
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return loss
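# Every train() above calls critic.update_target_network() once per target_update_freq
# parameter updates. In a PyTorch DQN critic this is typically a hard copy of the online
# Q-network's weights into the target Q-network; a minimal sketch of that operation is
# given below (the attribute names q_net and q_net_target are assumptions about the
# critic, not guaranteed by this file).
def update_target_network(critic):
    # Overwrite each target parameter with the corresponding online parameter
    for target_param, param in zip(critic.q_net_target.parameters(),
                                   critic.q_net.parameters()):
        target_param.data.copy_(param.data)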
class ExplorationOrExploitationAgent(DQNAgent):
    def __init__(self, env, agent_params):
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}

        if self.t > self.num_exploration_steps:
            # After exploration is over, optimize the extrinsic critic
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Get Reward Weights #
            # COMMENT: Until part 3, explore_weight = 1 and exploit_weight = 0
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Run Exploration Model #
            expl_bonus = self.exploration_model.forward_np(next_ob_no)
            expl_bonus = normalize(expl_bonus, np.mean(expl_bonus), np.std(expl_bonus))

            # Reward Calculations #
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n
            env_reward = (re_n + self.exploit_rew_shift) * self.exploit_rew_scale

            # Update Critics And Exploration Model #
            expl_model_loss = self.exploration_model.update(next_ob_no)
            exploration_critic_loss = self.exploration_critic.update(
                ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
            exploitation_critic_loss = self.exploitation_critic.update(
                ob_no, ac_na, next_ob_no, env_reward, terminal_n)

            # Target Networks #
            if self.num_param_updates % self.target_update_freq == 0:
                self.exploitation_critic.update_target_network()
                self.exploration_critic.update_target_network()

            # Logging #
            log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss']
            log['Exploration Model Loss'] = expl_model_loss
            log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        perform_random_action = np.random.random() < self.eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            processed = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(processed)

        next_obs, reward, done, info = self.env.step(action)
        self.last_obs = next_obs.copy()

        if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps):
            self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()
class DQNAgent(object):

    def __init__(self, env, agent_params):
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        pass

    def step_env(self):
        """
        Step the env and store the transition.

        At the end of this block of code, the simulator should have been
        advanced one step, and the replay buffer should contain one more
        transition. Note that self.last_obs must always point to the new
        latest observation.
        """
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)
        eps = self.exploration.value(self.t)

        perform_random_action = np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            action = self.actor.get_action(self.replay_buffer.encode_recent_observation())

        obs, rew, done, info = self.env.step(action)
        self.last_obs = obs

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, rew, done)

        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        if self.replay_buffer.can_sample(self.batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        log = {}
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n)

            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
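# All of the agent variants above expose the same three-call interface (step_env,
# sample, train), which the surrounding training loop drives once per environment
# step. The driver below is an illustrative sketch of that interaction only; the
# actual trainer in the course scaffolding also handles logging, evaluation, and
# video rendering.
def run_training_loop(agent, total_timesteps):
    """Illustrative driver for any of the DQNAgent variants above."""
    for _ in range(total_timesteps):
        agent.step_env()                                    # act + store one transition
        ob, ac, rew, next_ob, done = agent.sample(agent.batch_size)
        agent.train(ob, ac, rew, next_ob, done)             # no-op until learning_starts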