class ExplorationOrExploitationAgent(DQNAgent):
    """Agent that trains an exploration critic and an exploitation critic together.

    The exploration critic (a ``DQNCritic``) is trained on a weighted mix of
    an exploration bonus and the environment reward.  The exploitation critic
    (a ``CQLCritic``) is trained on the shifted and scaled environment reward
    only.  The acting policy starts on the exploration critic and is pointed
    at the exploitation critic once ``self.t`` exceeds
    ``num_exploration_steps``.
    """
    exploration_model: BaseExplorationModel

    def __init__(self, env, agent_params):
        """Create the critics, the exploration model and both policies.

        Args:
            env: Environment; forwarded to the parent constructor and, when
                ``use_cbe`` is set, to ``CountBasedModel``.
            agent_params: Mapping of settings.  In addition to whatever the
                parent constructor reads, this method reads
                ``num_exploration_steps``, ``offline_exploitation``,
                ``use_cbe``, ``cbe_coefficient`` (only when ``use_cbe`` is
                truthy), ``explore_weight_schedule``,
                ``exploit_weight_schedule``, ``exploit_rew_shift``,
                ``exploit_rew_scale`` and ``eps``.
        """
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)
        params = agent_params

        # Overrides whatever buffer the parent constructor installed.
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            100000, 1, float_obs=True)
        self.num_exploration_steps = params['num_exploration_steps']
        self.offline_exploitation = params['offline_exploitation']

        self.exploitation_critic = CQLCritic(params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(params, self.optimizer_spec)

        # The bonus comes from a count-based model or from RND.
        if not params['use_cbe']:
            self.exploration_model = RNDModel(params, self.optimizer_spec)
        else:
            self.exploration_model = CountBasedModel(
                params['cbe_coefficient'], env)

        explore_schedule: Schedule = params['explore_weight_schedule']
        exploit_schedule: Schedule = params['exploit_weight_schedule']
        self.explore_weight_schedule = explore_schedule
        self.exploit_weight_schedule = exploit_schedule

        # Acting follows the exploration critic at first; the evaluation
        # policy always follows the exploitation critic.
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = params['exploit_rew_shift']
        self.exploit_rew_scale = params['exploit_rew_scale']
        self.eps = params['eps']

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Run at most one update of the exploration model and both critics.

        Args:
            ob_no: Batch of observations, forwarded to the critics.
            ac_na: Batch of actions, forwarded to the critics.
            re_n: Batch of environment rewards.
            next_ob_no: Batch of next observations; also the input of the
                exploration model.
            terminal_n: Batch of terminal flags, forwarded to the critics.

        Returns:
            dict: Empty when no update was due on this call, otherwise the
            losses and q-value statistics reported by the critics and the
            exploration model.
        """
        # Past the exploration phase the actor optimizes the extrinsic critic.
        if self.t > self.num_exploration_steps:
            self.actor.set_critic(self.exploitation_critic)

        log = {}
        update_due = (self.t > self.learning_starts
                      and self.t % self.learning_freq == 0
                      and self.replay_buffer.can_sample(self.batch_size))
        if not update_due:
            self.t += 1
            return log

        # Current reward weights from the two schedules.
        step = self.t
        explore_weight = self.explore_weight_schedule.value(step)
        exploit_weight = self.exploit_weight_schedule.value(step)

        # Exploration bonus on s', normalized with its own batch statistics.
        raw_bonus = self.exploration_model.forward_np(next_ob_no)
        expl_bonus = normalize(raw_bonus, raw_bonus.mean(), raw_bonus.std())

        # Reward for the exploration critic: weighted bonus plus weighted
        # environment reward.
        intrinsic_part = explore_weight * expl_bonus
        extrinsic_part = exploit_weight * re_n
        mixed_reward = intrinsic_part + extrinsic_part

        # Reward for the exploitation critic: shift first, then scale.
        shifted_reward = re_n + self.exploit_rew_shift
        env_reward = shifted_reward * self.exploit_rew_scale

        # Exploration model first, then exploration critic, then
        # exploitation critic.
        expl_model_loss = self.exploration_model.update(next_ob_no)
        exploration_stats = self.exploration_critic.update(
            ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
        exploitation_stats = self.exploitation_critic.update(
            ob_no, ac_na, next_ob_no, env_reward, terminal_n)

        # Both target networks are refreshed on the same cadence.
        if self.num_param_updates % self.target_update_freq == 0:
            self.exploration_critic.update_target_network()
            self.exploitation_critic.update_target_network()

        log = {
            'Exploration Critic Loss': exploration_stats['Training Loss'],
            'Exploitation Critic Loss': exploitation_stats['Training Loss'],
            'Exploration Model Loss': expl_model_loss,
            'Exploitation Data q-values': exploitation_stats['Data q-values'],
            'Exploitation OOD q-values': exploitation_stats['OOD q-values'],
            'Exploitation CQL Loss': exploitation_stats['CQL Loss'],
        }

        self.num_param_updates += 1
        self.t += 1
        return log

    def step_env(self):
        """Advance the environment by one step and optionally record it.

        The transition is written to the replay buffer unless
        ``offline_exploitation`` is set and the exploration phase is over.
        ``self.last_obs`` ends up holding the newest observation (the reset
        observation when the episode finished).
        """
        # Recording stops after the exploration phase in offline mode.
        record = (not self.offline_exploitation) or (
            self.t <= self.num_exploration_steps)

        if record:
            self.replay_buffer_idx = self.replay_buffer.store_frame(
                self.last_obs)

        # Random action with probability eps, and always before learning
        # starts.
        act_randomly = (np.random.random() < self.eps
                        or self.t < self.learning_starts)

        if not act_randomly:
            # NOTE(review): when nothing was stored above, this presumably
            # returns the last frame that was stored - confirm in the buffer.
            recent = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(recent)
        else:
            action = self.env.action_space.sample()

        next_obs, reward, done, _ = self.env.step(int(action))
        self.last_obs = next_obs.copy()

        if record:
            self.replay_buffer.store_effect(self.replay_buffer_idx, action,
                                            reward, done)

        if done:
            self.last_obs = self.env.reset()
Ejemplo n.º 2
0
class DQNAgent(object):
    """DQN agent that owns the environment, a critic, a policy and a replay buffer."""

    def __init__(self, env, agent_params):
        """Set up the agent from ``agent_params``.

        Args:
            env: Environment; it is reset once here to obtain the first
                observation.
            agent_params: Mapping of settings.  Keys read here are
                ``batch_size``, ``ac_dim``, ``learning_starts``,
                ``learning_freq``, ``target_update_freq``,
                ``exploration_schedule``, ``optimizer_spec``, ``policy``,
                ``topk_policy`` (only when ``policy`` contains ``'topk'``),
                ``env_name``, ``replay_buffer_size`` and
                ``frame_history_len``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        # A policy name containing 'topk' selects the top-k policy; anything
        # else selects the arg-max policy.
        if 'topk' in agent_params['policy']:
            self.actor = TopkPolicy(self.critic, agent_params['topk_policy'])
        else:
            self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; transitions are stored by ``step_env`` instead."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest observation ("frame") and build the policy input
        # from the buffer.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)
        last_obs = self.replay_buffer.encode_recent_observation()

        eps = self.exploration.value(self.t)

        # Epsilon-greedy: act randomly with probability eps, and always
        # before learning starts.
        perform_random_action = (np.random.random() <
                                 eps) or self.t < self.learning_starts
        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            action = self.actor.get_action(last_obs)

        obs, reward, done, info = self.env.step(action)
        self.last_obs = obs

        # Attach the outcome to the frame stored above.
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # A finished episode restarts from a fresh observation.
        if done:
            start_obs = self.env.reset()
            self.last_obs = start_obs

    def sample(self, batch_size):
        """Return ``batch_size`` transitions, or five empty lists if unavailable.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or
            ``([], [], [], [], [])`` when the buffer reports that it cannot
            provide ``batch_size`` transitions.
        """
        # Gate on the size actually requested rather than on
        # self.batch_size, so the check and the draw always agree.
        if self.replay_buffer.can_sample(batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Run at most one critic update and advance the step counter.

        Args:
            ob_no: Batch of observations.
            ac_na: Batch of actions.
            re_n: Batch of rewards.
            next_ob_no: Batch of next observations.
            terminal_n: Batch of terminal flags.

        Returns:
            The value returned by ``critic.update`` when an update ran,
            otherwise an empty dict.
        """
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                     terminal_n)

            # Refresh the target network every target_update_freq updates.
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
Ejemplo n.º 3
0
class ExplorationOrExploitationAgent(DQNAgent):
    """Agent that trains an exploration critic and an exploitation critic together.

    The exploration bonus is either the RND model output or, when
    ``l2_info`` is set, a sum of squared distances between the batch's next
    observations and its observations.
    """

    # Added to the bonus standard deviation so that a constant bonus batch
    # does not cause a division by zero.
    _NORMALIZATION_EPS = 1e-8

    def __init__(self, env, agent_params):
        """Create the critics, the exploration model and both policies.

        Args:
            env: Environment, forwarded to the parent constructor.
            agent_params: Mapping of settings.  In addition to whatever the
                parent constructor reads, this method reads
                ``num_exploration_steps``, ``offline_exploitation``,
                ``explore_weight_schedule``, ``exploit_weight_schedule``,
                ``exploit_rew_shift``, ``exploit_rew_scale``, ``eps`` and
                ``l2_info``.
        """
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)

        # Overrides whatever buffer the parent constructor installed.
        self.replay_buffer = MemoryOptimizedReplayBuffer(100000,
                                                         1,
                                                         float_obs=True)
        self.num_exploration_steps = agent_params['num_exploration_steps']
        self.offline_exploitation = agent_params['offline_exploitation']

        self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec)
        self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec)

        self.exploration_model = RNDModel(agent_params, self.optimizer_spec)
        self.explore_weight_schedule = agent_params['explore_weight_schedule']
        self.exploit_weight_schedule = agent_params['exploit_weight_schedule']

        # Acting follows the exploration critic at first; the evaluation
        # policy always follows the exploitation critic.
        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = agent_params['exploit_rew_shift']
        self.exploit_rew_scale = agent_params['exploit_rew_scale']
        self.eps = agent_params['eps']
        self.l2_info = agent_params['l2_info']

    def dist2(self, x, c):
        """
        dist2  Calculates squared distance between two sets of points.

        Description
        D = DIST2(X, C) takes two matrices of vectors and calculates the
        squared Euclidean distance between them.  Both matrices must be of
        the same column dimension.  If X has M rows and N columns, and C has
        L rows and N columns, then the result has M rows and L columns.  The
        I, Jth entry is the  squared distance from the Ith row of X to the
        Jth row of C.

        Adapted from code by Christopher M Bishop and Ian T Nabney.
        """
        # Unpacking the shapes also rejects inputs that are not 2-D.
        ndata, _ = x.shape
        ncenters, _ = c.shape
        # ||x_i||^2 + ||c_j||^2 - 2 <x_i, c_j>
        return (np.ones((ncenters, 1)) * np.sum((x**2).T, axis=0)).T + \
                np.ones((   ndata, 1)) * np.sum((c**2).T, axis=0)    - \
                2 * np.inner(x, c)

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Run at most one update of the exploration model and both critics.

        Args:
            ob_no: Batch of observations.
            ac_na: Batch of actions, forwarded to the critics.
            re_n: Batch of environment rewards.
            next_ob_no: Batch of next observations; also the input of the
                exploration model.
            terminal_n: Batch of terminal flags, forwarded to the critics.

        Returns:
            dict: Empty when no update was due on this call, otherwise the
            losses and q-value statistics reported by the critics and the
            exploration model.
        """
        log = {}

        # Past the exploration phase the actor optimizes the extrinsic critic.
        if self.t > self.num_exploration_steps:
            self.actor.set_critic(self.exploitation_critic)

        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Current reward weights from the two schedules.
            explore_weight = self.explore_weight_schedule.value(self.t)
            exploit_weight = self.exploit_weight_schedule.value(self.t)

            # Exploration bonus on s'.
            if self.l2_info:
                # Sum, for each next observation, of its squared distances
                # to every observation in the batch.
                dist = self.dist2(next_ob_no, ob_no)
                expl_bonus = np.sum(dist, axis=1)
            else:
                expl_bonus = self.exploration_model.forward_np(next_ob_no)

            # Normalize with the batch statistics.  The epsilon keeps the
            # result finite when every bonus in the batch is identical
            # (standard deviation 0), which would otherwise feed NaNs into
            # the exploration critic.
            bonus_std = np.std(expl_bonus)
            expl_bonus = (expl_bonus - np.mean(expl_bonus)) / (
                bonus_std + self._NORMALIZATION_EPS)

            # Reward for the exploration critic.
            mixed_reward = explore_weight * expl_bonus + exploit_weight * re_n

            # Reward for the exploitation critic: shift first, then scale.
            env_reward = (re_n +
                          self.exploit_rew_shift) * self.exploit_rew_scale

            # Exploration model first, then exploration critic, then
            # exploitation critic.
            expl_model_loss = self.exploration_model.update(
                ptu.from_numpy(next_ob_no))
            exploration_critic_loss = self.exploration_critic.update(
                ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
            exploitation_critic_loss = self.exploitation_critic.update(
                ob_no, ac_na, next_ob_no, env_reward, terminal_n)

            # Both target networks are refreshed on the same cadence.
            if self.num_param_updates % self.target_update_freq == 0:
                self.exploration_critic.update_target_network()
                self.exploitation_critic.update_target_network()

            log['Exploration Critic Loss'] = exploration_critic_loss[
                'Training Loss']
            log['Exploitation Critic Loss'] = exploitation_critic_loss[
                'Training Loss']
            log['Exploration Model Loss'] = expl_model_loss

            log['Exploitation Data q-values'] = exploitation_critic_loss[
                'Data q-values']
            log['Exploitation OOD q-values'] = exploitation_critic_loss[
                'OOD q-values']
            log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss']

            self.num_param_updates += 1

        self.t += 1
        return log

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        # In offline-exploitation mode nothing is recorded once the
        # exploration phase is over.
        if (not self.offline_exploitation) or (self.t <=
                                               self.num_exploration_steps):
            self.replay_buffer_idx = self.replay_buffer.store_frame(
                self.last_obs)

        # Random action with probability eps, and always before learning
        # starts.
        perform_random_action = np.random.random(
        ) < self.eps or self.t < self.learning_starts

        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            processed = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(processed)

        next_obs, reward, done, info = self.env.step(action)
        self.last_obs = next_obs.copy()

        if (not self.offline_exploitation) or (self.t <=
                                               self.num_exploration_steps):
            self.replay_buffer.store_effect(self.replay_buffer_idx, action,
                                            reward, done)

        # A finished episode restarts from a fresh observation.
        if done:
            self.last_obs = self.env.reset()
Ejemplo n.º 4
0
class DQNAgent(object):
    """DQN agent that owns the environment, a critic, an arg-max policy and a replay buffer."""

    def __init__(self, env, agent_params):
        """Set up the agent from ``agent_params``.

        Args:
            env: Environment; it is reset once here to obtain the first
                observation.
            agent_params: Mapping of settings.  Keys read here are
                ``batch_size``, ``ac_dim``, ``learning_starts``,
                ``learning_freq``, ``target_update_freq``,
                ``exploration_schedule``, ``optimizer_spec``, ``env_name``,
                ``replay_buffer_size`` and ``frame_history_len``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; transitions are stored by ``step_env`` instead."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest observation ("frame") in the replay buffer.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # Epsilon-greedy: act randomly with probability eps, and always
        # before learning starts (no trained policy exists yet).
        perform_random_action = np.random.random(
        ) < eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            # The policy input is built by the buffer from the frame that
            # was just stored.
            frames = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(frames)

        self.last_obs, reward, done, info = self.env.step(action)

        # Attach the outcome to the frame stored above.
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # A finished episode restarts from a fresh observation.
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        """Return ``batch_size`` transitions, or five empty lists if unavailable.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or
            ``([], [], [], [], [])`` when the buffer reports that it cannot
            provide ``batch_size`` transitions.
        """
        # Gate on the size actually requested rather than on
        # self.batch_size, so the check and the draw always agree.
        if self.replay_buffer.can_sample(batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Run at most one critic update and advance the step counter.

        Args:
            ob_no: Batch of observations.
            ac_na: Batch of actions.
            re_n: Batch of rewards.
            next_ob_no: Batch of next observations.
            terminal_n: Batch of terminal flags.

        Returns:
            The value returned by ``critic.update`` when an update ran,
            otherwise an empty dict.
        """
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                     terminal_n)

            # Refresh the target network every target_update_freq updates.
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
Ejemplo n.º 5
0
class DQNAgent(object):
    """DQN agent that stores observations channel-first in its replay buffer."""

    def __init__(self, env, agent_params):
        """Set up the agent from ``agent_params``.

        Args:
            env: Environment; it is reset once here to obtain the first
                observation.
            agent_params: Mapping of settings.  Keys read here are
                ``batch_size``, ``ac_dim``, ``learning_starts``,
                ``learning_freq``, ``target_update_freq``,
                ``exploration_schedule``, ``optimizer_spec``, ``device``,
                ``env_name``, ``replay_buffer_size`` and
                ``frame_history_len``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic, device=agent_params['device'])

        lander = agent_params['env_name'] == 'LunarLander-v2'
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; transitions are stored by ``step_env`` instead."""
        pass

    def step_env(self):
        """
            Step the env and store the transition

            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.

            Note that self.last_obs must always point to the new latest observation.
        """
        # Move the last axis of the observation to the front before storing
        # it (assumes a 3-D, channels-last observation - TODO confirm).
        frame = np.transpose(self.last_obs, (2, 0, 1))
        self.replay_buffer_idx = self.replay_buffer.store_frame(frame)

        eps = self.exploration.value(self.t)
        # Epsilon-greedy: act randomly with probability eps, and always
        # before learning starts.
        perform_random_action = np.random.random(
        ) < eps or self.t < self.learning_starts

        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            # The raw observation cannot be fed to the network directly; the
            # buffer builds the policy input from the frame stored above.
            enc_last_obs = self.replay_buffer.encode_recent_observation()
            # Add a leading batch axis of size one for the policy.
            enc_last_obs = enc_last_obs[None, :]

            action = self.actor.get_action(enc_last_obs.astype(np.float32))
            # Drop the batch axis again.
            action = action[0]

        self.last_obs, reward, done, info = self.env.step(action)

        # Attach the outcome to the frame stored above.
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # A finished episode restarts from the reset observation.  Keeping
        # that observation is essential: otherwise the next stored frame
        # would be the terminal observation of the finished episode.
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        """Return ``batch_size`` transitions, or five empty lists if unavailable.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or
            ``([], [], [], [], [])`` when the buffer reports that it cannot
            provide ``batch_size`` transitions.
        """
        # Gate on the size actually requested rather than on
        # self.batch_size, so the check and the draw always agree.
        if self.replay_buffer.can_sample(batch_size):
            return self.replay_buffer.sample(batch_size)
        else:
            return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
            Here, you should train the DQN agent.
            This consists of training the critic, as well as periodically updating the target network.

            Returns 0.0 when no update was due on this call, otherwise
            whatever ``critic.update`` returns.
        """

        loss = 0.0
        if (self.t > self.learning_starts and \
                self.t % self.learning_freq == 0 and \
                self.replay_buffer.can_sample(self.batch_size)):

            # Keyword arguments for the critic update.  Observations are
            # cast to float32 and actions to 64-bit integers; np.int64 is
            # used because the np.long alias is missing from NumPy 1.24-1.26.
            feed_dict = {
                'lr': self.optimizer_spec.lr_schedule.value(self.t),
                'ob_no': ob_no.astype(np.float32),
                'act_t_ph': ac_na.astype(np.int64),
                're_n': re_n,
                'next_ob_no': next_ob_no.astype(np.float32),
                'terminal_n': terminal_n,
            }

            loss = self.critic.update(**feed_dict)

            # Refresh the target network every target_update_freq updates.
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return loss
Ejemplo n.º 6
0
class ExplorationOrExploitationAgent(DQNAgent):
    """Agent with an RND-driven exploration critic and a CQL exploitation critic.

    The acting policy follows the exploration critic until ``self.t``
    exceeds ``num_exploration_steps`` and the exploitation critic from then
    on; the evaluation policy always follows the exploitation critic.
    """

    def __init__(self, env, agent_params):
        """Create the critics, the RND model and both policies.

        Args:
            env: Environment, forwarded to the parent constructor.
            agent_params: Mapping of settings.  In addition to whatever the
                parent constructor reads, this method reads
                ``num_exploration_steps``, ``offline_exploitation``,
                ``explore_weight_schedule``, ``exploit_weight_schedule``,
                ``exploit_rew_shift``, ``exploit_rew_scale`` and ``eps``.
        """
        super(ExplorationOrExploitationAgent, self).__init__(env, agent_params)
        settings = agent_params
        optimizer_spec = self.optimizer_spec

        # Overrides whatever buffer the parent constructor installed.
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            100000,
            1,
            float_obs=True,
        )
        self.num_exploration_steps = settings['num_exploration_steps']
        self.offline_exploitation = settings['offline_exploitation']

        self.exploitation_critic = CQLCritic(settings, optimizer_spec)
        self.exploration_critic = DQNCritic(settings, optimizer_spec)

        self.exploration_model = RNDModel(settings, optimizer_spec)
        self.explore_weight_schedule = settings['explore_weight_schedule']
        self.exploit_weight_schedule = settings['exploit_weight_schedule']

        self.actor = ArgMaxPolicy(self.exploration_critic)
        self.eval_policy = ArgMaxPolicy(self.exploitation_critic)
        self.exploit_rew_shift = settings['exploit_rew_shift']
        self.exploit_rew_scale = settings['exploit_rew_scale']
        self.eps = settings['eps']

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Run at most one update of the RND model and both critics.

        Args:
            ob_no: Batch of observations, forwarded to the critics.
            ac_na: Batch of actions, forwarded to the critics.
            re_n: Batch of environment rewards.
            next_ob_no: Batch of next observations; also the input of the
                exploration model.
            terminal_n: Batch of terminal flags, forwarded to the critics.

        Returns:
            dict: Empty when no update was due on this call, otherwise the
            losses and q-value statistics reported by the critics and the
            exploration model.
        """
        # Past the exploration phase the actor optimizes the extrinsic critic.
        exploration_finished = self.t > self.num_exploration_steps
        if exploration_finished:
            self.actor.set_critic(self.exploitation_critic)

        log = {}
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):
            # Current reward weights from the two schedules.
            w_explore = self.explore_weight_schedule.value(self.t)
            w_exploit = self.exploit_weight_schedule.value(self.t)

            # Exploration bonus on s', normalized with its batch statistics.
            bonus = self.exploration_model.forward_np(next_ob_no)
            bonus = normalize(bonus, np.mean(bonus), np.std(bonus))

            # Exploration critic reward: weighted bonus plus weighted
            # environment reward.
            mixed_reward = w_explore * bonus + w_exploit * re_n
            # Exploitation critic reward: shift first, then scale.
            shifted = re_n + self.exploit_rew_shift
            env_reward = shifted * self.exploit_rew_scale

            # Exploration model first, then exploration critic, then
            # exploitation critic.
            model_loss = self.exploration_model.update(next_ob_no)
            explore_out = self.exploration_critic.update(
                ob_no, ac_na, next_ob_no, mixed_reward, terminal_n)
            exploit_out = self.exploitation_critic.update(
                ob_no, ac_na, next_ob_no, env_reward, terminal_n)

            # Both target networks are refreshed on the same cadence.
            if self.num_param_updates % self.target_update_freq == 0:
                self.exploitation_critic.update_target_network()
                self.exploration_critic.update_target_network()

            log.update({
                'Exploration Critic Loss': explore_out['Training Loss'],
                'Exploitation Critic Loss': exploit_out['Training Loss'],
                'Exploration Model Loss': model_loss,
                'Exploitation Data q-values': exploit_out['Data q-values'],
                'Exploitation OOD q-values': exploit_out['OOD q-values'],
                'Exploitation CQL Loss': exploit_out['CQL Loss'],
            })

            self.num_param_updates += 1

        self.t += 1
        return log

    def step_env(self):
        """Advance the environment by one step and optionally record it.

        The transition is written to the replay buffer unless
        ``offline_exploitation`` is set and the exploration phase is over.
        ``self.last_obs`` ends up holding the newest observation (the reset
        observation when the episode finished).
        """
        # Recording stops after the exploration phase in offline mode.
        keep_transition = (not self.offline_exploitation) or (
            self.t <= self.num_exploration_steps)

        if keep_transition:
            self.replay_buffer_idx = self.replay_buffer.store_frame(
                self.last_obs)

        # Random action with probability eps, and always before learning
        # starts.
        go_random = (np.random.random() < self.eps
                     or self.t < self.learning_starts)

        if go_random:
            action = self.env.action_space.sample()
        else:
            policy_input = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(policy_input)

        step_result = self.env.step(action)
        next_obs, reward, done, _ = step_result
        self.last_obs = next_obs.copy()

        if keep_transition:
            self.replay_buffer.store_effect(
                self.replay_buffer_idx, action, reward, done)

        if done:
            self.last_obs = self.env.reset()
Ejemplo n.º 7
0
class DQNAgent:
    """Q-learning agent that collects transitions and trains a critic.

    The agent holds the environment, a ``MemoryOptimizedReplayBuffer``, a
    ``DQNCritic`` and an ``ArgMaxPolicy`` actor built on that critic.
    ``step_env`` gathers one transition per call; ``train`` performs at most
    one critic update per call.
    """

    def __init__(self, env, agent_params):
        """Set up the agent.

        Args:
            env: Environment providing ``reset()``, ``step(action)`` and
                ``action_space.sample()``. It is reset once here.
            agent_params: Mapping supplying the keys read below:
                ``batch_size``, ``ac_dim``, ``learning_starts``,
                ``learning_freq``, ``target_update_freq``,
                ``exploration_schedule``, ``optimizer_spec``, ``env_name``,
                ``replay_buffer_size`` and ``frame_history_len``. The whole
                mapping is also forwarded to ``DQNCritic``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        # The buffer is told whether the environment is a LunarLander variant.
        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(agent_params['replay_buffer_size'],
                                                         agent_params['frame_history_len'],
                                                         lander=lander)
        self.t = 0  # number of train() calls so far
        self.num_param_updates = 0  # number of critic updates so far

    def add_to_replay_buffer(self, paths):
        """Do nothing; this agent fills its buffer inside ``step_env``."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # A random action is taken with probability eps, and always while
        # fewer than learning_starts training steps have elapsed.
        perform_random_action = np.random.random() < eps or self.t < self.learning_starts
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            action = self.actor.get_action(self.replay_buffer.encode_recent_observation())

        obs, rew, done, info = self.env.step(action)
        self.last_obs = obs

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, rew, done)

        # Begin a new episode when the current one ends.
        if done:
            self.last_obs = self.env.reset()

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or five
            empty lists when the buffer cannot yet supply that many.
        """
        # Gate on the size actually requested, not on self.batch_size, so a
        # larger request falls back to the empty batch instead of failing
        # inside the buffer.
        if not self.replay_buffer.can_sample(batch_size):
            return [], [], [], [], []
        return self.replay_buffer.sample(batch_size)

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Run at most one critic update and advance the step counter.

        An update happens only when ``self.t`` exceeds ``learning_starts``,
        ``self.t`` is a multiple of ``learning_freq`` and the replay buffer
        can supply ``self.batch_size`` transitions.

        Args:
            ob_no, ac_na, re_n, next_ob_no, terminal_n: Batch passed through
                unchanged to ``critic.update``.

        Returns:
            The log returned by ``critic.update``, or an empty dict when no
            update was performed.
        """
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            log = self.critic.update(ob_no, ac_na, re_n, next_ob_no, terminal_n)

            # Refresh the target network every target_update_freq updates.
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log