Example #1
0
class DQNAgent(object):
    """Skeleton DQN agent wiring together env, critic, actor and replay buffer.

    In this variant ``step_env`` and ``train`` raise ``NotImplementedError``;
    only construction, ``add_to_replay_buffer`` (a no-op) and ``sample`` do
    any work.
    """

    def __init__(self, env, agent_params):
        """Initialise the agent from an environment and a parameter dict.

        Args:
            env: Environment; ``reset()`` is called once to obtain the first
                observation.
            agent_params: Mapping providing 'batch_size', 'ac_dim',
                'learning_starts', 'learning_freq', 'target_update_freq',
                'exploration_schedule', 'optimizer_spec', 'env_name',
                'replay_buffer_size' and 'frame_history_len'. It is also
                passed whole to ``DQNCritic``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        # The buffer gets a `lander` flag for any env whose name starts with
        # 'LunarLander'.
        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.

            Raises:
                NotImplementedError: always; not implemented in this variant.
        """
        # Not needed for this homework
        raise NotImplementedError

    ####################################
    ####################################

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or a tuple
            of five empty lists when the buffer cannot serve the request yet.
        """
        # The buffer must be able to serve both the configured training batch
        # (the original gate) and the batch actually requested; checking only
        # self.batch_size let a larger request through unchecked.
        required = max(batch_size, self.batch_size)
        if self.replay_buffer.can_sample(required):
            return self.replay_buffer.sample(batch_size)
        return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Train the agent on one batch.

        Raises:
            NotImplementedError: always; not implemented in this variant.
        """
        # Not needed for this homework
        raise NotImplementedError
    ####################################
    ####################################
class DQNAgent(object):
    """DQN agent with an epsilon-greedy behaviour policy and a replay buffer.

    The greedy policy is ``TopkPolicy`` when 'topk' is found in
    ``agent_params['policy']`` and ``ArgMaxPolicy`` otherwise.
    """

    def __init__(self, env, agent_params):
        """Initialise the agent from an environment and a parameter dict.

        Args:
            env: Environment; ``reset()`` is called once to obtain the first
                observation.
            agent_params: Mapping providing 'batch_size', 'ac_dim',
                'learning_starts', 'learning_freq', 'target_update_freq',
                'exploration_schedule', 'optimizer_spec', 'policy',
                'env_name', 'replay_buffer_size' and 'frame_history_len'
                ('topk_policy' is read only when a top-k policy is chosen).
                It is also passed whole to ``DQNCritic``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        if 'topk' in agent_params['policy']:
            self.actor = TopkPolicy(self.critic, agent_params['topk_policy'])
        else:
            self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest frame; the returned index is needed later to attach
        # the action/reward/done of this step to it.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # Epsilon-greedy: act randomly with probability eps, and always while
        # self.t is below learning_starts.
        perform_random_action = (np.random.random() < eps
                                 or self.t < self.learning_starts)
        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            # Only the greedy branch needs the buffer's encoding of the most
            # recent observation, so it is computed here rather than on every
            # step.
            recent_obs = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(recent_obs)

        obs, reward, done, info = self.env.step(action)

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # On episode end start a fresh episode so that last_obs is always a
        # valid observation to act on.
        self.last_obs = self.env.reset() if done else obs

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or a tuple
            of five empty lists when the buffer cannot serve the request yet.
        """
        # The buffer must be able to serve both the configured training batch
        # (the original gate) and the batch actually requested; checking only
        # self.batch_size let a larger request through unchecked.
        required = max(batch_size, self.batch_size)
        if self.replay_buffer.can_sample(required):
            return self.replay_buffer.sample(batch_size)
        return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Possibly run one critic update and advance the step counter.

        An update runs only when ``self.t`` is past ``learning_starts``, is a
        multiple of ``learning_freq`` and the buffer can supply
        ``self.batch_size`` transitions. ``self.t`` is incremented on every
        call regardless.

        Args:
            ob_no, ac_na, re_n, next_ob_no, terminal_n: Batch forwarded to
                ``critic.update`` (observations, actions, rewards, next
                observations, terminal flags, judging by the names).

        Returns:
            The value returned by ``critic.update``, or ``{}`` when no update
            was performed.
        """
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Note the argument order: next observations precede rewards.
            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                     terminal_n)

            # Refresh the target network every target_update_freq updates
            # (including the very first one, when the counter is still 0).
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
Example #3
0
class DQNAgent(object):
    """DQN agent driven through a session object (``sess.run`` / feed dicts).

    Acts epsilon-greedily, stores transitions in a replay buffer and trains
    the critic by running its ``total_error`` and ``train_fn`` through
    ``sess``.
    """

    # Capacity handed to the replay buffer. This variant does not read
    # agent_params['replay_buffer_size'].
    REPLAY_BUFFER_SIZE = 500000

    def __init__(self, sess, env, agent_params):
        """Initialise the agent.

        Args:
            sess: Session object; stored and passed to the critic and actor,
                and later used via ``sess.run``.
            env: Environment; ``reset()`` is called once to obtain the first
                observation.
            agent_params: Mapping providing 'batch_size', 'ac_dim',
                'learning_starts', 'learning_freq', 'target_update_freq',
                'exploration_schedule', 'optimizer_spec', 'env_name' and
                'frame_history_len'. It is also passed whole to ``DQNCritic``.
        """
        self.env = env
        self.sess = sess
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(sess, agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(sess, self.critic)

        # Only the exact name 'LunarLander-v2' sets the lander flag here.
        lander = agent_params['env_name'] == 'LunarLander-v2'
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            self.REPLAY_BUFFER_SIZE,
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Step the env and store the transition

            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.

            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest frame; the returned index is needed later to attach
        # the action/reward/done of this step to it.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)
        # Always random while self.t is below learning_starts; afterwards
        # random with probability eps. The warm-up check comes first, so no
        # random number is drawn for the decision during warm-up.
        perform_random_action = (self.t < self.learning_starts
                                 or np.random.random() < eps)

        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            # The policy is queried with the buffer's encoding of the latest
            # observation, with a leading axis of length 1 added.
            enc_last_obs = self.replay_buffer.encode_recent_observation()
            enc_last_obs = enc_last_obs[None, :]
            # The policy's result is indexed to pull out the single action.
            action = self.actor.get_action(enc_last_obs)[0]

        obs, reward, done, info = self.env.step(action)

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # On episode end start a fresh episode so that last_obs is always a
        # valid observation to act on.
        self.last_obs = self.env.reset() if done else obs

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or a tuple
            of five empty lists when the buffer cannot serve the request yet.
        """
        # The buffer must be able to serve both the configured training batch
        # (the original gate) and the batch actually requested; checking only
        # self.batch_size let a larger request through unchecked.
        required = max(batch_size, self.batch_size)
        if self.replay_buffer.can_sample(required):
            return self.replay_buffer.sample(batch_size)
        return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
            Here, you should train the DQN agent.
            This consists of training the critic, as well as periodically updating the target network.

            An update runs only when self.t is past learning_starts, is a
            multiple of learning_freq and the buffer can supply
            self.batch_size transitions; self.t is incremented on every call.

            Returns:
                The critic's total_error for this batch, or 0.0 when no
                update was performed.
        """
        loss = 0.0
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Feed the batch plus the scheduled learning rate for this step.
            feed_dict = {
                self.critic.learning_rate:
                    self.optimizer_spec.lr_schedule.value(self.t),
                self.critic.obs_t_ph: ob_no,
                self.critic.act_t_ph: ac_na,
                self.critic.rew_t_ph: re_n,
                self.critic.obs_tp1_ph: next_ob_no,
                self.critic.done_mask_ph: terminal_n,
            }

            # One run evaluates the error and applies the training op.
            tensors_to_run = [self.critic.total_error, self.critic.train_fn]
            loss, _ = self.sess.run(tensors_to_run, feed_dict=feed_dict)

            # Refresh the target network every target_update_freq updates
            # (including the very first one, when the counter is still 0).
            if self.num_param_updates % self.target_update_freq == 0:
                self.sess.run([self.critic.update_target_fn])

            self.num_param_updates += 1

        self.t += 1
        return loss
class DQNAgent(object):
    """DQN agent whose greedy policy is queried with torch tensors.

    Acts epsilon-greedily, stores transitions in a replay buffer, trains the
    critic and periodically copies ``Q_func`` weights into ``target_Q_func``.
    """

    def __init__(self, env, agent_params):
        """Initialise the agent from an environment and a parameter dict.

        Args:
            env: Environment; ``reset()`` is called once to obtain the first
                observation.
            agent_params: Mapping providing 'optimizer_spec', 'batch_size',
                'device', 'ac_dim', 'learning_starts', 'learning_freq',
                'target_update_freq', 'exploration_schedule', 'env_name',
                'replay_buffer_size' and 'frame_history_len'. It is also
                passed whole to ``DQNCritic``.
        """
        # Echo the optimizer spec on construction (kept from the original).
        print(agent_params['optimizer_spec'])

        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.device = agent_params['device']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic, self.device)

        # Only the exact name 'LunarLander-v2' sets the lander flag here.
        lander = agent_params['env_name'] == 'LunarLander-v2'
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Step the env and store the transition

            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.

            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest frame; the returned index is needed later to attach
        # the action/reward/done of this step to it.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)
        # Always random while self.t is below learning_starts; afterwards
        # random with probability eps.
        perform_random_action = (self.t < self.learning_starts
                                 or np.random.random() < eps)

        if perform_random_action:
            action = np.random.randint(self.num_actions)
        else:
            # The policy is queried with the buffer's encoding of the latest
            # observation, given a leading axis of length 1 and moved to the
            # configured device.
            enc_last_obs = self.replay_buffer.encode_recent_observation()
            enc_last_obs = torch.tensor(enc_last_obs[None, :]).to(self.device)
            action = self.actor.get_action(enc_last_obs)

        obs, reward, done, info = self.env.step(action)

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # On episode end start a fresh episode so that last_obs is always a
        # valid observation to act on.
        self.last_obs = self.env.reset() if done else obs

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or a tuple
            of five empty lists when the buffer cannot serve the request yet.
        """
        # The buffer must be able to serve both the configured training batch
        # (the original gate) and the batch actually requested; checking only
        # self.batch_size let a larger request through unchecked.
        required = max(batch_size, self.batch_size)
        if self.replay_buffer.can_sample(required):
            return self.replay_buffer.sample(batch_size)
        return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
            Here, you should train the DQN agent.
            This consists of training the critic, as well as periodically updating the target network.

            An update runs only when self.t is past learning_starts, is a
            multiple of learning_freq and the buffer can supply
            self.batch_size transitions; self.t is incremented on every call.

            Returns:
                The value returned by critic.update, or 0 when no update
                was performed.
        """
        loss = 0
        if (self.t > self.learning_starts
                and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            loss = self.critic.update(ob_no, ac_na, re_n, next_ob_no,
                                      terminal_n)

            # Copy the online network's weights into the target network every
            # target_update_freq updates (including the very first one).
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.target_Q_func.load_state_dict(
                    self.critic.Q_func.state_dict())

            self.num_param_updates += 1

        self.t += 1

        return loss
Example #5
0
class DQNAgent(object):
    """DQN agent with an epsilon-greedy behaviour policy and a replay buffer.

    Random actions are drawn from ``env.action_space.sample()``; greedy
    actions come from an ``ArgMaxPolicy`` built on the critic.
    """

    def __init__(self, env, agent_params):
        """Initialise the agent from an environment and a parameter dict.

        Args:
            env: Environment; ``reset()`` is called once to obtain the first
                observation.
            agent_params: Mapping providing 'batch_size', 'ac_dim',
                'learning_starts', 'learning_freq', 'target_update_freq',
                'exploration_schedule', 'optimizer_spec', 'env_name',
                'replay_buffer_size' and 'frame_history_len'. It is also
                passed whole to ``DQNCritic``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest frame; the returned index is needed later to attach
        # the action/reward/done of this step to it.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # Epsilon-greedy: act randomly with probability eps, and always while
        # self.t is below learning_starts.
        perform_random_action = (np.random.random() < eps
                                 or self.t < self.learning_starts)
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            # The policy is queried with the buffer's encoding of the most
            # recent observation rather than with self.last_obs directly.
            frames = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(frames)

        obs, reward, done, info = self.env.step(action)

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)

        # On episode end start a fresh episode so that last_obs is always a
        # valid observation to act on.
        self.last_obs = self.env.reset() if done else obs

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or a tuple
            of five empty lists when the buffer cannot serve the request yet.
        """
        # The buffer must be able to serve both the configured training batch
        # (the original gate) and the batch actually requested; checking only
        # self.batch_size let a larger request through unchecked.
        required = max(batch_size, self.batch_size)
        if self.replay_buffer.can_sample(required):
            return self.replay_buffer.sample(batch_size)
        return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Possibly run one critic update and advance the step counter.

        An update runs only when ``self.t`` is past ``learning_starts``, is a
        multiple of ``learning_freq`` and the buffer can supply
        ``self.batch_size`` transitions. ``self.t`` is incremented on every
        call regardless.

        Returns:
            The value returned by ``critic.update``, or ``{}`` when no update
            was performed.
        """
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # Note the argument order: next observations precede rewards.
            log = self.critic.update(ob_no, ac_na, next_ob_no, re_n,
                                     terminal_n)

            # Refresh the target network every target_update_freq updates
            # (including the very first one, when the counter is still 0).
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log
Example #6
0
class DQNAgent(object):
    """DQN agent that draws its own training batches and trains via tf.GradientTape.

    Unlike a sampler-driven agent, ``sample`` returns placeholders and
    ``train`` ignores its arguments, pulling a batch straight from the replay
    buffer. Per-episode reward totals are tracked and printed.
    """

    def __init__(self, env, agent_params, **kwargs):
        """Set up env, critic, loss, optimizer, policy and replay buffer.

        Args:
            env: Environment; ``reset()`` is called once here.
            agent_params: Mapping providing 'batch_size', 'ac_dim',
                'learning_starts', 'learning_freq', 'target_update_freq',
                'gamma', 'exploration_schedule', 'optimizer_spec',
                'grad_norm_clipping', 'replay_buffer_size',
                'frame_history_len' and 'obs_dtype'. It is also passed whole
                to ``DQNCritic``.
            **kwargs: Accepted and ignored.
        """
        params = agent_params

        self.env = env
        self.agent_params = params
        self.batch_size = params['batch_size']
        self.last_obs = self.env.reset()

        # Running reward of the current episode and totals of finished ones.
        self.total_episode_reward = 0.0
        self.total_episodes = []
        self.episode_num = 0

        self.num_actions = params['ac_dim']
        self.learning_starts = params['learning_starts']
        self.learning_freq = params['learning_freq']
        self.target_update_freq = params['target_update_freq']
        self.gamma = params['gamma']

        self.exploration = params['exploration_schedule']
        spec = params['optimizer_spec']
        self.optimizer_spec = spec

        self.critic = DQNCritic(params)
        self.q_t_loss = tf.keras.losses.Huber()
        self.q_t_optimizer = spec.constructor(
            clipnorm=params['grad_norm_clipping'],
            learning_rate=spec.lr_schedule,
            **spec.kwargs)

        self.actor = ArgMaxPolicy(self.critic)

        self.replay_buffer = MemoryOptimizedReplayBuffer(
            params['replay_buffer_size'],
            params['frame_history_len'],
            obs_dtype=params['obs_dtype'])
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Advance the environment by one step and record the transition.

            An action is chosen epsilon-greedily (epsilon comes from calling
            the exploration schedule with self.t), the env is stepped, the
            transition is handed to the replay buffer, and on episode end the
            env is reset and the episode's total reward is printed and logged.

            self.last_obs always holds the newest observation afterwards.
        """
        epsilon = self.exploration(self.t)

        if random.random() < epsilon:
            # Explore: uniform random action index.
            action = random.randrange(self.num_actions)
        else:
            # Exploit: the buffer encodes the current observation, a leading
            # axis is added, and the policy's answer is reduced to a Python
            # scalar.
            encoded = self.replay_buffer.encode_next_frame_observation(self.last_obs)
            policy_input = encoded[np.newaxis, ...]
            action = self.actor.get_action(policy_input).numpy().item()

        # Keep the observation the action was taken from; it is what gets
        # stored alongside the action's outcome.
        acted_on = self.last_obs
        step_result = self.env.step(action)
        self.last_obs, reward, env_done = step_result[0], step_result[1], step_result[2]
        self.total_episode_reward += reward

        self.replay_buffer.store_step(acted_on, action, reward, env_done)

        if not env_done:
            return

        # Episode finished: report it, start a new one, log the total.
        self.episode_num += 1
        print('Total episode {}: {}'.format(self.episode_num, self.total_episode_reward))
        self.last_obs = self.env.reset()
        self.total_episodes.append(self.total_episode_reward)
        self.total_episode_reward = 0.0

    def sample(self, batch_size):
        """Return five ``None`` placeholders; batches are drawn inside ``train``."""
        return (None,) * 5

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """
            Possibly run one gradient step on the critic and advance self.t.

            The five arguments are not used; a batch of self.batch_size is
            sampled from the replay buffer instead. A step runs only when
            self.t is past learning_starts, is a multiple of learning_freq
            and the buffer can supply a batch. The target model's weights are
            overwritten with the online model's whenever the update count
            reaches a multiple of target_update_freq.

            Returns:
                The scalar loss of this step, or 0.0 when no step was taken.
        """
        should_update = (
            self.t > self.learning_starts
            and self.t % self.learning_freq == 0
            and self.replay_buffer.can_sample(self.batch_size)
        )
        if not should_update:
            self.t += 1
            return 0.0

        obs, acts, rews, next_obs, dones = (
            tf.convert_to_tensor(arr)
            for arr in self.replay_buffer.sample(self.batch_size))
        target_next_q = self.critic.q_t_target(next_obs)

        with tf.GradientTape() as tape:
            # Double-Q picks the next action with the online model; otherwise
            # the target model picks it. Either way the target model's
            # values are the ones bootstrapped from.
            if self.critic.double_q:
                online_next_q = self.critic.q_t_model(next_obs)
                greedy_next = tf.argmax(online_next_q, axis=1)
            else:
                greedy_next = tf.argmax(target_next_q, axis=1)
            next_mask = tf.one_hot(greedy_next, depth=self.num_actions)
            bootstrap = tf.reduce_sum(target_next_q * next_mask, axis=1)
            # Terminal transitions (dones == 1) contribute reward only.
            td_target = rews + self.gamma * bootstrap * (1.0 - dones)
            td_target = tf.stop_gradient(td_target)

            online_q = self.critic.q_t_model(obs)
            taken_mask = tf.one_hot(acts, depth=self.num_actions)
            predicted = tf.reduce_sum(online_q * taken_mask, axis=1)
            loss_value = self.q_t_loss(td_target, predicted)

        weights = self.critic.q_t_model.trainable_variables
        gradients = tape.gradient(loss_value, weights)
        self.q_t_optimizer.apply_gradients(zip(gradients, weights))
        self.num_param_updates += 1
        loss = loss_value.numpy().item()

        if self.num_param_updates % self.target_update_freq == 0:
            self.critic.q_t_target.set_weights(self.critic.q_t_model.get_weights())

        self.t += 1
        return loss
Example #7
0
class DQNAgent(object):
    """DQN agent with an epsilon-greedy behaviour policy and a replay buffer.

    Random actions are drawn from ``env.action_space.sample()``; greedy
    actions come from an ``ArgMaxPolicy`` built on the critic.
    """

    def __init__(self, env, agent_params):
        """Initialise the agent from an environment and a parameter dict.

        Args:
            env: Environment; ``reset()`` is called once to obtain the first
                observation.
            agent_params: Mapping providing 'batch_size', 'ac_dim',
                'learning_starts', 'learning_freq', 'target_update_freq',
                'exploration_schedule', 'optimizer_spec', 'env_name',
                'replay_buffer_size' and 'frame_history_len'. It is also
                passed whole to ``DQNCritic``.
        """
        self.env = env
        self.agent_params = agent_params
        self.batch_size = agent_params['batch_size']
        self.last_obs = self.env.reset()

        self.num_actions = agent_params['ac_dim']
        self.learning_starts = agent_params['learning_starts']
        self.learning_freq = agent_params['learning_freq']
        self.target_update_freq = agent_params['target_update_freq']

        self.replay_buffer_idx = None
        self.exploration = agent_params['exploration_schedule']
        self.optimizer_spec = agent_params['optimizer_spec']

        self.critic = DQNCritic(agent_params, self.optimizer_spec)
        self.actor = ArgMaxPolicy(self.critic)

        lander = agent_params['env_name'].startswith('LunarLander')
        self.replay_buffer = MemoryOptimizedReplayBuffer(
            agent_params['replay_buffer_size'],
            agent_params['frame_history_len'],
            lander=lander)
        self.t = 0
        self.num_param_updates = 0

    def add_to_replay_buffer(self, paths):
        """Do nothing; ``paths`` is ignored by this agent."""
        pass

    def step_env(self):
        """
            Step the env and store the transition
            At the end of this block of code, the simulator should have been
            advanced one step, and the replay buffer should contain one more transition.
            Note that self.last_obs must always point to the new latest observation.
        """
        # Store the latest frame; the returned index is needed later to attach
        # the action/reward/done of this step to it.
        self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs)

        eps = self.exploration.value(self.t)

        # Epsilon-greedy: act randomly with probability eps, and always while
        # self.t is below learning_starts.
        perform_random_action = (np.random.random() < eps
                                 or self.t < self.learning_starts)
        if perform_random_action:
            action = self.env.action_space.sample()
        else:
            # The policy is queried with the buffer's encoding of the most
            # recent observation rather than with self.last_obs directly.
            recent_obs = self.replay_buffer.encode_recent_observation()
            action = self.actor.get_action(recent_obs)

        obs, rew, done, info = self.env.step(action)

        self.replay_buffer.store_effect(self.replay_buffer_idx, action, rew,
                                        done)

        # On episode end start a fresh episode so that last_obs is always a
        # valid observation to act on.
        self.last_obs = self.env.reset() if done else obs

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions from the replay buffer.

        Args:
            batch_size: Number of transitions requested.

        Returns:
            Whatever ``replay_buffer.sample(batch_size)`` returns, or a tuple
            of five empty lists when the buffer cannot serve the request yet.
        """
        # The buffer must be able to serve both the configured training batch
        # (the original gate) and the batch actually requested; checking only
        # self.batch_size let a larger request through unchecked.
        required = max(batch_size, self.batch_size)
        if self.replay_buffer.can_sample(required):
            return self.replay_buffer.sample(batch_size)
        return [], [], [], [], []

    def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
        """Possibly run one critic update and advance the step counter.

        An update runs only when ``self.t`` is past ``learning_starts``, is a
        multiple of ``learning_freq`` and the buffer can supply
        ``self.batch_size`` transitions. ``self.t`` is incremented on every
        call regardless.

        Returns:
            The value returned by ``critic.update``, or ``{}`` when no update
            was performed.
        """
        log = {}
        if (self.t > self.learning_starts and self.t % self.learning_freq == 0
                and self.replay_buffer.can_sample(self.batch_size)):

            # NOTE(review): rewards are passed before next observations here;
            # confirm this matches the parameter order of DQNCritic.update.
            log = self.critic.update(ob_no, ac_na, re_n, next_ob_no,
                                     terminal_n)

            # Refresh the target network every target_update_freq updates
            # (including the very first one, when the counter is still 0).
            if self.num_param_updates % self.target_update_freq == 0:
                self.critic.update_target_network()

            self.num_param_updates += 1

        self.t += 1
        return log