Python RNDModel.predictor Examples

Programming Language: Python

Namespace/Package Name: model

Class/Type: RNDModel

Method/Function: predictor

Examples at hotexamples.com: 3

Python RNDModel.predictor - 3 examples found. These are the top rated real world Python examples of model.RNDModel.predictor extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

RNDModel(6)

to(4)

predictor(3)

target(3)

Example #1

Show file

File: agents.py Project: zhangchuheng123/Reinforcement-Implementation

class RNDAgent(object):
    def __init__(self,
                 input_size,
                 output_size,
                 num_env,
                 num_step,
                 gamma,
                 lam=0.95,
                 learning_rate=1e-4,
                 ent_coef=0.01,
                 clip_grad_norm=0.5,
                 epoch=3,
                 batch_size=128,
                 ppo_eps=0.1,
                 update_proportion=0.25,
                 use_gae=True,
                 use_cuda=False,
                 use_noisy_net=False):
        self.model = CnnActorCriticNetwork(input_size, output_size,
                                           use_noisy_net)
        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        self.num_step = num_step
        self.gamma = gamma
        self.lam = lam
        self.epoch = epoch
        self.batch_size = batch_size
        self.use_gae = use_gae
        self.ent_coef = ent_coef
        self.ppo_eps = ppo_eps
        self.clip_grad_norm = clip_grad_norm
        self.update_proportion = update_proportion
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        self.rnd = RNDModel(input_size, output_size)
        self.optimizer = optim.Adam(list(self.model.parameters()) +
                                    list(self.rnd.predictor.parameters()),
                                    lr=learning_rate)
        self.rnd = self.rnd.to(self.device)

        self.model = self.model.to(self.device)

    def get_action(self, state):
        state = torch.Tensor(state).to(self.device)
        state = state.float()
        policy, value_ext, value_int = self.model(state)
        action_prob = F.softmax(policy, dim=-1).data.cpu().numpy()

        action = self.random_choice_prob_index(action_prob)

        return action, value_ext.data.cpu().numpy().squeeze(
        ), value_int.data.cpu().numpy().squeeze(), policy.detach()

    @staticmethod
    def random_choice_prob_index(p, axis=1):
        r = np.expand_dims(np.random.rand(p.shape[1 - axis]), axis=axis)
        return (p.cumsum(axis=axis) > r).argmax(axis=axis)

    def compute_intrinsic_reward(self, next_obs):
        next_obs = torch.FloatTensor(next_obs).to(self.device)

        target_next_feature = self.rnd.target(next_obs)
        predict_next_feature = self.rnd.predictor(next_obs)
        intrinsic_reward = (target_next_feature -
                            predict_next_feature).pow(2).sum(1) / 2

        return intrinsic_reward.data.cpu().numpy()

    def train_model(self, s_batch, target_ext_batch, target_int_batch, y_batch,
                    adv_batch, next_obs_batch, old_policy):
        s_batch = torch.FloatTensor(s_batch).to(self.device)
        target_ext_batch = torch.FloatTensor(target_ext_batch).to(self.device)
        target_int_batch = torch.FloatTensor(target_int_batch).to(self.device)
        y_batch = torch.LongTensor(y_batch).to(self.device)
        adv_batch = torch.FloatTensor(adv_batch).to(self.device)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device)

        sample_range = np.arange(len(s_batch))
        forward_mse = nn.MSELoss(reduction='none')

        with torch.no_grad():
            policy_old_list = torch.stack(old_policy).permute(
                1, 0, 2).contiguous().view(-1,
                                           self.output_size).to(self.device)

            m_old = Categorical(F.softmax(policy_old_list, dim=-1))
            log_prob_old = m_old.log_prob(y_batch)
            # ------------------------------------------------------------

        for i in range(self.epoch):
            np.random.shuffle(sample_range)
            for j in range(int(len(s_batch) / self.batch_size)):
                sample_idx = sample_range[self.batch_size * j:self.batch_size *
                                          (j + 1)]

                # --------------------------------------------------------------------------------
                # for Curiosity-driven(Random Network Distillation)
                predict_next_state_feature, target_next_state_feature = self.rnd(
                    next_obs_batch[sample_idx])

                forward_loss = forward_mse(
                    predict_next_state_feature,
                    target_next_state_feature.detach()).mean(-1)
                # Proportion of exp used for predictor update
                mask = torch.rand(len(forward_loss)).to(self.device)
                mask = (mask < self.update_proportion).type(
                    torch.FloatTensor).to(self.device)
                forward_loss = (forward_loss * mask).sum() / torch.max(
                    mask.sum(),
                    torch.Tensor([1]).to(self.device))
                # ---------------------------------------------------------------------------------

                policy, value_ext, value_int = self.model(s_batch[sample_idx])
                m = Categorical(F.softmax(policy, dim=-1))
                log_prob = m.log_prob(y_batch[sample_idx])

                ratio = torch.exp(log_prob - log_prob_old[sample_idx])

                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(ratio, 1.0 - self.ppo_eps,
                                    1.0 + self.ppo_eps) * adv_batch[sample_idx]

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_ext_loss = F.mse_loss(value_ext.sum(1),
                                             target_ext_batch[sample_idx])
                critic_int_loss = F.mse_loss(value_int.sum(1),
                                             target_int_batch[sample_idx])

                critic_loss = critic_ext_loss + critic_int_loss

                entropy = m.entropy().mean()

                self.optimizer.zero_grad()
                loss = actor_loss + 0.5 * critic_loss - self.ent_coef * entropy + forward_loss
                loss.backward()
                global_grad_norm_(
                    list(self.model.parameters()) +
                    list(self.rnd.predictor.parameters()))
                self.optimizer.step()

Example #2

Show file

class RNDAgent(object):
    def __init__(self,
                 input_size,
                 output_size,
                 num_env,
                 num_step,
                 gamma,
                 lam=0.95,
                 learning_rate=1e-4,
                 ent_coef=0.01,
                 clip_grad_norm=0.5,
                 epoch=3,
                 batch_size=128,
                 ppo_eps=0.1,
                 update_proportion=0.25,
                 use_gae=True,
                 use_cuda=False):

        # Build the PPO Model
        self.model = PPOModel(input_size, output_size)

        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        self.num_step = num_step
        self.gamma = gamma
        self.lam = lam
        self.epoch = epoch
        self.batch_size = batch_size
        self.use_gae = use_gae
        self.ent_coef = ent_coef
        self.ppo_eps = ppo_eps
        self.clip_grad_norm = clip_grad_norm
        self.update_proportion = update_proportion

        self.device = torch.device('cuda' if use_cuda else 'cpu')
        print("DEVICE: ", self.device)

        # Build the RND model
        self.rnd = RNDModel(input_size, output_size)

        # Define the optimizer (Adam)
        self.optimizer = optim.Adam(list(self.model.parameters()) +
                                    list(self.rnd.predictor.parameters()),
                                    lr=learning_rate)

        # CPU/GPU
        self.rnd = self.rnd.to(self.device)

        self.model = self.model.to(self.device)

    def get_action(self, state):
        # Transform our state into a float32 tensor
        state = torch.Tensor(state).to(self.device)
        state = state.float()

        # Get the action dist, ext_value, int_value
        policy, value_ext, value_int = self.model(state)

        # Get action probability distribution
        action_prob = F.softmax(policy, dim=-1).data.cpu().numpy()

        # Select action
        action = self.random_choice_prob_index(action_prob)

        return action, value_ext.data.cpu().numpy().squeeze(
        ), value_int.data.cpu().numpy().squeeze(), policy.detach()

    @staticmethod
    def random_choice_prob_index(p, axis=1):
        r = np.expand_dims(np.random.rand(p.shape[1 - axis]), axis=axis)
        return (p.cumsum(axis=axis) > r).argmax(axis=axis)

    # Calculate Intrinsic reward (prediction error)
    def compute_intrinsic_reward(self, next_obs):
        next_obs = torch.FloatTensor(next_obs).to(self.device)

        # Get target feature
        target_next_feature = self.rnd.target(next_obs)

        # Get prediction feature
        predict_next_feature = self.rnd.predictor(next_obs)

        # Calculate intrinsic reward
        intrinsic_reward = (target_next_feature -
                            predict_next_feature).pow(2).sum(1) / 2

        return intrinsic_reward.data.cpu().numpy()

    def train_model(self, s_batch, target_ext_batch, target_int_batch, y_batch,
                    adv_batch, next_obs_batch, old_policy):
        s_batch = torch.FloatTensor(s_batch).to(self.device)
        target_ext_batch = torch.FloatTensor(target_ext_batch).to(self.device)
        target_int_batch = torch.FloatTensor(target_int_batch).to(self.device)
        y_batch = torch.LongTensor(y_batch).to(self.device)
        adv_batch = torch.FloatTensor(adv_batch).to(self.device)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device)

        sample_range = np.arange(len(s_batch))
        forward_mse = nn.MSELoss(reduction='none')

        # Get old policy
        with torch.no_grad():
            policy_old_list = torch.stack(old_policy).permute(
                1, 0, 2).contiguous().view(-1,
                                           self.output_size).to(self.device)

            m_old = Categorical(F.softmax(policy_old_list, dim=-1))
            log_prob_old = m_old.log_prob(y_batch)
            # ------------------------------------------------------------

        for i in range(self.epoch):
            # Here we'll do minibatches of training
            np.random.shuffle(sample_range)
            for j in range(int(len(s_batch) / self.batch_size)):
                sample_idx = sample_range[self.batch_size * j:self.batch_size *
                                          (j + 1)]

                # --------------------------------------------------------------------------------
                # for Curiosity-driven(Random Network Distillation)
                predict_next_state_feature, target_next_state_feature = self.rnd(
                    next_obs_batch[sample_idx])

                forward_loss = forward_mse(
                    predict_next_state_feature,
                    target_next_state_feature.detach()).mean(-1)
                # Proportion of exp used for predictor update
                mask = torch.rand(len(forward_loss)).to(self.device)
                mask = (mask < self.update_proportion).type(
                    torch.FloatTensor).to(self.device)
                forward_loss = (forward_loss * mask).sum() / torch.max(
                    mask.sum(),
                    torch.Tensor([1]).to(self.device))
                # ---------------------------------------------------------------------------------

                policy, value_ext, value_int = self.model(s_batch[sample_idx])
                m = Categorical(F.softmax(policy, dim=-1))
                log_prob = m.log_prob(y_batch[sample_idx])

                ratio = torch.exp(log_prob - log_prob_old[sample_idx])

                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(ratio, 1.0 - self.ppo_eps,
                                    1.0 + self.ppo_eps) * adv_batch[sample_idx]

                # Calculate actor loss
                # - J is equivalent to max J hence -torch
                actor_loss = -torch.min(surr1, surr2).mean()

                # Calculate critic loss
                critic_ext_loss = F.mse_loss(value_ext.sum(1),
                                             target_ext_batch[sample_idx])
                critic_int_loss = F.mse_loss(value_int.sum(1),
                                             target_int_batch[sample_idx])

                # Critic loss = critic E loss + critic I loss
                critic_loss = critic_ext_loss + critic_int_loss

                # Calculate the entropy
                # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
                entropy = m.entropy().mean()

                # Reset the gradients
                self.optimizer.zero_grad()

                # CALCULATE THE LOSS
                # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss + forward_loss
                loss = actor_loss + 0.5 * critic_loss - self.ent_coef * entropy + forward_loss

                # Backpropagation
                loss.backward()
                global_grad_norm_(
                    list(self.model.parameters()) +
                    list(self.rnd.predictor.parameters()))
                self.optimizer.step()

Example #3

Show file

class RNDagent(object):
    def __init__(self,
                 input_size,
                 output_size,
                 seed,
                 num_env,
                 pre_obs_norm_step,
                 num_step,
                 gamma=0.99,
                 gamma_int=0.99,
                 lam=0.95,
                 int_coef=1.,
                 ext_coef=2.,
                 ent_coef=0.001,
                 cliprange=0.1,
                 max_grad_norm=0.0,
                 lr=1e-4,
                 nepochs=4,
                 batch_size=128,
                 update_proportion=0.25,
                 use_gae=True):

        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        self.seed = np.random.seed(seed)

        self.pre_obs_norm_step = pre_obs_norm_step
        self.num_step = num_step
        self.gamma = gamma
        self.gamma_int = gamma_int
        self.lam = lam
        self.nepochs = nepochs
        self.batch_size = batch_size
        self.use_gae = use_gae
        self.int_coef = int_coef
        self.ext_coef = ext_coef
        self.ent_coef = ent_coef
        self.cliprange = cliprange
        self.max_grad_norm = max_grad_norm
        self.update_proportion = update_proportion

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = CnnActorCritic(input_size, output_size,
                                    seed).to(self.device)
        self.rnd = RNDModel(input_size, output_size, seed).to(self.device)
        self.optimizer = optim.Adam(list(self.model.parameters()) +
                                    list(self.rnd.predictor.parameters()),
                                    lr=lr)

        self.rff_int = RewardForwardFilter(gamma)
        #self.rff_rms_int = RunningMeanStd()
        #self.obs_rms = RunningMeanStd(shape=(1,84,84))
        self.rff_rms_int = RunningMeanStd_openAI()
        self.obs_rms = RunningMeanStd_openAI(shape=(1, 84, 84))

        self.rooms = None
        self.n_rooms = []
        self.best_nrooms = -np.inf
        self.scores = []
        self.scores_window = deque(maxlen=100)

        self.stats = defaultdict(float)  # Count episodes and timesteps
        self.stats['epcount'] = 0
        self.stats['tcount'] = 0

    def collect_random_statistics(self, envs):
        """Initializes observation normalization with data from random agent."""
        all_ob = []
        all_ob.append(envs.reset())
        for _ in range(self.pre_obs_norm_step):
            actions = np.random.randint(0,
                                        self.output_size,
                                        size=(self.num_env, ))
            ob, _, _, _ = envs.step(actions)
            all_ob.append(ob)

            if len(all_ob) % (128 * self.num_env) == 0:
                ob_ = np.asarray(all_ob).astype(np.float32).reshape(
                    (-1, *envs.observation_space.shape))
                self.obs_rms.update(ob_[:, -1:, :, :])
                all_ob.clear()

    def act(self, state, action=None, calc_ent=False):
        """Returns dict of trajectory info.
        Shape
        ======
            state (uint8) : (batch_size, framestack=4, 84, 84)
        
        Returns example
            {'a': tensor([10,  5,  1]),
             'ent': None,
             'log_pi_a': tensor([-2.8904, -2.8904, -2.8904], grad_fn=<SqueezeBackward1>),
             'v_ext': tensor([0.0012, 0.0012, 0.0012], grad_fn=<SqueezeBackward0>),
             'v_int': tensor([-0.0013, -0.0013, -0.0013], grad_fn=<SqueezeBackward0>)}
        """
        #state = torch.FloatTensor(state / 255).to(self.device)
        assert state.dtype == 'uint8'
        state = torch.tensor(state / 255.,
                             dtype=torch.float,
                             device=self.device)
        #state = torch.from_numpy(state /255).float().to(self.device)

        action_probs, value_ext, value_int = self.model(state)
        dist = Categorical(action_probs)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy() if calc_ent else None

        return {
            'a': action,
            'log_pi_a': log_prob,
            'ent': entropy,
            'v_ext': value_ext.squeeze(),
            'v_int': value_int.squeeze()
        }

    def compute_intrinsic_reward(self, next_obs):
        """next_obs is the latest frame and must be normalized by RunningMeanStd(shape=(1, 84, 84))
        Shape
        ======
            next_obs : (batch_size, 1, 84, 84)
        """
        next_obs = torch.tensor(next_obs,
                                dtype=torch.float,
                                device=self.device)
        #next_obs = torch.FloatTensor(next_obs).to(self.device)

        target_next_feature = self.rnd.target(next_obs)
        predict_next_feature = self.rnd.predictor(next_obs)
        intrinsic_reward = (target_next_feature -
                            predict_next_feature).pow(2).mean(
                                1)  ### MSE  --- Issues
        #intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2

        return intrinsic_reward.data.cpu().numpy()

    def step(self, envs):
        """
        """
        # Step 1. n-step rollout
        next_obs_batch, int_reward_batch, state_batch, reward_batch, done_batch, action_batch, values_ext_batch, values_int_batch, log_prob_old_batch = [],[],[],[],[],[],[],[],[]
        epinfos = []

        states = envs.reset()
        for _ in range(self.num_step):

            traj_info = self.act(states)

            log_prob_old = traj_info['log_pi_a'].detach().cpu().numpy()
            actions = traj_info['a'].cpu().numpy()
            value_ext = traj_info['v_ext'].detach().cpu().numpy()
            value_int = traj_info['v_int'].detach().cpu().numpy()

            next_states, rewards, dones, infos = envs.step(actions)

            next_obs = next_states[:, -1:, :, :]
            intrinsic_reward = self.compute_intrinsic_reward(
                ((next_obs - self.obs_rms.mean) /
                 (np.sqrt(self.obs_rms.var))).clip(-5, 5))  #+1e-10

            next_obs_batch.append(next_obs)
            int_reward_batch.append(intrinsic_reward)

            state_batch.append(states)
            reward_batch.append(rewards)
            done_batch.append(dones)
            action_batch.append(actions)
            values_ext_batch.append(value_ext)
            values_int_batch.append(value_int)
            log_prob_old_batch.append(log_prob_old)

            for info in infos:
                if 'episode' in info:
                    epinfos.append(info['episode'])

            states = next_states

        # calculate last next value
        last_traj_info = self.act(states)
        values_ext_batch.append(last_traj_info['v_ext'].detach().cpu().numpy())
        values_int_batch.append(last_traj_info['v_int'].detach().cpu().numpy())

        # convert to numpy array and transpose (num_env, num_step) from (num_step, num_env) for the later calculation
        # For self.update()
        state_batch = np.stack(state_batch).transpose(1, 0, 2, 3, 4).reshape(
            -1, 4, 84, 84)
        next_obs_batch = np.stack(next_obs_batch).transpose(1, 0, 2, 3,
                                                            4).reshape(
                                                                -1, 1, 84, 84)
        action_batch = np.stack(action_batch).transpose().reshape(-1, )
        log_prob_old_batch = np.stack(log_prob_old_batch).transpose().reshape(
            -1, )

        # For get_advantage_and_value_target_from()
        reward_batch = np.stack(reward_batch).transpose()
        done_batch = np.stack(done_batch).transpose()
        values_ext_batch = np.stack(values_ext_batch).transpose()
        values_int_batch = np.stack(values_int_batch).transpose()
        # --------------------------------------------------

        # Step 2. calculate intrinsic reward
        # running estimate of the intrinsic returns
        int_reward_batch = np.stack(int_reward_batch).transpose()
        discounted_reward_per_env = np.array([
            self.rff_int.update(reward_per_step)
            for reward_per_step in int_reward_batch.T[::-1]
        ])
        mean, std, count = np.mean(discounted_reward_per_env), np.std(
            discounted_reward_per_env), len(discounted_reward_per_env)
        self.rff_rms_int.update_from_moments(mean, std**2,
                                             count)  ### THINK ddof !

        # normalize intrinsic reward
        int_reward_batch /= np.sqrt(self.rff_rms_int.var)
        # -------------------------------------------------------------------------------------------

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = get_advantage_and_value_target_from(
            reward_batch, done_batch, values_ext_batch, self.gamma, self.lam,
            self.num_step, self.num_env, self.use_gae)

        # intrinsic reward calculate
        # None Episodic
        int_target, int_adv = get_advantage_and_value_target_from(
            int_reward_batch, np.zeros_like(int_reward_batch),
            values_int_batch, self.gamma_int, self.lam, self.num_step,
            self.num_env, self.use_gae)

        # add ext adv and int adv
        total_advs = self.int_coef * int_adv + self.ext_coef * ext_adv
        # -----------------------------------------------

        # Step 4. update obs normalize param
        self.obs_rms.update(next_obs_batch)
        # -----------------------------------------------

        # Step 5. Train
        loss_infos = self.update(
            state_batch,
            ext_target,
            int_target,
            action_batch,
            total_advs,
            ((next_obs_batch - self.obs_rms.mean) /
             (np.sqrt(self.obs_rms.var))).clip(-5, 5),  #+1e-10
            log_prob_old_batch)
        # -----------------------------------------------

        # Collects info for reporting.
        vals_info = dict(
            advextmean=ext_adv.mean(),
            retextmean=ext_target.mean(),
            advintmean=int_adv.mean(),
            retintmean=int_target.mean(),
            rewintsample=int_reward_batch[1]  # env_number = 1
        )

        # Some reporting logic
        for epinfo in epinfos:
            #if self.testing:
            #    self.I.statlists['eprew_test'].append(epinfo['r'])
            #    self.I.statlists['eplen_test'].append(epinfo['l'])
            #else:
            if "visited_rooms" in epinfo:
                self.n_rooms.append(len(epinfo["visited_rooms"]))

                if self.best_nrooms is None:
                    self.best_nrooms = len(epinfo["visited_rooms"])
                elif len(epinfo["visited_rooms"]) > self.best_nrooms:
                    self.best_nrooms = len(epinfo["visited_rooms"])
                    self.rooms = sorted(list(epinfo["visited_rooms"]))
                #self.rooms += list(epinfo["visited_rooms"])
                #self.rooms = sorted(list(set(self.rooms)))
                #self.I.statlists['eprooms'].append(len(epinfo["visited_rooms"]))
            self.scores.append(epinfo['r'])
            self.scores_window.append(epinfo['r'])
            self.stats['epcount'] += 1
            self.stats['tcount'] += epinfo['l']
            #self.I.statlists['eprew'].append(epinfo['r'])
            #self.I.statlists['eplen'].append(epinfo['l'])
            #self.stats['rewtotal'] += epinfo['r']

        return {'loss': loss_infos, 'vals': vals_info}

    def update(self, s_batch, target_ext_batch, target_int_batch, action_batch,
               adv_batch, next_obs_batch, log_prob_old_batch):
        #s_batch = torch.FloatTensor(s_batch).to(self.device)
        target_ext_batch = torch.FloatTensor(target_ext_batch).to(self.device)
        target_int_batch = torch.FloatTensor(target_int_batch).to(self.device)
        action_batch = torch.LongTensor(action_batch).to(self.device)
        adv_batch = torch.FloatTensor(adv_batch).to(self.device)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device)
        log_prob_old_batch = torch.FloatTensor(log_prob_old_batch).to(
            self.device)

        sample_range = np.arange(len(s_batch))
        forward_mse = nn.MSELoss(reduction='none')

        loss_infos = defaultdict(list)

        for _ in range(self.nepochs):
            np.random.shuffle(sample_range)
            for j in range(int(len(s_batch) / self.batch_size)):
                sample_idx = sample_range[self.batch_size * j:self.batch_size *
                                          (j + 1)]

                # --------------------------------------------------------------------------------
                # for Curiosity-driven(Random Network Distillation)
                predict_next_state_feature, target_next_state_feature = self.rnd(
                    next_obs_batch[sample_idx])
                forward_loss = forward_mse(
                    predict_next_state_feature,
                    target_next_state_feature.detach()).mean(-1)

                # Proportion of exp used for predictor update   ---  cf. cnn_policy_param_matched.py
                mask = torch.rand(len(forward_loss)).to(self.device)
                mask = (mask < self.update_proportion).float().to(self.device)
                forward_loss = (forward_loss * mask).sum() / torch.max(
                    mask.sum(),
                    torch.Tensor([1]).to(self.device))
                # ---------------------------------------------------------------------------------

                traj_info = self.act(s_batch[sample_idx],
                                     action_batch[sample_idx],
                                     calc_ent=True)

                ratio = torch.exp(traj_info['log_pi_a'] -
                                  log_prob_old_batch[sample_idx])

                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(ratio, 1.0 - self.cliprange, 1.0 +
                                    self.cliprange) * adv_batch[sample_idx]

                policy_loss = -torch.min(surr1, surr2).mean()

                critic_ext_loss = F.mse_loss(traj_info['v_ext'],
                                             target_ext_batch[sample_idx])
                critic_int_loss = F.mse_loss(traj_info['v_int'],
                                             target_int_batch[sample_idx])
                value_loss = critic_ext_loss + critic_int_loss

                entropy = traj_info['ent'].mean()

                self.optimizer.zero_grad()
                loss = policy_loss + 0.5 * value_loss - self.ent_coef * entropy + forward_loss
                loss.backward()
                if self.max_grad_norm:
                    nn.utils.clip_grad_norm_(
                        list(self.model.parameters()) +
                        list(self.rnd.predictor.parameters()),
                        self.max_grad_norm)
                self.optimizer.step()

            _data = dict(policy=policy_loss.data.cpu().numpy(),
                         value_ext=critic_ext_loss.data.cpu().numpy(),
                         value_int=critic_int_loss.data.cpu().numpy(),
                         entropy=entropy.data.cpu().numpy(),
                         forward=forward_loss.data.cpu().numpy())
            for k, v in _data.items():
                loss_infos[k].append(v)

        return loss_infos