def __init__(self, env, hidden=64):

		super().__init__()

		input_size   = env.observation_space.shape[0]
		output_size  = env.action_space.shape[0]

		self.low = Tensor(env.action_space.low)
		self.high = Tensor(env.action_space.high)

		# Policy (actor) trunk and mean head.
		self.action_1 = nn.Linear(input_size, hidden)
		self.action_2 = nn.Linear(hidden, hidden)
		self.mean = nn.Linear(hidden, output_size)

		# Value (critic) trunk and scalar value head.
		self.value_1 = nn.Linear(input_size, hidden)
		self.value_2 = nn.Linear(hidden, hidden)
		self.value = nn.Linear(hidden, 1)

		for name, para in self.named_parameters():
			if "weight" in name:
				nn.init.kaiming_normal_(para, mode='fan_out', nonlinearity='tanh')
			else:
				para.data.fill_(0)

		# State-independent log standard deviation of the Gaussian policy.
		self.log_std = nn.Parameter(torch.zeros(1, output_size))

		self.policy_params = [self.action_1, self.action_2, self.mean, self.log_std]
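Only the layer definitions are shown above; the matching forward pass is not included. Below is a minimal, self-contained sketch of what it might look like, assuming tanh activations on both trunks and a state-independent log_std for a Gaussian policy. The class name, the explicit sizes, and the omission of any use of self.low/self.high are illustrative assumptions, not part of the original code.

# Hypothetical companion to the __init__ above: tanh hidden layers, a
# Gaussian policy head and a value head. Names and sizes are assumptions.
import torch
import torch.nn as nn


class GaussianActorCriticSketch(nn.Module):
    def __init__(self, input_size=8, output_size=2, hidden=64):
        super().__init__()
        self.action_1 = nn.Linear(input_size, hidden)
        self.action_2 = nn.Linear(hidden, hidden)
        self.mean = nn.Linear(hidden, output_size)
        self.value_1 = nn.Linear(input_size, hidden)
        self.value_2 = nn.Linear(hidden, hidden)
        self.value = nn.Linear(hidden, 1)
        self.log_std = nn.Parameter(torch.zeros(1, output_size))

    def forward(self, x):
        # Policy branch: two tanh layers, then the action mean.
        h = torch.tanh(self.action_2(torch.tanh(self.action_1(x))))
        mean = self.mean(h)
        std = self.log_std.expand_as(mean).exp()
        # Value branch: same shape of trunk, scalar output.
        v = torch.tanh(self.value_2(torch.tanh(self.value_1(x))))
        value = self.value(v)
        return mean, std, value


# Quick shape check.
net = GaussianActorCriticSketch()
mean, std, value = net(torch.randn(4, 8))
assert mean.shape == (4, 2) and std.shape == (4, 2) and value.shape == (4, 1)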
    def _generate_one_episode(self, env, model):
        """
        generate one episode data and save them on memory
        """
        total_reward = 0

        observations, actions, rewards, values = [], [], [], []

        observation = env.reset()

        current_time_step = 0

        while current_time_step <= self.max_episode_time_step:

            observations.append(observation)

            with torch.no_grad():
                observation_tensor = Tensor(observation).unsqueeze(0)
                probs, value = model(observation_tensor)

            probs = F.softmax(probs, dim=1)
            act_dis = Categorical(probs)

            try:
                action = act_dis.sample()
            except RuntimeError:
                # Debugging aid: dump the offending probabilities, re-run the
                # model and retry once so `action` is always defined below.
                print(probs)
                probs, value = model(observation_tensor)
                probs = F.softmax(probs, dim=1)
                print(probs)
                action = Categorical(probs).sample()
            action = action.cpu().numpy()
            actions.append(action)

            observation, reward, done, _ = env.step(action[0])

            values.append(value.item())
            rewards.append(reward)
            total_reward += reward
            if done:
                break

            current_time_step += 1

        last_value = 0
        if not done:
            observation_tensor = Tensor(observation).unsqueeze(0)
            with torch.no_grad():
                _, last_value = model(observation_tensor)
            last_value = last_value.item()

        advantages, estimate_returns = self.reward_processor(
            rewards, values, last_value)

        return observations, actions, advantages, estimate_returns, total_reward, current_time_step
    def _generate_one_episode(env, model, horizon, reward_processor):
        """
        generate one episode data and save them on memory
        """
        total_reward = 0

        observations, actions, rewards, values = [], [], [], []

        observation = env.reset()

        current_time_step = 0

        while current_time_step <= horizon:

            observations.append(observation)

            with torch.no_grad():
                observation_tensor = Tensor(observation).unsqueeze(0)
                mean, std, value = model(observation_tensor)

            act_dis = Normal(mean, std)
            action = act_dis.sample()
            action = action.squeeze(0).cpu().numpy()
            actions.append(action)

            observation, reward, done, _ = env.step(action)
            # print(reward)
            values.append(value.item())
            rewards.append(reward)
            total_reward += reward
            if done:
                break

            current_time_step += 1

        last_value = 0
        if not done:
            observation_tensor = Tensor(observation).unsqueeze(0)
            with torch.no_grad():
                _, _, last_value = model(observation_tensor)
            last_value = last_value.item()

        advantages, estimate_returns = reward_processor(
            rewards, values, last_value)

        return (observations, actions, advantages, estimate_returns,
                total_reward, current_time_step)
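Both episode generators above delegate advantage and return computation to a reward_processor that is not shown. The sketch below is one plausible implementation, assuming plain discounted returns bootstrapped with last_value and advantages defined as returns minus the predicted values; the original code may instead use GAE or another estimator.

# Hypothetical reward_processor: discounted returns bootstrapped with the
# value of the last observation, advantages = returns - predicted values.
import numpy as np


def simple_reward_processor(rewards, values, last_value, gamma=0.99):
    returns = []
    running = last_value
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns = np.array(returns[::-1], dtype=np.float32)
    advantages = returns - np.array(values, dtype=np.float32)
    return advantages, returns


# Example: a 3-step episode that was cut off (bootstrap with last_value=1.0).
advs, rets = simple_reward_processor([1.0, 0.0, 1.0], [0.5, 0.5, 0.5], 1.0)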
Example no. 4
    def _optimize(self, obs, acts, advs, est_rs):

        self.optim.zero_grad()

        obs = Tensor(obs)
        acts = Tensor(acts)
        advs = Tensor(advs).unsqueeze(1)
        est_rs = Tensor(est_rs).unsqueeze(1)

        if self.continuous:
            mean, std, values = self.model(obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

        else:

            probs, values = self.model(obs)

            probs = F.softmax(probs, dim=1)

            acts = acts.long()

            dis = Categorical(probs)

            log_prob = dis.log_prob(acts).unsqueeze(1)

            ent = dis.entropy()

        # Normalize the advantage
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        policy_loss = -log_prob * advs
        policy_loss = policy_loss.mean() - self.entropy_para * ent.mean()

        criterion = nn.MSELoss()
        critic_loss = criterion(values, est_rs)

        self.writer.add_scalar("Training/Critic_Loss", critic_loss.item(),
                               self.step_count)
        loss = policy_loss + self.value_loss_coeff * critic_loss

        loss.backward()

        self.optim.step()

        self.step_count += 1
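The loss above relies on log_prob, advs and est_rs all having shape (N, 1); a toy check (tensors made up for illustration, not from the original code) shows why the unsqueeze calls matter.

# Shape sanity check: log_prob and advantages must both be (N, 1), otherwise
# the product broadcasts to (N, N) and silently corrupts the loss.
import torch

N = 5
log_prob = torch.randn(N, 1)
advs = torch.randn(N, 1)
advs = (advs - advs.mean()) / (advs.std() + 1e-8)

policy_loss = (-log_prob * advs).mean()
assert (-log_prob * advs).shape == (N, 1)

# With a missing unsqueeze the broadcast goes wrong:
bad = -log_prob.squeeze(1) * advs      # shape (N, N), not (N, 1)
assert bad.shape == (N, N)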
Example no. 5
    def sample_action_from_policy(self, observation):
        """
        Given an observation, return the action sampled from the policy model
        as well as the probabilities associated with each action.
        """
        observation_tensor = Tensor(observation).unsqueeze(0)
        probabilities = self.policy_model(
            Variable(observation_tensor, requires_grad=True))
        action = probabilities.multinomial(1)
        return action, probabilities
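The snippet above uses the legacy Variable API and Tensor.multinomial. A roughly equivalent sketch with torch.distributions.Categorical is shown below; the stand-in policy network and observation are illustrative assumptions, not the original policy_model.

# Roughly equivalent sampling with the modern API (illustrative only).
import torch
from torch.distributions import Categorical


def sample_action(policy_model, observation):
    obs = torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)
    probabilities = policy_model(obs)   # assumed to return softmax probabilities
    dist = Categorical(probs=probabilities)
    action = dist.sample()              # shape (1,)
    return action, probabilities


# Tiny demo with a stand-in policy network.
policy = torch.nn.Sequential(torch.nn.Linear(4, 2), torch.nn.Softmax(dim=1))
action, probs = sample_action(policy, [0.1, 0.2, 0.3, 0.4])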
Example no. 6
    def mean_kl_divergence(self, model):
        """
        Returns an estimate of the average KL divergence between a given model
        and self.policy_model.
        """
        observations_tensor = torch.cat(
            [Variable(Tensor(observation)).unsqueeze(0)
             for observation in self.observations])
        actprob = model(observations_tensor).detach() + 1e-8
        old_actprob = self.policy_model(observations_tensor)
        return torch.sum(old_actprob * torch.log(old_actprob / actprob), 1).mean()
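The hand-written estimate above is the discrete KL divergence sum(p_old * log(p_old / p_new)), averaged over states. A small numeric cross-check against torch.distributions (with made-up probabilities):

# Numeric cross-check of the discrete KL formula used above.
import torch
from torch.distributions import Categorical, kl_divergence

old_p = torch.tensor([[0.7, 0.2, 0.1], [0.3, 0.3, 0.4]])
new_p = torch.tensor([[0.6, 0.3, 0.1], [0.25, 0.25, 0.5]])

manual = torch.sum(old_p * torch.log(old_p / new_p), 1).mean()
builtin = kl_divergence(Categorical(probs=old_p), Categorical(probs=new_p)).mean()
assert torch.allclose(manual, builtin)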
Example no. 7
    def surrogate_loss(self, theta):
        """
        Returns the surrogate loss w.r.t. the given parameter vector theta.
        """
        new_model = copy.deepcopy(self.policy_model)
        vector_to_parameters(theta, new_model.parameters())
        observations_tensor = torch.cat(
            [Variable(Tensor(observation)).unsqueeze(0)
             for observation in self.observations])
        prob_new = new_model(observations_tensor).gather(
            1, torch.cat(self.actions)).data
        prob_old = self.policy_model(observations_tensor).gather(
            1, torch.cat(self.actions)).data + 1e-8
        return -torch.mean((prob_new / prob_old) * self.advantage)
    def _optimize(self, obs, acts, rews):

        self.optim.zero_grad()

        obs = Tensor(obs)
        acts = Tensor(acts)
        rews = Tensor(rews).unsqueeze(1)

        if self.continuous:
            mean, std = self.model(obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

        else:

            probs = self.model(obs)

            probs = F.softmax(probs, dim=1)
            dis = Categorical(probs)

            acts = acts.long()

            # Keep log_prob as (N, 1) so the product with rews (N, 1) below
            # stays element-wise instead of broadcasting to (N, N).
            log_prob = dis.log_prob(acts.view(-1)).unsqueeze(1)
            ent = dis.entropy()

        rews = (rews - rews.mean()) / (rews.std() + 1e-8)

        actor_loss = -log_prob * rews

        actor_loss = actor_loss.mean() - self.entropy_para * ent.mean()

        actor_loss.backward()

        self.optim.step()
    def _optimize(self, obs, acts, advs, est_rs):

        self.obs, self.acts, self.advs, self.est_rs = obs, acts, advs, est_rs

        self.obs = Tensor(self.obs)
        self.acts = Tensor(self.acts)
        self.advs = Tensor(self.advs).unsqueeze(1)
        self.est_rs = Tensor(self.est_rs).unsqueeze(1)

        # Calculate Advantage & Normalize it
        self.advs = (self.advs - self.advs.mean()) / (self.advs.std() + 1e-8)

        # Surrogate loss with Entropy

        if self.continuous:
            mean, std, values = self.model(self.obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(self.acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

            # At the current parameters probs_new equals probs_old, so the
            # ratio is ~1; detaching the denominator keeps the gradient of the
            # surrogate with respect to the current policy.
            probs_new = torch.exp(log_prob)
            probs_old = probs_new.detach() + 1e-8

        else:

            probs, values = self.model(self.obs)

            dis = F.softmax(probs, dim=1)

            self.acts = self.acts.long()

            probs_new = dis.gather(1, self.acts)
            probs_old = probs_new + 1e-8

            ent = -(dis.log() * dis).sum(-1)

        ratio = probs_new / probs_old

        surrogate_loss = -torch.mean(
            ratio * self.advs) - self.entropy_para * ent.mean()

        # criterion = torch.nn.MSELoss()
        # empty_value_loss = criterion( values, values.detach() )

        # Calculate the gradient of the surrogate loss
        self.model.zero_grad()
        surrogate_loss.backward()
        policy_gradient = parameters_to_vector([
            p.grad for p in self.model.policy_parameters()
        ]).squeeze(0).detach()

        # ensure gradient is not zero
        if policy_gradient.nonzero().size()[0]:
            # Use Conjugate gradient to calculate step direction
            step_direction = self.conjugate_gradient(-policy_gradient)
            # line search for step
            shs = .5 * step_direction.dot(
                self.hessian_vector_product(step_direction))

            lm = torch.sqrt(shs / self.max_kl)
            fullstep = step_direction / lm

            gdotstepdir = -policy_gradient.dot(step_direction)
            theta = self.linesearch(
                parameters_to_vector(self.model.policy_parameters()).detach(),
                fullstep, gdotstepdir / lm)
            # Update parameters of policy model
            old_model = copy.deepcopy(self.model)
            old_model.load_state_dict(self.model.state_dict())

            if any(np.isnan(theta.cpu().detach().numpy())):
                print("NaN detected. Skipping update...")
            else:
                # for param in self.model.policy_parameters():
                #     print(param)
                vector_to_parameters(theta, self.model.policy_parameters())

            kl_old_new = self.mean_kl_divergence(old_model)
            print('KL:{:10} , Entropy:{:10}'.format(kl_old_new.item(),
                                                    ent.mean().item()))

        else:
            print("Policy gradient is 0. Skipping update...")
            print(policy_gradient.shape)

        self.model.zero_grad()

        if self.continuous:
            _, _, values = self.model(self.obs)
        else:
            _, values = self.model(self.obs)

        criterion = torch.nn.MSELoss()
        critic_loss = self.value_loss_coeff * criterion(values, self.est_rs)
        critic_loss.backward()
        self.optim.step()
        print("MSELoss for Value Net:{}".format(critic_loss.item()))
class TRPOAgent(A2CAgent):
    # def __init__(self,args,env_wrapper, continuous):
    def __init__(self, args, model, optim, env, data_generator, memory,
                 continuous):
        """
        Instantiate a TRPO agent
        """
        super(TRPOAgent, self).__init__(args, model, optim, env,
                                        data_generator, memory, continuous)

        self.max_kl = args.max_kl
        self.cg_damping = args.cg_damping
        self.cg_iters = args.cg_iters
        self.residual_tol = args.residual_tol

        self.algo = "trpo"

    def mean_kl_divergence(self, model):
        """
        Returns an estimate of the average KL divergence between a given model
        and self.model.
        """

        # actprob = model(self.observations).detach() + 1e-8
        # old_actprob = self.model(self.observations)

        def normal_distribution_kl_divergence(mean_old, std_old, mean_new,
                                              std_new):
            # KL( N(mean_old, std_old) || N(mean_new, std_new) ) for diagonal
            # Gaussians, summed over action dims and averaged over the batch.
            return torch.mean(
                torch.sum(
                    torch.log(std_new) - torch.log(std_old)
                    + (std_old ** 2 + (mean_old - mean_new) ** 2)
                    / (2.0 * std_new ** 2)
                    - 0.5, 1))

        if self.continuous:
            mean_new, std_new, _ = model(self.obs)
            mean_old, std_old, _ = self.model(self.obs)

            mean_new = mean_new.detach()
            std_new = std_new.detach()

            kl = normal_distribution_kl_divergence(mean_old, std_old, mean_new,
                                                   std_new)

        else:

            probs_new, _ = model(self.obs)
            probs_old, _ = self.model(self.obs)

            probs_new = probs_new.detach()

            probs_new = F.softmax(probs_new, dim=1)
            probs_old = F.softmax(probs_old, dim=1)

            kl = torch.sum(
                probs_old * torch.log(probs_old / (probs_new + 1e-8)),
                1).mean()

        return kl

    def hessian_vector_product(self, vector):
        """
        Returns the product of the Hessian of the KL divergence and the given vector
        """
        self.model.zero_grad()
        mean_kl_div = self.mean_kl_divergence(self.model)

        # mean_kl_div.backward( retain_graph=True, create_graph=True )
        kl_grad_vector = torch.autograd.grad(mean_kl_div,
                                             self.model.policy_parameters(),
                                             create_graph=True)

        kl_grad_vector = torch.cat([grad.view(-1) for grad in kl_grad_vector])
        grad_vector_product = torch.sum(kl_grad_vector * vector)

        second_order_grad = torch.autograd.grad(grad_vector_product,
                                                self.model.policy_parameters())

        fisher_vector_product = torch.cat(
            [grad.contiguous().view(-1) for grad in second_order_grad])

        return fisher_vector_product + self.cg_damping * vector.detach()

    def conjugate_gradient(self, b):
        """
        Approximately solves F x = b with the conjugate gradient method, where
        F is the Fisher matrix (Hessian of the KL divergence) accessed only
        through Hessian-vector products; returns x ~= F^(-1) b.
        """
        p = b.clone()
        r = b.clone()
        x = Tensor_zeros_like(p)
        rdotr = r.double().dot(r.double())

        for _ in range(self.cg_iters):
            z = self.hessian_vector_product(p).squeeze(0)
            v = (rdotr / p.double().dot(z.double())).float()

            x += v * p
            r -= v * z

            newrdotr = r.double().dot(r.double())
            mu = newrdotr / rdotr

            p = r + mu.float() * p
            rdotr = newrdotr
            if rdotr < self.residual_tol:
                break
        return x

    def surrogate_loss(self, theta):
        """
        Returns the surrogate loss w.r.t. the given parameter vector theta
        """
        theta = theta.detach()
        new_model = copy.deepcopy(self.model)
        # for param in new_model.parameters():
        #     print(param)
        vector_to_parameters(theta, new_model.policy_parameters())

        if self.continuous:
            mean_new, std_new, _ = new_model(self.obs)
            mean_old, std_old, _ = self.model(self.obs)

            dis_new = Normal(mean_new, std_new)
            dis_old = Normal(mean_old, std_old)

            log_prob_new = dis_new.log_prob(self.acts).sum(-1, keepdim=True)
            log_prob_old = dis_old.log_prob(self.acts).sum(-1, keepdim=True)

            ratio = torch.exp(log_prob_new - log_prob_old).detach()
        else:

            probs_new, _ = new_model(self.obs)
            probs_old, _ = self.model(self.obs)

            dis_new = F.softmax(probs_new, dim=1)
            dis_old = F.softmax(probs_old, dim=1)

            probs_new = dis_new.gather(1, self.acts).detach()
            probs_old = dis_old.gather(1, self.acts).detach() + 1e-8

            ratio = probs_new / probs_old

        return -torch.mean(ratio * self.advs)

    def linesearch(self, x, fullstep, expected_improve_rate):
        """
        Returns the parameter vector given by a linesearch
        """
        accept_ratio = .1
        max_backtracks = 10
        fval = self.surrogate_loss(x)
        for _n_backtracks, stepfrac in enumerate(.5 ** np.arange(max_backtracks)):
            print("Search number {}...".format(_n_backtracks + 1))
            stepfrac = float(stepfrac)
            xnew = x + stepfrac * fullstep
            newfval = self.surrogate_loss(xnew)
            actual_improve = fval - newfval

            expected_improve = expected_improve_rate * stepfrac

            ratio = actual_improve / expected_improve

            if ratio > accept_ratio and actual_improve > 0:
                return xnew
        return x.detach()

    def _optimize(self, obs, acts, advs, est_rs):

        self.obs, self.acts, self.advs, self.est_rs = obs, acts, advs, est_rs

        self.obs = Tensor(self.obs)
        self.acts = Tensor(self.acts)
        self.advs = Tensor(self.advs).unsqueeze(1)
        self.est_rs = Tensor(self.est_rs).unsqueeze(1)

        # Calculate Advantage & Normalize it
        self.advs = (self.advs - self.advs.mean()) / (self.advs.std() + 1e-8)

        # Surrogate loss with Entropy

        if self.continuous:
            mean, std, values = self.model(self.obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(self.acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

            # At the current parameters probs_new equals probs_old, so the
            # ratio is ~1; detaching the denominator keeps the gradient of the
            # surrogate with respect to the current policy.
            probs_new = torch.exp(log_prob)
            probs_old = probs_new.detach() + 1e-8

        else:

            probs, values = self.model(self.obs)

            dis = F.softmax(probs, dim=1)

            self.acts = self.acts.long()

            probs_new = dis.gather(1, self.acts)
            probs_old = probs_new + 1e-8

            ent = -(dis.log() * dis).sum(-1)

        ratio = probs_new / probs_old

        surrogate_loss = -torch.mean(
            ratio * self.advs) - self.entropy_para * ent.mean()

        # criterion = torch.nn.MSELoss()
        # empty_value_loss = criterion( values, values.detach() )

        # Calculate the gradient of the surrogate loss
        self.model.zero_grad()
        surrogate_loss.backward()
        policy_gradient = parameters_to_vector([
            p.grad for p in self.model.policy_parameters()
        ]).squeeze(0).detach()

        # ensure gradient is not zero
        if policy_gradient.nonzero().size()[0]:
            # Use Conjugate gradient to calculate step direction
            step_direction = self.conjugate_gradient(-policy_gradient)
            # line search for step
            shs = .5 * step_direction.dot(
                self.hessian_vector_product(step_direction))

            lm = torch.sqrt(shs / self.max_kl)
            fullstep = step_direction / lm

            gdotstepdir = -policy_gradient.dot(step_direction)
            theta = self.linesearch(
                parameters_to_vector(self.model.policy_parameters()).detach(),
                fullstep, gdotstepdir / lm)
            # Update parameters of policy model
            old_model = copy.deepcopy(self.model)
            old_model.load_state_dict(self.model.state_dict())

            if any(np.isnan(theta.cpu().detach().numpy())):
                print("NaN detected. Skipping update...")
            else:
                # for param in self.model.policy_parameters():
                #     print(param)
                vector_to_parameters(theta, self.model.policy_parameters())

            kl_old_new = self.mean_kl_divergence(old_model)
            print('KL:{:10} , Entropy:{:10}'.format(kl_old_new.item(),
                                                    ent.mean().item()))

        else:
            print("Policy gradient is 0. Skipping update...")
            print(policy_gradient.shape)

        self.model.zero_grad()

        if self.continuous:
            _, _, values = self.model(self.obs)
        else:
            _, values = self.model(self.obs)

        criterion = torch.nn.MSELoss()
        critic_loss = self.value_loss_coeff * criterion(values, self.est_rs)
        critic_loss.backward()
        self.optim.step()
        print("MSELoss for Value Net:{}".format(critic_loss.item()))
Example no. 11
    def step(self):
        """
    Executes an iteration of TRPO
    """
        # Generate rollout
        (all_observations, all_discounted_rewards, total_reward, all_actions,
         all_action_dists, self.entropy) = self.sample_trajectories()

        num_batches = len(all_actions) // self.batch_size
        if len(all_actions) % self.batch_size != 0:
            num_batches += 1
        for batch_num in range(num_batches):
            print("Processing batch number {}".format(batch_num + 1))
            batch_slice = slice(batch_num * self.batch_size,
                                (batch_num + 1) * self.batch_size)
            self.observations = all_observations[batch_slice]
            self.discounted_rewards = all_discounted_rewards[batch_slice]
            self.actions = all_actions[batch_slice]
            self.action_dists = all_action_dists[batch_slice]

            # Calculate the advantage of each step by taking the actual discounted rewards seen
            # and subtracting the estimated value of each state
            baseline = self.value_function_model.predict(
                self.observations).data
            discounted_rewards_tensor = Tensor(
                self.discounted_rewards).unsqueeze(1)
            advantage = discounted_rewards_tensor - baseline

            # Normalize the advantage
            self.advantage = (advantage -
                              advantage.mean()) / (advantage.std() + 1e-8)

            # Calculate the surrogate loss as the elementwise product of the advantage and the probability ratio of actions taken
            new_p = torch.cat(self.action_dists).gather(
                1, torch.cat(self.actions))
            old_p = new_p.detach() + 1e-8
            prob_ratio = new_p / old_p
            surrogate_loss = -torch.mean(prob_ratio * Variable(
                self.advantage)) - (self.ent_coeff * self.entropy)

            # Calculate the gradient of the surrogate loss
            self.policy_model.zero_grad()
            surrogate_loss.backward(retain_graph=True)
            policy_gradient = parameters_to_vector(
                [v.grad for v in self.policy_model.parameters()]).squeeze(0)

            if policy_gradient.nonzero().size()[0]:
                # Use conjugate gradient algorithm to determine the step direction in theta space
                step_direction = self.conjugate_gradient(-policy_gradient)
                step_direction_variable = Variable(
                    torch.from_numpy(step_direction))

                # Do line search to determine the stepsize of theta in the direction of step_direction
                shs = .5 * step_direction.dot(
                    self.hessian_vector_product(
                        step_direction_variable).cpu().numpy().T)
                lm = np.sqrt(shs / self.max_kl)
                fullstep = step_direction / lm
                gdotstepdir = -policy_gradient.dot(
                    step_direction_variable).item()
                theta = self.linesearch(
                    parameters_to_vector(self.policy_model.parameters()),
                    fullstep, gdotstepdir / lm)

                # Fit the estimated value function to the actual observed discounted rewards
                ev_before = math_utils.explained_variance_1d(
                    baseline.squeeze(1).cpu().numpy(), self.discounted_rewards)
                self.value_function_model.zero_grad()
                value_fn_params = parameters_to_vector(
                    self.value_function_model.parameters())
                self.value_function_model.fit(
                    self.observations,
                    Variable(Tensor(self.discounted_rewards)))
                ev_after = math_utils.explained_variance_1d(
                    self.value_function_model.predict(
                        self.observations).data.squeeze(1).cpu().numpy(),
                    self.discounted_rewards)
                if ev_after < ev_before or np.abs(ev_after) < 1e-4:
                    vector_to_parameters(
                        value_fn_params,
                        self.value_function_model.parameters())

                # Update parameters of policy model
                old_model = copy.deepcopy(self.policy_model)
                old_model.load_state_dict(self.policy_model.state_dict())
                if any(np.isnan(theta.data.cpu().numpy())):
                    print("NaN detected. Skipping update...")
                else:
                    vector_to_parameters(theta, self.policy_model.parameters())

                kl_old_new = self.mean_kl_divergence(old_model)
                diagnostics = collections.OrderedDict([
                    ('Total Reward', total_reward),
                    ('KL Old New', kl_old_new.item()),
                    ('Entropy', self.entropy.item()),
                    ('EV Before', ev_before), ('EV After', ev_after)
                ])
                for key, value in diagnostics.items():
                    print("{}: {}".format(key, value))

            else:
                print("Policy gradient is 0. Skipping update...")

        return total_reward
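math_utils.explained_variance_1d is referenced but not shown above. The conventional definition, assumed here, is 1 - Var(y - y_pred) / Var(y); a minimal sketch:

# Assumed behaviour of explained_variance_1d: 1.0 means the value function
# explains the discounted rewards perfectly, <= 0 means it is uninformative.
import numpy as np


def explained_variance_1d(y_pred, y):
    assert y.ndim == 1 and y_pred.ndim == 1
    var_y = np.var(y)
    if var_y == 0:
        return np.nan
    return 1.0 - np.var(y - y_pred) / var_y


y = np.array([1.0, 2.0, 3.0, 4.0])
print(explained_variance_1d(y + 0.1, y))      # close to 1.0
print(explained_variance_1d(np.zeros(4), y))  # much lower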
Example no. 12
    def _optimize(self, obs, acts, advs, est_rs):

        self.optim.zero_grad()
        
        obs = Tensor(obs)
        acts = Tensor(acts)
        advs = Tensor(advs).unsqueeze(1)
        est_rs = Tensor(est_rs).unsqueeze(1)

        if self.continuous:
            mean, std, values = self.model(obs)
            with torch.no_grad():
                mean_old, std_old, _ = self.model_old(obs)

            dis = Normal(mean, std)
            dis_old = Normal(mean_old, std_old)

            log_prob = dis.log_prob(acts).sum(-1, keepdim=True)
            log_prob_old = dis_old.log_prob(acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

            ratio = torch.exp(log_prob - log_prob_old)

        else:

            probs, values = self.model(obs)
            with torch.no_grad():
                probs_old, _ = self.model_old(obs)

            dis = F.softmax(probs, dim=1)
            dis_old = F.softmax(probs_old, dim=1)

            acts = acts.long()

            probs = dis.gather(1, acts)
            probs_old = dis_old.gather(1, acts)

            ratio = probs / (probs_old + 1e-8)

            ent = -(dis.log() * dis).sum(-1)


        # Normalize the advantage
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        surrogate_loss_pre_clip = ratio * advs
        surrogate_loss_clip = torch.clamp(
            ratio, 1.0 - self.clip_para, 1.0 + self.clip_para) * advs

        surrogate_loss = -torch.mean(
            torch.min(surrogate_loss_clip, surrogate_loss_pre_clip))

        policy_loss = surrogate_loss - self.entropy_para * ent.mean()

        criterion = nn.MSELoss()
        critic_loss = criterion(values, est_rs)

        self.writer.add_scalar("Training/Critic_Loss", critic_loss.item(),
                               self.step_count)
        loss = policy_loss + self.value_loss_coeff * critic_loss

        loss.backward()

        self.optim.step()

        self.step_count += 1
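The objective above is the standard PPO clipped surrogate. A tiny numeric illustration (made-up ratios and advantages) of how the clamp caps the incentive once the ratio leaves [1 - clip_para, 1 + clip_para]:

# Toy illustration of the PPO clipping used above (values are made up).
import torch

clip_para = 0.2
ratio = torch.tensor([0.5, 0.9, 1.0, 1.1, 1.5])
advs = torch.tensor([1.0, 1.0, 1.0, -1.0, 1.0])

unclipped = ratio * advs
clipped = torch.clamp(ratio, 1.0 - clip_para, 1.0 + clip_para) * advs
objective = torch.min(clipped, unclipped)
# For ratio=1.5, adv=+1 the objective is capped at 1.2; for ratio=0.5, adv=+1
# the min keeps the pessimistic unclipped 0.5 term.
surrogate_loss = -objective.mean()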