Example #1
 def linesearch(self, x, fullstep, expected_improve_rate):
   """
   Returns the parameter vector given by a linesearch
   """
   accept_ratio = .1
   max_backtracks = 10
   fval = self.surrogate_loss(x)
   # Backtracking: try step fractions 1, 1/2, 1/4, ... of the full step
   for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
     print("Search number {}...".format(_n_backtracks + 1))
     xnew = x.data.cpu().numpy() + stepfrac * fullstep
     newfval = self.surrogate_loss(Variable(torch.from_numpy(xnew)))
     actual_improve = fval - newfval
     expected_improve = expected_improve_rate * stepfrac
     ratio = actual_improve / expected_improve
     # Accept the first step whose actual improvement is positive and at least
     # accept_ratio times the expected (first-order) improvement
     if ratio > accept_ratio and actual_improve > 0:
       return Variable(torch.from_numpy(xnew))
   # No acceptable step was found; keep the current parameter vector
   return x
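
For context, the same backtracking scheme can be written as a standalone function on plain numpy vectors. The sketch below is illustrative only: `backtracking_linesearch`, the toy loss `f`, and the hand-picked step are assumptions for the demo, not part of the class above.

import numpy as np

def backtracking_linesearch(f, x, fullstep, expected_improve_rate,
                            max_backtracks=10, accept_ratio=0.1):
    """Backtracking line search on a plain numpy vector for any scalar loss f."""
    fval = f(x)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        actual_improve = fval - f(xnew)
        expected_improve = expected_improve_rate * stepfrac
        # Same acceptance rule as above: real improvement, and at least
        # accept_ratio of the improvement predicted by the linear model
        if actual_improve > 0 and actual_improve / expected_improve > accept_ratio:
            return xnew
    return x

# Toy usage: minimize f(x) = ||x||^2 by stepping along the negative gradient
f = lambda x: float(np.sum(x ** 2))
x0 = np.array([1.0, -2.0])
fullstep = -2.0 * x0                      # negative gradient of f at x0
rate = np.dot(2.0 * x0, 2.0 * x0)         # g . g, the expected improvement per unit step
print(backtracking_linesearch(f, x0, fullstep, rate))  # accepts stepfrac = 0.5 -> [0. 0.]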
Example #2
 def sample_action_from_policy(self, observation):
     """
     Given an observation, return the action sampled from the policy model
     as well as the probabilities associated with each action
     """
     observation_tensor = Tensor(observation).unsqueeze(0)
     probabilities = self.policy_model(
         Variable(observation_tensor, requires_grad=True))
     action = probabilities.multinomial(1)
     return action, probabilities
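
A minimal standalone sketch of the same sampling step with current PyTorch (plain tensors instead of the deprecated `Variable`); the two-layer `policy_model` below is a made-up stand-in, since the real model is not shown in these examples.

import torch
import torch.nn as nn

# Hypothetical policy network: 4 observation features, 2 discrete actions
policy_model = nn.Sequential(
    nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2), nn.Softmax(dim=1))

def sample_action(observation):
    """Return a sampled action index and the per-action probabilities."""
    obs = torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)  # shape (1, obs_dim)
    probabilities = policy_model(obs)               # shape (1, num_actions)
    action = torch.multinomial(probabilities, 1)    # sample one action index per row
    return action, probabilities

action, probs = sample_action([0.1, -0.2, 0.05, 0.0])
print(action.item(), probs)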
Example #3
 def mean_kl_divergence(self, model):
   """
   Returns an estimate of the average KL divergence between a given model and self.policy_model
   """
   observations_tensor = torch.cat(
       [Variable(Tensor(observation)).unsqueeze(0) for observation in self.observations])
    # Detach so no gradient flows through the candidate model; the small epsilon
    # keeps the division and the log numerically stable
    actprob = model(observations_tensor).detach() + 1e-8
   old_actprob = self.policy_model(observations_tensor)
   return torch.sum(old_actprob * torch.log(old_actprob / actprob), 1).mean()
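
The quantity computed above is the batch mean of KL(old || new) between categorical action distributions. A plain numpy sketch of the same formula, with made-up probabilities:

import numpy as np

def mean_categorical_kl(old_probs, new_probs, eps=1e-8):
    """Mean KL(old || new) over a batch of categorical distributions.

    Both arrays have shape (batch, num_actions) with rows summing to 1.
    """
    new_probs = new_probs + eps  # same epsilon trick as above, avoids division by zero
    return np.sum(old_probs * np.log(old_probs / new_probs), axis=1).mean()

old = np.array([[0.7, 0.3], [0.5, 0.5]])
new = np.array([[0.6, 0.4], [0.5, 0.5]])
print(mean_categorical_kl(old, new))  # small positive value; zero when the rows match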
Example #4
 def surrogate_loss(self, theta):
   """
   Returns the surrogate loss w.r.t. the given parameter vector theta
   """
   new_model = copy.deepcopy(self.policy_model)
   vector_to_parameters(theta, new_model.parameters())
   observations_tensor = torch.cat(
       [Variable(Tensor(observation)).unsqueeze(0) for observation in self.observations])
    # Probability of each taken action under the candidate and current policies
    prob_new = new_model(observations_tensor).gather(
        1, torch.cat(self.actions)).data
    prob_old = self.policy_model(observations_tensor).gather(
        1, torch.cat(self.actions)).data + 1e-8
    # Importance-sampling ratio weighted by the advantage, negated so that
    # lower surrogate loss means a better policy
    return -torch.mean((prob_new / prob_old) * self.advantage)
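
A toy sketch of the same objective with modern PyTorch tensors and made-up numbers, showing how `gather` selects the probability of each taken action before forming the importance ratio:

import torch

new_probs = torch.tensor([[0.6, 0.4], [0.3, 0.7]])  # action probabilities under candidate theta
old_probs = torch.tensor([[0.5, 0.5], [0.4, 0.6]])  # action probabilities under the current policy
actions   = torch.tensor([[0], [1]])                # actions actually taken, shape (batch, 1)
advantage = torch.tensor([[1.2], [-0.3]])           # estimated advantages, shape (batch, 1)

prob_new = new_probs.gather(1, actions)             # probability of the taken action, new policy
prob_old = old_probs.gather(1, actions) + 1e-8      # same under the old policy, with epsilon
surrogate_loss = -torch.mean((prob_new / prob_old) * advantage)
print(surrogate_loss)  # this is the quantity the linesearch above tries to decrease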
Example #5
 def conjugate_gradient(self, b):
     """
     Returns F^(-1)b where F is the Hessian of the KL divergence
     """
     p = b.clone().data                        # search direction
     r = b.clone().data                        # residual (x starts at zero, so r = b)
     x = np.zeros_like(b.data.cpu().numpy())   # running solution estimate
     rdotr = r.double().dot(r.double())
     for i in range(self.cg_iters):
         z = self.hessian_vector_product(Variable(p)).squeeze(0)
         v = rdotr / p.double().dot(z.double())
         x += v * p.cpu().numpy()
         r -= v * z
         newrdotr = r.double().dot(r.double())
         mu = newrdotr / rdotr
         p = r + mu * p
         rdotr = newrdotr
         if rdotr < self.residual_tol:
             break
     return x
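
The loop above is the standard conjugate gradient method, with the Fisher-vector product playing the role of the matrix. A self-contained numpy version that solves A x = b for an explicit symmetric positive-definite matrix (purely illustrative, with made-up values):

import numpy as np

def conjugate_gradient(matvec, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b given only a function that computes A @ v."""
    x = np.zeros_like(b)
    r = b.copy()                      # residual b - A x (x starts at zero)
    p = b.copy()                      # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = matvec(p)
        v = rdotr / p.dot(z)          # step size along p
        x += v * p
        r -= v * z
        newrdotr = r.dot(r)
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A @ v, b)
print(x, A @ x)  # A @ x should be close to b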
Example #6
    def step(self):
        """
    Executes an iteration of TRPO
    """
        # Generate rollout
        all_observations, all_discounted_rewards, total_reward, all_actions, all_action_dists, self.entropy = self.sample_trajectories(
        )

        # Number of minibatches, rounding up when the rollout does not divide evenly
        num_batches = len(all_actions) // self.batch_size
        if len(all_actions) % self.batch_size != 0:
            num_batches += 1
        for batch_num in range(num_batches):
            print("Processing batch number {}".format(batch_num + 1))
            batch_start = batch_num * self.batch_size
            batch_end = (batch_num + 1) * self.batch_size
            self.observations = all_observations[batch_start:batch_end]
            self.discounted_rewards = all_discounted_rewards[batch_start:batch_end]
            self.actions = all_actions[batch_start:batch_end]
            self.action_dists = all_action_dists[batch_start:batch_end]

            # Calculate the advantage of each step by taking the actual discounted rewards seen
            # and subtracting the estimated value of each state
            baseline = self.value_function_model.predict(
                self.observations).data
            discounted_rewards_tensor = Tensor(
                self.discounted_rewards).unsqueeze(1)
            advantage = discounted_rewards_tensor - baseline

            # Normalize the advantage
            self.advantage = (advantage -
                              advantage.mean()) / (advantage.std() + 1e-8)

            # Calculate the surrogate loss as the elementwise product of the advantage and the probability ratio of actions taken
            new_p = torch.cat(self.action_dists).gather(
                1, torch.cat(self.actions))
            old_p = new_p.detach() + 1e-8
            prob_ratio = new_p / old_p
            surrogate_loss = -torch.mean(prob_ratio * Variable(
                self.advantage)) - (self.ent_coeff * self.entropy)

            # Calculate the gradient of the surrogate loss
            self.policy_model.zero_grad()
            surrogate_loss.backward(retain_graph=True)
            policy_gradient = parameters_to_vector(
                [v.grad for v in self.policy_model.parameters()]).squeeze(0)

            if policy_gradient.nonzero().size()[0]:
                # Use conjugate gradient algorithm to determine the step direction in theta space
                step_direction = self.conjugate_gradient(-policy_gradient)
                step_direction_variable = Variable(
                    torch.from_numpy(step_direction))

                # Do line search to determine the stepsize of theta in the direction of step_direction
                shs = .5 * step_direction.dot(
                    self.hessian_vector_product(
                        step_direction_variable).cpu().numpy().T)
                lm = np.sqrt(shs / self.max_kl)
                fullstep = step_direction / lm
                gdotstepdir = -policy_gradient.dot(
                    step_direction_variable).data[0]
                theta = self.linesearch(
                    parameters_to_vector(self.policy_model.parameters()),
                    fullstep, gdotstepdir / lm)

                # Fit the estimated value function to the actual observed discounted rewards
                ev_before = math_utils.explained_variance_1d(
                    baseline.squeeze(1).cpu().numpy(), self.discounted_rewards)
                self.value_function_model.zero_grad()
                value_fn_params = parameters_to_vector(
                    self.value_function_model.parameters())
                self.value_function_model.fit(
                    self.observations,
                    Variable(Tensor(self.discounted_rewards)))
                ev_after = math_utils.explained_variance_1d(
                    self.value_function_model.predict(
                        self.observations).data.squeeze(1).cpu().numpy(),
                    self.discounted_rewards)
                if ev_after < ev_before or np.abs(ev_after) < 1e-4:
                    vector_to_parameters(
                        value_fn_params,
                        self.value_function_model.parameters())

                # Update parameters of policy model
                old_model = copy.deepcopy(self.policy_model)
                old_model.load_state_dict(self.policy_model.state_dict())
                if any(np.isnan(theta.data.cpu().numpy())):
                    print("NaN detected. Skipping update...")
                else:
                    vector_to_parameters(theta, self.policy_model.parameters())

                kl_old_new = self.mean_kl_divergence(old_model)
                diagnostics = collections.OrderedDict([
                    ('Total Reward', total_reward),
                    ('KL Old New', kl_old_new.data[0]),
                    ('Entropy', self.entropy.data[0]),
                    ('EV Before', ev_before), ('EV After', ev_after)
                ])
                for key, value in diagnostics.items():
                    print("{}: {}".format(key, value))

            else:
                print("Policy gradient is 0. Skipping update...")

        return total_reward
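
The step-size arithmetic in `step` (the `shs`, `lm`, and `fullstep` lines) comes from the quadratic approximation of the KL trust-region constraint: the largest step beta * s with 0.5 * (beta*s)^T F (beta*s) <= max_kl has beta = sqrt(max_kl / shs), where shs = 0.5 * s^T F s, which is exactly `fullstep = step_direction / lm` with `lm = sqrt(shs / max_kl)`. A numpy sketch of just that arithmetic, with a made-up Fisher matrix:

import numpy as np

F = np.array([[2.0, 0.0], [0.0, 0.5]])   # made-up Fisher matrix (symmetric positive-definite)
step_direction = np.array([1.0, 1.0])    # made-up conjugate-gradient direction
max_kl = 0.01

shs = 0.5 * step_direction.dot(F @ step_direction)  # quadratic KL estimate for a unit-scale step
lm = np.sqrt(shs / max_kl)                           # scale factor
fullstep = step_direction / lm                       # step whose estimated KL equals max_kl

print(0.5 * fullstep.dot(F @ fullstep))  # ~= max_kl by construction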