Example #1
 def linesearch(self, x, fullstep, expected_improve_rate):
     """
     Return the parameter vector found by a backtracking line search on the
     surrogate loss, falling back to x if no candidate step is accepted.
     """
     accept_ratio = .1
     max_backtracks = 10
     fval = self.surrogate_loss(x)
     for (_n_backtracks,
          stepfrac) in enumerate(.5**np.arange(max_backtracks)):
         print("Search number {}...".format(_n_backtracks + 1))
         xnew = x.data.cpu().numpy() + stepfrac * fullstep
         newfval = self.surrogate_loss(Variable(torch.from_numpy(xnew)))
         actual_improve = fval - newfval
         expected_improve = expected_improve_rate * stepfrac
         ratio = actual_improve / expected_improve
         if ratio > accept_ratio and actual_improve > 0:
             return Variable(torch.from_numpy(xnew))
     return x
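The acceptance test above compares the actual improvement in the surrogate loss against the improvement predicted by the local linear model. The standalone sketch below exercises the same backtracking rule on a toy quadratic objective; toy_loss, x0, and the step values are illustrative stand-ins, not part of the code above.

import numpy as np

def toy_loss(x):
    # Stand-in objective; the real code evaluates the surrogate loss.
    return float(np.sum(x ** 2))

def backtracking_linesearch(x0, fullstep, expected_improve_rate,
                            accept_ratio=0.1, max_backtracks=10):
    fval = toy_loss(x0)
    for stepfrac in 0.5 ** np.arange(max_backtracks):
        xnew = x0 + stepfrac * fullstep
        actual_improve = fval - toy_loss(xnew)
        expected_improve = expected_improve_rate * stepfrac
        # Accept the step once the realized improvement is a large enough
        # fraction of the predicted improvement.
        if actual_improve / expected_improve > accept_ratio and actual_improve > 0:
            return xnew
    return x0

x0 = np.array([1.0, -2.0])
step = -x0  # descend toward the minimum at the origin
print(backtracking_linesearch(x0, step, expected_improve_rate=1.0))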
Example #2
    def sample_trajectories(self):
        """
        Collect rollouts for self.episodes episodes and return the flattened
        observations, discounted rewards, average total reward, actions,
        action distributions, and mean policy entropy.
        """
        paths = []
        episodes_so_far = 0
        entropy = 0

        while episodes_so_far < self.episodes:
            episodes_so_far += 1
            observations, actions, rewards, action_distributions = [], [], [], []
            observation, opponent_obs = self.env.reset()
            while True:
                observations.append(observation)

                action, action_dist = self.sample_action_from_policy(
                    observation)
                actions.append(action)
                action_distributions.append(action_dist)
                entropy += -(action_dist * action_dist.log()).sum()

                agent_action = Variable(action).data.cpu().numpy()[0][0]
                opponent_action = self.opponent.get_action(opponent_obs)

                (observation,
                 opponent_obs), (reward,
                                 opponent_rew), done, _ = self.env.step(
                                     (agent_action, opponent_action))
                rewards.append(reward)

                if done:
                    path = {
                        "observations": observations,
                        "actions": actions,
                        "rewards": rewards,
                        "action_distributions": action_distributions
                    }
                    paths.append(path)
                    break

        def flatten(l):
            return [item for sublist in l for item in sublist]

        observations = flatten([path["observations"] for path in paths])
        discounted_rewards = flatten([
            math_utils.discount(path["rewards"], self.gamma) for path in paths
        ])
        total_reward = sum(flatten([path["rewards"]
                                    for path in paths])) / self.episodes
        actions = flatten([path["actions"] for path in paths])
        action_dists = flatten(
            [path["action_distributions"] for path in paths])
        entropy = entropy / len(actions)

        return observations, np.asarray(
            discounted_rewards), total_reward, actions, action_dists, entropy
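The rollout above relies on math_utils.discount, which is not shown in this listing. Assuming it computes the standard discounted return G_t = r_t + gamma * G_{t+1}, a minimal sketch would look like this:

import numpy as np

def discount(rewards, gamma):
    # Work backwards so each entry accumulates the discounted future rewards.
    discounted = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted

print(discount([1.0, 1.0, 1.0], gamma=0.9))  # [2.71, 1.9, 1.0]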
Example #3
 def sample_action_from_policy(self, observation):
     """
     Given an observation, return an action sampled from the policy model
     along with the probabilities the policy assigns to each action.
     """
     observation_tensor = Tensor(observation)
     observation_tensor = observation_tensor.unsqueeze(0)
     observation_tensor = self.preprocess(observation_tensor)
     probabilities = self.policy_model(
         Variable(observation_tensor, requires_grad=True))
     action = probabilities.multinomial(1)
     return action, probabilities
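For reference, the sampling step in isolation: multinomial(1) draws one action index in proportion to the probabilities produced by the policy. The probabilities below are toy numbers, not outputs of the policy model above.

import torch

probabilities = torch.tensor([[0.1, 0.7, 0.2]])  # batch of one observation
action = probabilities.multinomial(1)            # shape (1, 1), index in {0, 1, 2}
print(action.item())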
Example #4
 def conjugate_gradient(self, b):
     """
     Return an approximate solution of F x = b (i.e. F^(-1) b), where F is the
     Hessian of the KL divergence, using the conjugate gradient method.
     """
     p = b.clone().data
     r = b.clone().data
     x = np.zeros_like(b.data.cpu().numpy())
     rdotr = r.double().dot(r.double())
     for _ in range(self.cg_iters):
         z = self.hessian_vector_product(Variable(p)).squeeze(0)
         v = rdotr / p.double().dot(z.double())
         x += v.cpu().numpy() * p.cpu().numpy()
         r -= v * z
         newrdotr = r.double().dot(r.double())
         mu = newrdotr / rdotr
         p = r + mu * p
         rdotr = newrdotr
         if rdotr < self.residual_tol:
             break
     return x
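The method above never forms F explicitly; it only needs Hessian-vector products. The same recursion written against an explicit symmetric positive definite matrix in plain NumPy shows what it converges to; the matrix A and right-hand side b are illustrative only.

import numpy as np

def conjugate_gradient(matvec, b, iters=10, residual_tol=1e-10):
    # Solve matvec(x) = b without ever materializing the matrix.
    x = np.zeros_like(b)
    r = b.copy()
    p = b.copy()
    rdotr = r.dot(r)
    for _ in range(iters):
        z = matvec(p)
        v = rdotr / p.dot(z)      # step length along the search direction
        x += v * p
        r -= v * z                # update the residual
        newrdotr = r.dot(r)
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])  # symmetric positive definite
b = np.array([1.0, 2.0])
x = conjugate_gradient(lambda v: A.dot(v), b)
print(x, np.linalg.solve(A, b))  # the two should agree closely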
Example #5
    def step(self):
        """
        Executes one iteration of TRPO.
        """
        # Generate rollout
        (all_observations, all_discounted_rewards, total_reward, all_actions,
         all_action_dists, self.entropy) = self.sample_trajectories()

        num_batches = int(
            len(all_actions) / self.batch_size
            if len(all_actions) % self.batch_size == 0
            else (len(all_actions) / self.batch_size) + 1)
        print("All actions {}".format(len(all_actions)))
        print("Num batches {}".format(num_batches))
        for batch_num in range(num_batches):
            print("Processing batch number {}".format(batch_num + 1))
            start = batch_num * self.batch_size
            end = (batch_num + 1) * self.batch_size
            self.observations = all_observations[start:end]
            self.discounted_rewards = all_discounted_rewards[start:end]
            self.actions = all_actions[start:end]
            self.action_dists = all_action_dists[start:end]

            self.observations = torch.cat([
                self.preprocess(Variable(Tensor(observation)).unsqueeze(0))
                for observation in self.observations
            ])

            obs_array = Variable(self.observations.data).cpu().numpy()

            # Calculate the advantage of each step by taking the actual discounted rewards seen
            # and subtracting the estimated value of each state
            baseline = self.value_function_model.predict(obs_array).data
            discounted_rewards_tensor = Tensor(
                self.discounted_rewards).unsqueeze(1)
            advantage = discounted_rewards_tensor - baseline

            # Normalize the advantage
            self.advantage = (advantage - advantage.mean()) / \
                             (advantage.std() + 1e-8)

            # Calculate the surrogate loss as the elementwise product of the advantage and the probability ratio of actions taken
            new_p = torch.cat(self.action_dists).gather(
                1, torch.cat(self.actions))
            old_p = new_p.detach() + 1e-8
            prob_ratio = new_p / old_p
            surrogate_loss = (
                -torch.mean(prob_ratio * Variable(self.advantage))
                - self.ent_coeff * self.entropy)

            # Calculate the gradient of the surrogate loss
            self.policy_model.zero_grad()
            surrogate_loss.backward(retain_graph=True)
            policy_gradient = parameters_to_vector(
                [v.grad for v in self.policy_model.parameters()]).squeeze(0)

            if policy_gradient.nonzero().size()[0]:
                # Use conjugate gradient algorithm to determine the step direction in theta space
                step_direction = self.conjugate_gradient(-policy_gradient)
                step_direction_variable = Variable(
                    torch.from_numpy(step_direction))

                # Do line search to determine the stepsize of theta in the direction of step_direction
                shs = .5 * \
                      step_direction.dot(self.hessian_vector_product(
                          step_direction_variable).cpu().numpy().T)
                lm = np.sqrt(shs / self.max_kl)
                fullstep = step_direction / lm
                gdotstepdir = -policy_gradient.dot(
                    step_direction_variable).data.numpy()
                theta = self.linesearch(
                    parameters_to_vector(self.policy_model.parameters()),
                    fullstep, gdotstepdir / lm)

                # Fit the estimated value function to the actual observed discounted rewards
                ev_before = math_utils.explained_variance_1d(
                    baseline.squeeze(1).cpu().numpy(), self.discounted_rewards)
                self.value_function_model.zero_grad()
                value_fn_params = parameters_to_vector(
                    self.value_function_model.parameters())
                self.value_function_model.fit(
                    self.observations,
                    Variable(Tensor(self.discounted_rewards)))
                ev_after = math_utils.explained_variance_1d(
                    self.value_function_model.predict(
                        self.observations).data.squeeze(1).cpu().numpy(),
                    self.discounted_rewards)
                if ev_after < ev_before or np.abs(ev_after) < 1e-4:
                    vector_to_parameters(
                        value_fn_params,
                        self.value_function_model.parameters())

                # Update parameters of policy model
                old_model = copy.deepcopy(self.policy_model)
                old_model.load_state_dict(self.policy_model.state_dict())
                if any(np.isnan(theta.data.cpu().numpy())):
                    print("NaN detected. Skipping update...")
                else:
                    vector_to_parameters(theta, self.policy_model.parameters())

                kl_old_new = self.mean_kl_divergence(old_model)
                diagnostics = collections.OrderedDict([
                    ('Total Reward', total_reward),
                    ('KL Old New', kl_old_new.data.numpy()),
                    ('Entropy', self.entropy.data.numpy()),
                    ('EV Before', ev_before), ('EV After', ev_after)
                ])
                for key, value in diagnostics.items():
                    print("{}: {}".format(key, value))

            else:
                print("Policy gradient is 0. Skipping update...")

        return total_reward
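The scaling of fullstep above is what enforces the trust region: with shs = 0.5 * s^T H s and lm = sqrt(shs / max_kl), the rescaled step s / lm satisfies 0.5 * step^T H step = max_kl. A small numeric check with a stand-in Hessian (toy values, not taken from the code above):

import numpy as np

H = np.array([[2.0, 0.0], [0.0, 4.0]])   # stand-in for the KL Hessian
s = np.array([1.0, 0.5])                 # stand-in step direction
max_kl = 0.01

shs = 0.5 * s.dot(H.dot(s))
lm = np.sqrt(shs / max_kl)
fullstep = s / lm
print(0.5 * fullstep.dot(H.dot(fullstep)))  # ~= max_kl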
Example #6
 def get_action(self, observation):
     action, _ = self.sample_action_from_policy(observation)
     agent_action = Variable(action).data.cpu().numpy()[0][0]
     return agent_action