def linesearch(self, x, fullstep, expected_improve_rate):
    """ Returns the parameter vector given by a linesearch """
    accept_ratio = .1
    max_backtracks = 10
    fval = self.surrogate_loss(x)
    for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
        print("Search number {}...".format(_n_backtracks + 1))
        xnew = x.data.cpu().numpy() + stepfrac * fullstep
        newfval = self.surrogate_loss(Variable(torch.from_numpy(xnew)))
        actual_improve = fval - newfval
        expected_improve = expected_improve_rate * stepfrac
        ratio = actual_improve / expected_improve
        if ratio > accept_ratio and actual_improve > 0:
            return Variable(torch.from_numpy(xnew))
    return x
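# `surrogate_loss` is called above but defined elsewhere in this class. The
# sketch below is only an assumption of what the line search relies on: it
# rebuilds the policy from a flat parameter vector `theta` and evaluates the
# negated importance-weighted advantage on the cached batch. The attributes
# `self.observations`, `self.actions`, and `self.advantage` are taken from
# step(); the actual method may differ.
def _surrogate_loss_sketch(self, theta):
    # Copy the current policy and load the candidate parameters into it
    new_model = copy.deepcopy(self.policy_model)
    vector_to_parameters(theta, new_model.parameters())
    # Probability of each taken action under the candidate and current policies
    prob_new = new_model(self.observations).gather(
        1, torch.cat(self.actions)).data
    prob_old = self.policy_model(self.observations).gather(
        1, torch.cat(self.actions)).data + 1e-8
    # Negated so that a lower value means a better candidate step
    return -torch.mean((prob_new / prob_old) * self.advantage)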
def sample_trajectories(self):
    """ Collects `self.episodes` rollouts and returns the flattened
    observations, discounted rewards, mean episode reward, actions,
    action distributions, and mean entropy """
    paths = []
    episodes_so_far = 0
    entropy = 0
    while episodes_so_far < self.episodes:
        episodes_so_far += 1
        observations, actions, rewards, action_distributions = [], [], [], []
        observation, opponent_obs = self.env.reset()
        while True:
            observations.append(observation)
            action, action_dist = self.sample_action_from_policy(observation)
            actions.append(action)
            action_distributions.append(action_dist)
            entropy += -(action_dist * action_dist.log()).sum()
            agent_action = Variable(action).data.cpu().numpy()[0][0]
            opponent_action = self.opponent.get_action(opponent_obs)
            (observation, opponent_obs), (reward, opponent_rew), done, _ = self.env.step(
                (agent_action, opponent_action))
            rewards.append(reward)
            if done:
                path = {
                    "observations": observations,
                    "actions": actions,
                    "rewards": rewards,
                    "action_distributions": action_distributions
                }
                paths.append(path)
                break

    def flatten(l):
        return [item for sublist in l for item in sublist]

    observations = flatten([path["observations"] for path in paths])
    discounted_rewards = flatten([
        math_utils.discount(path["rewards"], self.gamma) for path in paths
    ])
    total_reward = sum(
        flatten([path["rewards"] for path in paths])) / self.episodes
    actions = flatten([path["actions"] for path in paths])
    action_dists = flatten(
        [path["action_distributions"] for path in paths])
    entropy = entropy / len(actions)
    return observations, np.asarray(
        discounted_rewards), total_reward, actions, action_dists, entropy
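# `math_utils.discount` comes from a helper module that is not shown here. The
# sketch below is an assumption of what it computes, for reference only: entry
# t is the gamma-discounted sum of rewards from step t to the end of the
# episode.
@staticmethod
def _discount_sketch(rewards, gamma):
    discounted = np.zeros(len(rewards))
    running_sum = 0.0
    # Walk backwards so each step reuses the tail sum already computed
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum
        discounted[t] = running_sum
    return discounted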
def sample_action_from_policy(self, observation):
    """ Given an observation, return the action sampled from the policy model
    as well as the probabilities associated with each action """
    observation_tensor = Tensor(observation)
    observation_tensor = observation_tensor.unsqueeze(0)
    observation_tensor = self.preprocess(observation_tensor)
    probabilities = self.policy_model(
        Variable(observation_tensor, requires_grad=True))
    action = probabilities.multinomial(1)
    return action, probabilities
def conjugate_gradient(self, b):
    """ Returns F^(-1)b where F is the Hessian of the KL divergence """
    p = b.clone().data
    r = b.clone().data
    x = np.zeros_like(b.data.cpu().numpy())
    rdotr = r.double().dot(r.double())
    for _ in range(self.cg_iters):
        # Fisher-vector product stands in for an explicit Hessian
        z = self.hessian_vector_product(Variable(p)).squeeze(0)
        v = rdotr / p.double().dot(z.double())
        x += v.cpu().numpy() * p.cpu().numpy()
        r -= v * z
        newrdotr = r.double().dot(r.double())
        mu = newrdotr / rdotr
        p = r + mu * p
        rdotr = newrdotr
        # Stop early once the residual is small enough
        if rdotr < self.residual_tol:
            break
    return x
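# `hessian_vector_product` is used above but defined elsewhere in this class.
# The sketch below shows the usual Fisher-vector-product trick it is assumed
# to follow: differentiate the mean KL divergence once, dot the resulting
# gradient with the input vector, and differentiate again. The attributes
# `self.mean_kl_divergence` and `self.cg_damping` are assumptions; the actual
# method may differ.
def _hessian_vector_product_sketch(self, vector):
    self.policy_model.zero_grad()
    # First derivative of the mean KL w.r.t. the policy parameters
    mean_kl = self.mean_kl_divergence(self.policy_model)
    kl_grads = torch.autograd.grad(
        mean_kl, self.policy_model.parameters(), create_graph=True)
    flat_kl_grad = torch.cat([g.view(-1) for g in kl_grads])
    # Differentiating (gradient . vector) gives the Hessian-vector product
    grad_vector_product = torch.sum(flat_kl_grad * vector)
    second_grads = torch.autograd.grad(
        grad_vector_product, self.policy_model.parameters())
    flat_hvp = torch.cat(
        [g.contiguous().view(-1) for g in second_grads]).data
    # Damping keeps the conjugate gradient solve numerically stable
    return flat_hvp + self.cg_damping * vector.data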
def step(self):
    """ Executes an iteration of TRPO """
    # Generate rollout
    all_observations, all_discounted_rewards, total_reward, all_actions, \
        all_action_dists, self.entropy = self.sample_trajectories()

    num_batches = int(
        len(all_actions) / self.batch_size
        if len(all_actions) % self.batch_size == 0
        else (len(all_actions) / self.batch_size) + 1)
    print("All actions {}".format(len(all_actions)))
    print("Num batches {}".format(num_batches))

    for batch_num in range(num_batches):
        print("Processing batch number {}".format(batch_num + 1))
        self.observations = all_observations[
            batch_num * self.batch_size:(batch_num + 1) * self.batch_size]
        self.discounted_rewards = all_discounted_rewards[
            batch_num * self.batch_size:(batch_num + 1) * self.batch_size]
        self.actions = all_actions[
            batch_num * self.batch_size:(batch_num + 1) * self.batch_size]
        self.action_dists = all_action_dists[
            batch_num * self.batch_size:(batch_num + 1) * self.batch_size]
        self.observations = torch.cat([
            self.preprocess(Variable(Tensor(observation)).unsqueeze(0))
            for observation in self.observations
        ])
        obs_array = Variable(self.observations.data).cpu().numpy()

        # Calculate the advantage of each step by taking the actual discounted
        # rewards seen and subtracting the estimated value of each state
        baseline = self.value_function_model.predict(obs_array).data
        discounted_rewards_tensor = Tensor(
            self.discounted_rewards).unsqueeze(1)
        advantage = discounted_rewards_tensor - baseline

        # Normalize the advantage
        self.advantage = (advantage - advantage.mean()) / \
            (advantage.std() + 1e-8)

        # Calculate the surrogate loss as the elementwise product of the
        # advantage and the probability ratio of actions taken
        new_p = torch.cat(self.action_dists).gather(
            1, torch.cat(self.actions))
        old_p = new_p.detach() + 1e-8
        prob_ratio = new_p / old_p
        surrogate_loss = -torch.mean(prob_ratio * Variable(self.advantage)) - \
            (self.ent_coeff * self.entropy)

        # Calculate the gradient of the surrogate loss
        self.policy_model.zero_grad()
        surrogate_loss.backward(retain_graph=True)
        policy_gradient = parameters_to_vector(
            [v.grad for v in self.policy_model.parameters()]).squeeze(0)

        if policy_gradient.nonzero().size()[0]:
            # Use conjugate gradient algorithm to determine the step direction
            # in theta space
            step_direction = self.conjugate_gradient(-policy_gradient)
            step_direction_variable = Variable(
                torch.from_numpy(step_direction))

            # Do line search to determine the stepsize of theta in the
            # direction of step_direction
            shs = .5 * step_direction.dot(
                self.hessian_vector_product(
                    step_direction_variable).cpu().numpy().T)
            lm = np.sqrt(shs / self.max_kl)
            fullstep = step_direction / lm
            gdotstepdir = -policy_gradient.dot(
                step_direction_variable).data.numpy()
            theta = self.linesearch(
                parameters_to_vector(self.policy_model.parameters()),
                fullstep, gdotstepdir / lm)

            # Fit the estimated value function to the actual observed
            # discounted rewards
            ev_before = math_utils.explained_variance_1d(
                baseline.squeeze(1).cpu().numpy(), self.discounted_rewards)
            self.value_function_model.zero_grad()
            value_fn_params = parameters_to_vector(
                self.value_function_model.parameters())
            self.value_function_model.fit(
                self.observations, Variable(Tensor(self.discounted_rewards)))
            ev_after = math_utils.explained_variance_1d(
                self.value_function_model.predict(
                    self.observations).data.squeeze(1).cpu().numpy(),
                self.discounted_rewards)
            if ev_after < ev_before or np.abs(ev_after) < 1e-4:
                # Revert the value function fit if it did not improve
                vector_to_parameters(
                    value_fn_params, self.value_function_model.parameters())

            # Update parameters of policy model
            old_model = copy.deepcopy(self.policy_model)
            old_model.load_state_dict(self.policy_model.state_dict())
            if any(np.isnan(theta.data.cpu().numpy())):
                print("NaN detected. Skipping update...")
            else:
                vector_to_parameters(theta, self.policy_model.parameters())

            kl_old_new = self.mean_kl_divergence(old_model)
            diagnostics = collections.OrderedDict([
                ('Total Reward', total_reward),
                ('KL Old New', kl_old_new.data.numpy()),
                ('Entropy', self.entropy.data.numpy()),
                ('EV Before', ev_before),
                ('EV After', ev_after)
            ])
            for key, value in diagnostics.items():
                print("{}: {}".format(key, value))
        else:
            print("Policy gradient is 0. Skipping update...")

    return total_reward
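# `mean_kl_divergence` is called above but defined elsewhere in this class.
# The sketch below is an assumption of what it computes: the average KL
# divergence between the action distributions of the given (snapshot) model
# and the current policy over the cached batch of observations. The KL
# direction and the use of `self.observations` are assumptions; the actual
# method may differ.
def _mean_kl_divergence_sketch(self, model):
    # Action probabilities from the snapshot model (treated as constants)
    old_probs = model(self.observations).detach() + 1e-8
    # Action probabilities from the current (possibly updated) policy
    new_probs = self.policy_model(self.observations)
    # KL(old || new), averaged over the batch
    return torch.sum(old_probs * torch.log(old_probs / new_probs), 1).mean()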
def get_action(self, observation):
    """ Samples an action from the policy for the given observation and
    returns it as a plain scalar """
    action, _ = self.sample_action_from_policy(observation)
    agent_action = Variable(action).data.cpu().numpy()[0][0]
    return agent_action