def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
    """
    Expects a batch containing at least one episode, sets advantages according to use_gae.

    :param batch: Sequence of observations for at least one episode.
    :param gamma: Discount factor.
    :param gae_lambda: Lambda parameter for generalized advantage estimation.
    :param use_gae: If True, compute advantages via generalized advantage estimation,
        otherwise as returns minus baseline.
    :return:
    """
    for episode in batch:
        baseline = self.baseline_value_function.predict(episode)

        # Bootstrap with zero after terminal states, otherwise repeat the last baseline value
        if episode['terminated']:
            adjusted_baseline = np.append(baseline, [0])
        else:
            adjusted_baseline = np.append(baseline, baseline[-1])

        episode['returns'] = discount(episode['rewards'], gamma)

        if use_gae:
            # Temporal-difference residuals, discounted with gamma * lambda
            deltas = episode['rewards'] + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
            episode['advantage'] = discount(deltas, gamma * gae_lambda)
        else:
            episode['advantage'] = episode['returns'] - baseline
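# Illustrative only: the discount(...) helper used above is not shown in this
# section. A minimal sketch of what it is assumed to do, namely compute the
# reverse discounted cumulative sum r_t + x * r_{t+1} + x^2 * r_{t+2} + ...
# The name discount_sketch and this implementation are assumptions, not the
# repository's actual helper.
import numpy as np

def discount_sketch(values, discount_factor):
    discounted = np.zeros(len(values))
    running_sum = 0.0
    # Walk backwards so every entry accumulates its discounted future values
    for t in reversed(range(len(values))):
        running_sum = values[t] + discount_factor * running_sum
        discounted[t] = running_sum
    return discounted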
def generalised_advantage_estimation(self, episode):
    """
    Expects an episode, returns advantages according to config.
    """
    baseline = self.baseline_value_function.predict(episode)

    if self.generalized_advantage_estimation:
        if episode['terminated']:
            adjusted_baseline = np.append(baseline, [0])
        else:
            adjusted_baseline = np.append(baseline, baseline[-1])

        deltas = episode['rewards'] + self.gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
        advantage = discount(deltas, self.gamma * self.gae_lambda)
    else:
        advantage = episode['returns'] - baseline

    if self.normalize_advantage:
        return zero_mean_unit_variance(advantage)
    else:
        return advantage
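# Illustrative only: zero_mean_unit_variance(...) above is assumed to
# standardise the advantages. A minimal sketch under that assumption; the
# epsilon guard and the name are illustrative, not the repository's helper.
import numpy as np

def zero_mean_unit_variance_sketch(values, epsilon=1e-8):
    values = np.asarray(values, dtype=np.float64)
    # Subtract the mean and divide by the standard deviation; epsilon guards
    # against division by zero for constant inputs.
    return (values - values.mean()) / (values.std() + epsilon)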
def update(self, batch):
    """
    Get global parameters, compute update, then send results to parameter server.

    :param batch:
    :return:
    """
    # Set per-episode return and advantage
    for episode in batch:
        episode['returns'] = discount(episode['rewards'], self.gamma)
        episode['advantages'] = self.generalised_advantage_estimation(episode)

    # Update linear value function for baseline prediction
    self.baseline_value_function.fit(batch)

    fetches = [self.loss, self.optimize_op, self.global_step]
    fetches.extend(self.local_network.internal_state_outputs)

    # Merge episode inputs into single arrays
    feed_dict = {
        self.episode_length: [episode['episode_length'] for episode in batch],
        self.state: [episode['states'] for episode in batch],
        self.actions: [episode['actions'] for episode in batch],
        self.advantage: [episode['advantages'] for episode in batch]
    }

    for n, internal_state in enumerate(self.local_network.internal_state_inputs):
        feed_dict[internal_state] = self.local_states[n]

    fetched = self.session.run(fetches, feed_dict)

    loss = fetched[0]
    self.local_states = fetched[3:]
    self.logger.debug('Distributed model loss = ' + str(loss))
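# For reference, each element of `batch` consumed by update() is assumed to be
# a dict of per-episode data with at least the keys referenced above; the
# concrete values and shapes below are made up for illustration only.
example_episode = {
    'states': [[0.0, 1.0], [0.5, 0.2]],  # one state per timestep
    'actions': [0, 1],                   # one action per timestep
    'rewards': [1.0, 0.0],               # one reward per timestep
    'terminated': True,                  # whether the episode ended in a terminal state
    'episode_length': 2,                 # number of timesteps
    # 'returns' and 'advantages' are filled in by update() before session.run
}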
def update(self, batch):
    """
    Compute update for one batch of experiences using generalized advantage
    estimation and the vanilla policy gradient.

    :param batch:
    :return:
    """
    # Set per-episode return and advantage
    for episode in batch:
        episode['returns'] = discount(episode['rewards'], self.gamma)
        episode['advantages'] = self.advantage_estimation(episode)

    # Update linear value function for baseline prediction
    self.baseline_value_function.fit(batch)

    fetches = [self.optimize_op, self.log_probabilities, self.loss]
    fetches.extend(self.network.internal_state_outputs)

    feed_dict = {
        self.episode_length: [episode['episode_length'] for episode in batch],
        self.state: [episode['states'] for episode in batch],
        self.actions: [episode['actions'] for episode in batch],
        self.advantage: [episode['advantages'] for episode in batch]
    }

    for n, internal_state in enumerate(self.network.internal_state_inputs):
        feed_dict[internal_state] = self.internal_states[n]

    fetched = self.session.run(fetches, feed_dict)

    loss = fetched[2]
    self.internal_states = fetched[3:]
    self.logger.debug('Vanilla policy gradient loss = ' + str(loss))
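# Toy, self-contained illustration of the per-episode advantage computation
# that the methods above perform. gamma, gae_lambda and all values are made-up
# example numbers, and gae_advantage_sketch is not part of the class above.
import numpy as np

def gae_advantage_sketch(rewards, baseline, terminated, gamma=0.99, gae_lambda=0.97):
    # Bootstrap value: zero for terminal episodes, last baseline value otherwise
    bootstrap = 0.0 if terminated else baseline[-1]
    adjusted_baseline = np.append(baseline, bootstrap)
    # Temporal-difference residuals r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
    # Discounted (gamma * lambda) reverse cumulative sum of the residuals
    advantages = np.zeros(len(deltas))
    running_sum = 0.0
    for t in reversed(range(len(deltas))):
        running_sum = deltas[t] + gamma * gae_lambda * running_sum
        advantages[t] = running_sum
    return advantages

# Example: an episode of three timesteps that ends in a terminal state
print(gae_advantage_sketch(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.6]), terminated=True))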