def train_sf(self, bootstrap_sf):
    rollout = np.array(self.episode_buffer_sf)
    observations = rollout[:, 0]

    # Get the latent representations phi(s) for every state in the rollout.
    feed_dict = {self.local_network.observation: np.stack(observations, axis=0)}
    fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)

    # Append the bootstrap value and build the discounted successor-feature
    # targets for every step of the trajectory.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0)}
    _, ms, sf_loss = self.sess.run([self.local_network.apply_grads_sf,
                                    self.local_network.merged_summary_sf,
                                    self.local_network.sf_loss],
                                   feed_dict=feed_dict)
    return ms, sf_loss
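# NOTE: the `discount` helper used throughout these methods is not defined in
# this file. Below is a minimal sketch of what it is assumed to compute: the
# discounted cumulative sum y[t] = x[t] + gamma * y[t + 1] along the first
# axis, i.e. the usual A3C-style helper built on scipy.signal.lfilter.
# `reward_discount` (used in train_option below) is assumed to be the same
# operation on a 1-D reward vector. This sketch is an assumption, not the
# repo's own implementation.
import numpy as np
import scipy.signal

def discount(x, gamma):
    # Reverse along time, run a first-order IIR filter with feedback
    # coefficient -gamma, then reverse back. Works for both 1-D reward
    # vectors and 2-D (T, d) feature trajectories.
    return scipy.signal.lfilter([1], [1, -gamma], np.asarray(x)[::-1], axis=0)[::-1]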
def train_sf(self, bootstrap_sf):
    rollout = np.array(self.episode_buffer_sf)
    observations = rollout[:, 0]
    next_observations = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    fi = rollout[:, 4]

    # Construct the list of latent representations for the entire trajectory,
    # appending the bootstrap value at the end.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    # Construct the next-step successor-representation targets for the
    # entire trajectory.
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0),
                 self.local_network.actions_placeholder: actions,
                 self.local_network.target_next_obs: np.stack(next_observations, axis=0)}
    # A single session call updates both the successor-feature head and the
    # auxiliary next-observation prediction head.
    _, self.summaries_sf, sf_loss, _, self.summaries_aux, aux_loss = \
        self.sess.run([self.local_network.apply_grads_sf,
                       self.local_network.merged_summary_sf,
                       self.local_network.sf_loss,
                       self.local_network.apply_grads_aux,
                       self.local_network.merged_summary_aux,
                       self.local_network.aux_loss],
                      feed_dict=feed_dict)
def train_sf(self, rollout, sess, bootstrap_sf, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    # next_observations = rollout[:, 1]
    # actions = rollout[:, 2]

    # Get the latent representations phi(s) for every state in the rollout.
    feed_dict = {self.local_network.observation: np.stack(observations, axis=0)}
    fi = sess.run(self.local_network.fi, feed_dict=feed_dict)

    # Append the bootstrap value and build the discounted targets.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0)}
    # self.local_network.target_next_obs: np.stack(next_observations, axis=0),
    # self.local_network.actions_placeholder: actions}
    _, ms, sf_loss = sess.run([self.local_network.apply_grads_sf,
                               self.local_network.merged_summary_sf,
                               self.local_network.sf_loss],
                              feed_dict=feed_dict)
    return ms, sf_loss
def train(self, rollout, bootstrap_sf):
    rollout = np.array(rollout)
    observations = rollout[:, 0]

    # Construct the list of one-hot encodings phi(s) for the entire trajectory.
    fi = np.identity(self.nb_states)[observations]
    # Construct the next-step successor-representation targets for the
    # entire trajectory, appending the bootstrap value at the end.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: fi}
    _, self.summaries, loss = \
        self.sess.run([self.local_network.apply_grads,
                       self.local_network.merged_summary,
                       self.local_network.loss],
                      feed_dict=feed_dict)
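# Hypothetical worked example of the target construction above, assuming the
# `discount` sketch given earlier. The targets satisfy the TD identity
# psi(s_t) = phi(s_t) + gamma * psi(s_{t+1}); the concrete values below are
# illustrative and do not come from the repo.
import numpy as np

nb_states = 3
gamma = 0.9
observations = np.array([0, 1, 2])           # a 3-step trajectory
fi = np.identity(nb_states)[observations]    # one-hot features phi(s_t)
bootstrap_sf = np.zeros(nb_states)           # e.g. episode terminated

sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
targets = discount(sf_plus, gamma)[:-1]
# targets[0] == [1.0, 0.9, 0.81]: phi(s_0) + gamma*phi(s_1) + gamma^2*phi(s_2)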
def train_sf(self, rollout, bootstrap_sf):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    # next_observations = rollout[:, 1]
    # actions = rollout[:, 2]

    # Get the latent representations phi(s) for each state.
    feed_dict = {self.local_network.observation: np.stack(observations, axis=0)}
    fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)

    # Construct the list of latent representations for the entire trajectory,
    # appending the bootstrap value at the end.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    # Construct the next-step successor-representation targets for the
    # entire trajectory.
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0)}
    _, self.summaries_sf, sf_loss = \
        self.sess.run([self.local_network.apply_grads_sf,
                       self.local_network.merged_summary_sf,
                       self.local_network.sf_loss],
                      feed_dict=feed_dict)
def train(self, rollout, sess, bootstrap_sf, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    # actions = rollout[:, 1]
    # sf = rollout[:, 2]
    # fi = rollout[:, 3]

    # One-hot encodings phi(s) for the entire trajectory.
    fi = np.identity(self.nb_states)[observations]
    # Append the bootstrap value and build the discounted targets.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: fi}
    _, ms, loss, sf_loss = \
        sess.run([self.local_network.apply_grads,
                  self.local_network.merged_summary,
                  self.local_network.loss,
                  self.local_network.sf_loss],
                 feed_dict=feed_dict)
    return ms, loss, sf_loss
def train_option(self, bootstrap_value, bootstrap_value_mix):
    # Buffer rows are (s, option, action, reward, eigen_reward, primitive_action).
    rollout = np.array(self.episode_buffer_option)
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    primitive_actions = rollout[:, 5]

    # Discounted returns for the extrinsic reward, bootstrapped at the end.
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_returns = reward_discount(rewards_plus, self.config.discount)[:-1]

    options1, options2, actions1, actions2, discounted_returns1, discounted_returns2, \
        observations1, observations2 = [], [], [], [], [], [], [], []

    if self.config.eigen:
        # Discounted returns for the mixed eigen (intrinsic) reward.
        eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
        discounted_eigen_returns = discount(eigen_rewards_plus, self.config.discount)[:-1]
        discounted_eigen_returns1, discounted_eigen_returns2 = [], []

    # Split the trajectory into primitive-action steps (updated through the
    # primitive-option head) and option steps (updated with the full
    # intra-option losses).
    for i, primitive in enumerate(primitive_actions):
        if primitive:
            options1.append(options[i])
            actions1.append(actions[i])
            discounted_returns1.append(discounted_returns[i])
            if self.config.eigen:
                discounted_eigen_returns1.append(discounted_eigen_returns[i])
            observations1.append(observations[i])
        else:
            options2.append(options[i])
            actions2.append(actions[i])
            discounted_returns2.append(discounted_returns[i])
            if self.config.eigen:
                discounted_eigen_returns2.append(discounted_eigen_returns[i])
            observations2.append(observations[i])

    if len(observations1) > 0:
        feed_dict = {self.local_network.target_return: discounted_returns1,
                     self.local_network.observation: np.stack(observations1, axis=0),
                     self.local_network.options_placeholder: options1}
        to_run = [self.local_network.apply_grads_primitive_option]
        _ = self.sess.run(to_run, feed_dict=feed_dict)

    if len(observations2) > 0:
        feed_dict = {self.local_network.target_return: discounted_returns2,
                     self.local_network.observation: np.stack(observations2, axis=0),
                     self.local_network.actions_placeholder: actions2,
                     self.local_network.options_placeholder: options2}
        to_run = [self.local_network.apply_grads_option,
                  self.local_network.merged_summary_option,
                  self.local_network.option_loss,
                  self.local_network.policy_loss,
                  self.local_network.entropy_loss,
                  self.local_network.critic_loss,
                  self.local_network.term_loss]
        if self.config.eigen:
            feed_dict[self.local_network.target_eigen_return] = discounted_eigen_returns2
            to_run.append(self.local_network.eigen_critic_loss)
        results = self.sess.run(to_run, feed_dict=feed_dict)
        results.append(discounted_returns[-1])
        if self.config.eigen:
            results.append(discounted_eigen_returns[-1])
    else:
        return None

    # Drop the apply_grads op's return value; keep summaries, losses, and
    # the final returns.
    return results[1:]
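# Hypothetical calling pattern for the update methods above. The method and
# buffer names match this file; `last_state`, `episode_done`, the
# `self.local_network.sf` output, and `self.config.sf_layers` are
# illustrative assumptions, not the repo's code.
def flush_sf_buffer(self, last_state, episode_done):
    if not self.episode_buffer_sf:
        return
    if episode_done:
        # Terminal state: the successor features beyond s_T are zero.
        bootstrap_sf = np.zeros(self.config.sf_layers[-1])
    else:
        # Otherwise bootstrap with the current estimate psi(s_T).
        feed_dict = {self.local_network.observation: np.stack([last_state], axis=0)}
        bootstrap_sf = self.sess.run(self.local_network.sf, feed_dict=feed_dict)[0]
    self.summaries_sf, sf_loss = self.train_sf(bootstrap_sf)
    self.episode_buffer_sf = []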