class PG_concurrent(BatchPolopt):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    while also training the manager at the same time.
    Note that, if we are not doing the sample approximation of the log-of-sum
    term, we do not need to know which skill was picked, only the action taken.
    """

    # double check this constructor later
    def __init__(
            self,
            manager_optimizer=None,
            optimizer=None,
            snn_optimizer=None,
            optimizer_args=None,
            step_size=1e-6,
            latents=None,  # some iterable of the actual latent vectors
            period=10,  # how often a new latent is chosen
            truncate_local_is_ratio=None,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(
                learning_rate=step_size, **optimizer_args)
        self.manager_optimizer = manager_optimizer
        self.snn_optimizer = snn_optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio

        super(PG_concurrent, self).__init__(**kwargs)

        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

    # initialize the computation graph; optimize is run on >= 1 trajectory at a time
    def init_opt(self):
        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var = ext.new_tensor(
            'advantage', ndim=1, dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX  # todo: check this; refer to discrete.py in rllab.spaces
        )

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory whose length is a multiple of self.period,
        # and that obs_var_probs is valid

        # undo the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])

        # entry (i, j) contains the probability of latent j at time step self.period * i;
        # shape: (len(obs) // self.period, len(self.latents))
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']

        # get the distribution parameters under every latent
        # (state_info_vars not needed as input)
        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = [
            TT.exp(self.diagonal.log_likelihood_sym(action_var, dist_info))
            for dist_info in dist_info_vars
        ]

        # reshape so that each row is one sub-trajectory of length self.period
        reshaped_probs = [
            TT.reshape(prob, [obs_var.shape[0] // self.period, self.period])
            for prob in probs
        ]

        # multiply out each row and stack across latents
        subtrajectory_probs = TT.stack([
            TT.prod(reshaped_prob, axis=1)
            for reshaped_prob in reshaped_probs
        ], axis=1)  # shape error might come out of here

        # elementwise multiplication, then sum up each row and take the log
        likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1))

        surr_loss = -TT.mean(likelihood * advantage_var)

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars; they are not needed until NPO/TRPO

        self.optimizer.update_opt(
            loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        assert len(samples_data['observations']) % self.period == 0

        # extra preprocessing of the advantages, and creation of obs_var_sparse
        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = np.sum(input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period]),
            axis=1)
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            advantage_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def optimize_manager(self, itr, samples_data):
        pass

    def optimize_snn(self, itr, samples_data):
        pass

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
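# The surrogate loss built symbolically in PG_concurrent.init_opt marginalizes the
# manager's latent choice out of each length-`period` sub-trajectory:
#   L = -mean_i( adv_i * log( sum_z p(z | s_{i*p}) * prod_t p(a_t | s_t, z) ) ).
# The sketch below reproduces that computation in plain NumPy for a toy batch. It is
# illustrative only; the function name, the array sizes, and the random inputs are
# assumptions, not part of the algorithm above.
def _marginalized_subtrajectory_loss_demo(period=5, num_subtraj=4, num_latents=3):
    import numpy as np
    rng = np.random.RandomState(0)
    # per-step action probabilities under each latent: (T, num_latents)
    step_probs = rng.uniform(0.1, 1.0, size=(period * num_subtraj, num_latents))
    # manager probabilities over latents at the start of each sub-trajectory
    latent_probs = rng.dirichlet(np.ones(num_latents), size=num_subtraj)
    advantages = rng.randn(num_subtraj)
    # product of per-step probabilities within each sub-trajectory, per latent
    subtraj_probs = step_probs.reshape(num_subtraj, period, num_latents).prod(axis=1)
    # marginalize over latents, then take the log
    likelihood = np.log((subtraj_probs * latent_probs).sum(axis=1))
    return -(likelihood * advantages).mean()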
class PG_concurrent_approx(BatchPolopt, Serializable):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    while also training the manager at the same time.
    Note that, if we are not doing the sample approximation of the log-of-sum
    term, we do not need to know which skill was picked, only the action taken.
    """

    # double check this constructor later
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=1e-2,
            num_latents=6,
            latents=None,  # some iterable of the actual latent vectors
            period=10,  # how often a new latent is chosen
            truncate_local_is_ratio=None,
            use_skill_dependent_baseline=False,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(batch_size=None, max_epochs=1)
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(
                learning_rate=step_size, **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio

        super(PG_concurrent_approx, self).__init__(**kwargs)

        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period
        self.trainable_manager = self.policy.trainable_manager

        # skill-dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = (
                (curr_env.observation_space.shape[0] + 1) * self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_dependent_env_spec)

    # initialize the computation graph; optimize is run on >= 1 trajectory at a time
    def init_opt(self):
        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var_sparse = ext.new_tensor(
            'sparse_advantage', ndim=1, dtype=theano.config.floatX)
        advantage_var = ext.new_tensor(
            'advantage', ndim=1, dtype=theano.config.floatX)
        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX  # todo: check this; refer to discrete.py in rllab.spaces
        )
        latent_var_sparse = ext.new_tensor(
            'sparse_latent', ndim=2, dtype=theano.config.floatX)
        latent_var = ext.new_tensor(
            'latents', ndim=2, dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory whose length is a multiple of self.period,
        # and that obs_var_probs is valid

        # undo the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # entry (i, j) contains the probability of latent j at time step self.period * i;
        # shape: (len(obs) // self.period, len(self.latents))
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']
        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        if self.trainable_manager:
            manager_surr_loss = -TT.mean(
                TT.log(actual_latent_probs) * advantage_var_sparse)
        else:
            manager_surr_loss = 0

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        # get the distribution parameters under every latent
        # (state_info_vars not needed as input)
        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in dist_info_vars
        ], axis=1)  # todo: verify that dist_info_vars is in order
        actual_action_log_probs = TT.sum(probs * latent_var, axis=1)
        skill_surr_loss = -TT.mean(actual_action_log_probs * advantage_var)

        # divide the manager term by the period so the relative magnitudes are correct
        surr_loss = manager_surr_loss / self.period + skill_surr_loss

        input_list = [
            obs_var_raw, obs_var_sparse, action_var, advantage_var,
            advantage_var_sparse, latent_var, latent_var_sparse
        ]
        # npo has state_info_vars and old_dist_info_vars; they are not needed until NPO/TRPO

        self.optimizer.update_opt(
            loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # extra preprocessing of the advantages, and creation of obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period])[:, 0]
        latents = input_values[3]['latents']
        latents_sparse = latents.take(
            [i for i in range(0, latents.shape[0], self.period)], axis=0)

        if self.use_skill_dependent_baseline:
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                input_values[4], advantage_sparse, latents,
                                latents_sparse)
        else:
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                input_values[2], advantage_sparse, latents,
                                latents_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
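# PG_concurrent_approx.optimize_policy reshapes a flat rollout into the "raw" and
# "sparse" inputs expected by the graph: observations grouped per period, every
# period-th observation/latent for the manager, and the advantage at the first step
# of each period. A minimal NumPy sketch of that preprocessing, using made-up array
# sizes (the function name and the dimensions here are assumptions):
def _sparse_preprocessing_demo(period=10):
    import numpy as np
    T, obs_dim, num_latents = 50, 4, 6
    observations = np.arange(T * obs_dim, dtype=float).reshape(T, obs_dim)
    advantages = np.random.randn(T)
    latents = np.eye(num_latents)[np.random.randint(num_latents, size=T)]
    obs_raw = observations.reshape(T // period, period, obs_dim)
    obs_sparse = observations[::period]  # same as .take(range(0, T, period), axis=0)
    advantage_sparse = advantages.reshape(T // period, period)[:, 0]
    latents_sparse = latents[::period]
    return obs_raw, obs_sparse, advantage_sparse, latents_sparse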
class Hippo(BatchPolopt):
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.0003,
            latents=None,  # some iterable of the actual latent vectors
            average_period=10,  # average number of steps between latent resamples
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=80,
            use_skill_dependent_baseline=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(
                learning_rate=step_size,
                max_epochs=train_pi_iters,
                **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Hippo, self).__init__(**kwargs)

        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.average_period = average_period

        self.sampler = BatchSampler(self)
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []
        self.use_skill_dependent_baseline = use_skill_dependent_baseline

        assert isinstance(self.policy, HierarchicalPolicy)
        self.old_policy = copy.deepcopy(self.policy)

    def init_opt(self):
        obs_var = ext.new_tensor(
            'obs', ndim=2, dtype=theano.config.floatX)  # todo: check the dtype
        manager_obs_var = ext.new_tensor(
            'manager_obs', ndim=2, dtype=theano.config.floatX)
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # the advantage every time the manager makes a decision
        manager_advantage_var = ext.new_tensor(
            'manager_advantage', ndim=1, dtype=theano.config.floatX)
        skill_advantage_var = ext.new_tensor(
            'skill_advantage', ndim=1, dtype=theano.config.floatX)

        latent_var_sparse = ext.new_tensor(
            'sparse_latent', ndim=2, dtype=theano.config.floatX)
        latent_var = ext.new_tensor(
            'latents', ndim=2, dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # entry (i, j) contains the probability of latent j at the i-th manager decision
        latent_probs = self.policy.manager.dist_info_sym(
            manager_obs_var)['prob']
        old_latent_probs = self.old_policy.manager.dist_info_sym(
            manager_obs_var)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(
            old_latent_probs * latent_var_sparse, axis=1)
        lr = TT.exp(
            TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * manager_advantage_var,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) *
            manager_advantage_var)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in dist_info_vars
        ], axis=1)
        actual_action_log_probs = TT.sum(
            probs * latent_var,
            axis=1)  # todo: verify that dist_info_vars is in order

        # old policy
        old_dist_info_vars = self.old_policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        old_probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in old_dist_info_vars
        ], axis=1)
        old_actual_action_log_probs = TT.sum(old_probs * latent_var, axis=1)
        skill_lr = TT.exp(actual_action_log_probs -
                          old_actual_action_log_probs)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * skill_advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            skill_advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.average_period + skill_surr_loss

        input_list = [
            obs_var, manager_obs_var, action_var, manager_advantage_var,
            skill_advantage_var, latent_var, latent_var_sparse
        ]

        self.optimizer.update_opt(
            loss=surr_loss, target=self.policy, inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        assert not self.use_skill_dependent_baseline

        # extra preprocessing of the advantages, and creation of the manager inputs
        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages",
                        "agent_infos"))
        time_remaining = input_values[3]['time_remaining']
        resampled_period = input_values[3]['resampled_period']
        obs_var = np.insert(input_values[0],
                            self.policy.obs_robot_dim,
                            time_remaining,
                            axis=1)
        manager_obs_var = obs_var[resampled_period]
        action_var = input_values[1]
        manager_adv_var = input_values[2][resampled_period]
        skill_adv_var = input_values[2]
        latent_var = input_values[3]['latents']
        latent_var_sparse = latent_var[resampled_period]
        all_input_values = (obs_var, manager_obs_var, action_var,
                            manager_adv_var, skill_adv_var, latent_var,
                            latent_var_sparse)

        # copy the current parameters into the old policy before the update
        old_param_values = self.policy.get_param_values()
        self.old_policy.set_param_values(old_param_values)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
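# Both terms of Hippo's surrogate use the PPO-style clipped objective
#   min(r * A, clip(r, 1 - eps, 1 + eps) * A),
# where r is the probability ratio between the current and the old policy.
# A small NumPy sketch of that clipped term, with made-up ratios and advantages
# (the function name and values are illustrative assumptions only):
def _clipped_surrogate_demo(epsilon=0.1):
    import numpy as np
    ratios = np.array([0.7, 0.95, 1.05, 1.4])
    advantages = np.array([1.0, -1.0, 2.0, -0.5])
    # clipping bounds how far a single update can push the ratio
    clipped = np.clip(ratios, 1 - epsilon, 1 + epsilon)
    surr = np.minimum(ratios * advantages, clipped * advantages)
    return -surr.mean()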
class StochasticGaussianMLPPolicy(StochasticPolicy, LasagnePowered,
                                  Serializable):
    def __init__(
            self,
            env_spec,
            input_latent_vars=None,
            hidden_sizes=(32, 32),
            hidden_latent_vars=None,
            learn_std=True,
            init_std=1.0,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
    ):
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create the network
        mean_network = StochasticMLP(
            input_shape=(obs_dim, ),
            input_latent_vars=input_latent_vars,
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_latent_vars=hidden_latent_vars,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._n_latent_layers = len(mean_network.latent_layers)
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(StochasticGaussianMLPPolicy, self).__init__(env_spec)

        outputs = self.dist_info_sym(mean_network.input_var)
        latent_keys = sorted(
            set(outputs.keys()).difference({"mean", "log_std"}))

        extras = get_full_output(
            [self._l_mean, self._l_log_std] +
            self._mean_network.latent_layers, )[1]
        latent_distributions = [
            extras[layer]["distribution"]
            for layer in self._mean_network.latent_layers
        ]

        self._latent_keys = latent_keys
        self._latent_distributions = latent_distributions
        self._dist = DiagonalGaussian(action_dim)

        self._f_dist_info = ext.compile_function(
            inputs=[obs_var],
            outputs=outputs,
        )
        self._f_dist_info_givens = None

    @property
    def latent_layers(self):
        return self._mean_network.latent_layers

    @property
    def latent_dims(self):
        return self._mean_network.latent_dims

    def dist_info(self, obs, state_infos=None):
        if state_infos is None or len(state_infos) == 0:
            return self._f_dist_info(obs)
        if self._f_dist_info_givens is None:
            # compile the function lazily
            obs_var = self._mean_network.input_var
            latent_keys = [
                "latent_%d" % idx for idx in range(self._n_latent_layers)
            ]
            latent_vars = [
                TT.matrix("latent_%d" % idx)
                for idx in range(self._n_latent_layers)
            ]
            latent_dict = dict(list(zip(latent_keys, latent_vars)))
            self._f_dist_info_givens = ext.compile_function(
                inputs=[obs_var] + latent_vars,
                outputs=self.dist_info_sym(obs_var, latent_dict),
            )
        latent_vals = []
        for idx in range(self._n_latent_layers):
            latent_vals.append(state_infos["latent_%d" % idx])
        return self._f_dist_info_givens(*[obs] + latent_vals)

    def reset(self):
        # here we would sample a latent variable, store it on the policy,
        # and pass it along to all the other methods
        pass

    def dist_info_sym(self, obs_var, state_info_vars=None):
        if state_info_vars is not None:
            latent_givens = {
                latent_layer: state_info_vars["latent_%d" % idx]
                for idx, latent_layer in enumerate(
                    self._mean_network.latent_layers)
            }
            latent_dist_infos = dict()
            for idx, latent_layer in enumerate(
                    self._mean_network.latent_layers):
                cur_dist_info = dict()
                prefix = "latent_%d_" % idx
                for k, v in state_info_vars.items():
                    if k.startswith(prefix):
                        cur_dist_info[k[len(prefix):]] = v
                latent_dist_infos[latent_layer] = cur_dist_info
        else:
            latent_givens = dict()
            latent_dist_infos = dict()
        all_outputs, extras = get_full_output(
            [self._l_mean, self._l_log_std] +
            self._mean_network.latent_layers,
            inputs={self._mean_network._l_in: obs_var},
            latent_givens=latent_givens,
            latent_dist_infos=latent_dist_infos,
        )

        mean_var = all_outputs[0]
        log_std_var = all_outputs[1]
        latent_vars = all_outputs[2:]
        latent_dist_infos = []
        for latent_layer in self._mean_network.latent_layers:
            latent_dist_infos.append(extras[latent_layer]["dist_info"])

        output_dict = dict(mean=mean_var, log_std=log_std_var)
        for idx, latent_var, latent_dist_info in zip(itertools.count(),
                                                     latent_vars,
                                                     latent_dist_infos):
            output_dict["latent_%d" % idx] = latent_var
            for k, v in latent_dist_info.items():
                output_dict["latent_%d_%s" % (idx, k)] = v

        return output_dict

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of the distributions of both the
        actions and the latent variables.
        """
        kl = self._dist.kl_sym(old_dist_info_vars, new_dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            # collect the dist info for each latent variable
            prefix = "latent_%d_" % idx
            old_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in old_dist_info_vars.items() if k.startswith(prefix)
            }
            new_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in new_dist_info_vars.items() if k.startswith(prefix)
            }
            kl += latent_dist.kl_sym(old_latent_dist_info,
                                     new_latent_dist_info)
        return kl

    def likelihood_ratio_sym(self, action_var, old_dist_info_vars,
                             new_dist_info_vars):
        """
        Compute the symbolic likelihood ratio of both the actions and the
        latent variables.
        """
        lr = self._dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                             new_dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            latent_var = old_dist_info_vars["latent_%d" % idx]
            prefix = "latent_%d_" % idx
            old_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in old_dist_info_vars.items() if k.startswith(prefix)
            }
            new_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in new_dist_info_vars.items() if k.startswith(prefix)
            }
            lr *= latent_dist.likelihood_ratio_sym(latent_var,
                                                   old_latent_dist_info,
                                                   new_latent_dist_info)
        return lr

    def log_likelihood(self, actions, dist_info, action_only=False):
        """
        Compute the log likelihood of both the actions and the latent
        variables, unless action_only is set to True, in which case only the
        log likelihood of the actions is computed.
        :return:
        """
        logli = self._dist.log_likelihood(actions, dist_info)
        if not action_only:
            for idx, latent_dist in enumerate(self._latent_distributions):
                latent_var = dist_info["latent_%d" % idx]
                prefix = "latent_%d_" % idx
                latent_dist_info = {
                    k[len(prefix):]: v
                    for k, v in dist_info.items() if k.startswith(prefix)
                }
                logli += latent_dist.log_likelihood(latent_var,
                                                    latent_dist_info)
        return logli

    def log_likelihood_sym(self, action_var, dist_info_vars):
        logli = self._dist.log_likelihood_sym(action_var, dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            latent_var = dist_info_vars["latent_%d" % idx]
            prefix = "latent_%d_" % idx
            latent_dist_info = {
                k[len(prefix):]: v
                for k, v in dist_info_vars.items() if k.startswith(prefix)
            }
            logli += latent_dist.log_likelihood_sym(latent_var,
                                                    latent_dist_info)
        return logli

    def entropy(self, dist_info):
        ent = self._dist.entropy(dist_info)
        for idx, latent_dist in enumerate(self._latent_distributions):
            prefix = "latent_%d_" % idx
            latent_dist_info = {
                k[len(prefix):]: v
                for k, v in dist_info.items() if k.startswith(prefix)
            }
            ent += latent_dist.entropy(latent_dist_info)
        return ent

    @property
    def dist_info_keys(self):
        return ["mean", "log_std"] + self._latent_keys

    @overrides
    def get_action(self, observation):
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):
        outputs = self._f_dist_info(observations)
        mean = outputs["mean"]
        log_std = outputs["log_std"]
        rnd = np.random.normal(size=mean.shape)
        actions = rnd * np.exp(log_std) + mean
        return actions, outputs

    def log_diagnostics(self, paths):
        log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        """
        We set the distribution to the policy itself since we need behavior
        different from a usual diagonal Gaussian distribution.
        """
        return self

    @property
    def state_info_keys(self):
        return self._latent_keys
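# StochasticGaussianMLPPolicy treats the joint density of (action, latents) as a
# product, so its log likelihood is the diagonal-Gaussian action term plus one term
# per latent variable. A self-contained NumPy sketch of that sum for a single
# Bernoulli latent follows; the function name, the distributions, and the values are
# assumptions for illustration, not the policy's actual latent layers.
def _joint_log_likelihood_demo():
    import numpy as np
    action = np.array([0.2, -0.5])
    mean = np.array([0.0, 0.0])
    log_std = np.array([0.1, 0.1])
    # diagonal Gaussian log likelihood of the action
    z = (action - mean) / np.exp(log_std)
    action_logli = (-0.5 * np.sum(z ** 2) - np.sum(log_std)
                    - 0.5 * len(action) * np.log(2 * np.pi))
    # one Bernoulli latent sampled with probability p
    latent, p = 1.0, 0.3
    latent_logli = latent * np.log(p) + (1 - latent) * np.log(1 - p)
    # the policy's log_likelihood returns the sum of the two terms
    return action_logli + latent_logli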