def __init__(self, simulated=False):
    """
    :param simulated: bool, True if the environment is simulated,
        False if it drives the real robot
    """
    Serializable.quick_init(self, locals())
    # Seed numpy's global RNG so environment randomness is reproducible
    # (constructing a RandomState without keeping it has no effect).
    np.random.seed(get_seed())
    self._initial_setup()
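# A minimal standalone sketch (not part of the class) of why the constructor
# seeds numpy's global RNG: with a fixed seed, repeated draws are
# reproducible. `get_seed` here is a hypothetical stand-in returning a
# fixed experiment seed.
import numpy as np

def get_seed():
    return 42  # assumed fixed experiment seed

np.random.seed(get_seed())
first = np.random.uniform(size=3)
np.random.seed(get_seed())
second = np.random.uniform(size=3)
assert np.allclose(first, second)  # identical draws after re-seeding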
def populate_task(self, env, policy, scope=None):
    logger.log("Populating workers...")
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] *
            singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        g = parallel_sampler._get_scoped_g(singleton_pool.G, scope)
        g.env = env
        g.policy = policy
    parallel_sampler.set_seed(ext.get_seed())
    logger.log("Populated")
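# A minimal sketch (hypothetical, not the garage API) of the scoped-global
# pattern populate_task relies on: each worker keeps per-scope attributes on
# a shared "G" object so multiple samplers can coexist without clobbering
# each other's env/policy. `_G` and `_get_scoped_g` below are illustrative
# stand-ins for parallel_sampler's internals.
class _G:
    pass

def _get_scoped_g(g, scope):
    # Lazily create a child namespace per scope.
    if scope is None:
        return g
    if not hasattr(g, scope):
        setattr(g, scope, _G())
    return getattr(g, scope)

g = _G()
scoped = _get_scoped_g(g, 'worker_a')
scoped.env = 'env-instance'  # stands in for a real environment object
assert _get_scoped_g(g, 'worker_a').env == 'env-instance'
assert _get_scoped_g(g, None) is g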
def _build_graph(self, from_latent_input, from_obs_input):
    action_dim = self.action_space.flat_dim

    with self._variable_scope:
        with tf.variable_scope("concat_latent_obs"):
            latent_obs_input = tf.concat(
                [from_latent_input, from_obs_input], axis=-1)

        with tf.variable_scope("dist_params"):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)
                b = tf.constant_initializer(b)
                mean_std_network = mlp(
                    with_input=latent_obs_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    # hidden_w_init=tf.orthogonal_initializer(1.0),
                    # output_w_init=tf.orthogonal_initializer(1.0),
                    output_b_init=b,
                    name="mean_std_network")
                with tf.variable_scope("mean_network"):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope("std_network"):
                    std_network = mean_std_network[..., action_dim:]
            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    with_input=latent_obs_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    name="mean_network")

                # std network
                if self._adaptive_std:
                    b = tf.constant_initializer(self._init_std_param)
                    std_network = mlp(
                        with_input=latent_obs_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        output_nonlinearity=self._output_nonlinearity,
                        output_b_init=b,
                        name="std_network")
                else:
                    p = tf.constant_initializer(self._init_std_param)
                    std_network = parameter(
                        with_input=latent_obs_input,
                        length=action_dim,
                        initializer=p,
                        trainable=self._learn_std,
                        name="std_network")

        mean_var = mean_network
        std_param_var = std_network

        with tf.variable_scope("std_limits"):
            # compare against None so a limit of 0.0 is not silently skipped
            if self._min_std_param is not None:
                std_param_var = tf.maximum(std_param_var,
                                           self._min_std_param)
            if self._max_std_param is not None:
                std_param_var = tf.minimum(std_param_var,
                                           self._max_std_param)

        with tf.variable_scope("std_parameterization"):
            # build std_var with std parameterization
            if self._std_parameterization == "exp":
                std_var = tf.exp(std_param_var)
            elif self._std_parameterization == "softplus":
                std_var = tf.log(1. + tf.exp(std_param_var))
            else:
                raise NotImplementedError

        dist = tf.contrib.distributions.MultivariateNormalDiag(
            mean_var, std_var)

        action_var = dist.sample(seed=ext.get_seed())

        return action_var, mean_var, std_param_var, dist
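# A small numeric check (plain numpy, standalone) of the two std
# parameterizations used above: exp maps the parameter p to e**p, while
# softplus maps it to log(1 + e**p). Both stay positive, but softplus grows
# roughly linearly for large p, which can make the std easier to optimize.
import numpy as np

p = np.array([-2.0, 0.0, 2.0])
std_exp = np.exp(p)                     # exp parameterization
std_softplus = np.log(1. + np.exp(p))   # softplus parameterization
print(std_exp)       # [0.135  1.     7.389]
print(std_softplus)  # [0.127  0.693  2.127]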
def _build(self, state_input):
    action_dim = self._output_dim

    with tf.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable
            b = tf.constant_initializer(b)
            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                output_b_init=b,
                name='mean_std_network')
            with tf.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.variable_scope('std_network'):
                std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                name='mean_network')

            # std network
            if self._adaptive_std:
                b = tf.constant_initializer(self._init_std_param)
                std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_b_init=b,
                    name='std_network')
            else:
                p = tf.constant_initializer(self._init_std_param)
                std_network = parameter(
                    state_input,
                    length=action_dim,
                    initializer=p,
                    trainable=self._learn_std,
                    name='std_network')

    mean_var = mean_network
    log_std_var = std_network

    with tf.variable_scope('std_parameterization'):
        # keep log_std_var in log-space under either parameterization
        if self._std_parameterization == 'exp':
            pass
        elif self._std_parameterization == 'softplus':
            softplus_std_var = tf.log(1. + tf.exp(log_std_var))
            log_std_var = tf.log(softplus_std_var)
        else:
            raise NotImplementedError

    with tf.variable_scope('std_limits'):
        # compare against None so a limit of 0.0 is not silently skipped
        if self._min_std_param is not None:
            log_std_var = tf.maximum(log_std_var, self._min_std_param)
        if self._max_std_param is not None:
            log_std_var = tf.minimum(log_std_var, self._max_std_param)

    distribution = tfp.distributions.MultivariateNormalDiag(
        mean_var, tf.exp(log_std_var))

    action_var = distribution.sample(seed=ext.get_seed())

    return action_var, log_std_var, distribution
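# A quick numeric illustration (plain numpy, standalone) of the std_limits
# clamp above: bounding the *log* std keeps the std itself inside
# [exp(min), exp(max)], so it can neither collapse to zero nor blow up.
import numpy as np

log_std = np.array([-5.0, 0.0, 3.0])
min_log_std, max_log_std = -2.0, 2.0  # assumed example limits
clamped = np.clip(log_std, min_log_std, max_log_std)
print(np.exp(clamped))  # stds bounded to [exp(-2), exp(2)] ~ [0.135, 7.389]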
def _build(self, state_input):
    action_dim = self._output_dim

    with tf.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable
            b = tf.constant_initializer(b)
            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                output_b_init=b,
                name='mean_std_network')
            with tf.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.variable_scope('std_network'):
                std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                name='mean_network')

            # std network
            if self._adaptive_std:
                b = tf.constant_initializer(self._init_std_param)
                std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_b_init=b,
                    name='std_network')
            else:
                p = tf.constant_initializer(self._init_std_param)
                std_network = parameter(
                    state_input,
                    length=action_dim,
                    initializer=p,
                    trainable=self._learn_std,
                    name='std_network')

    mean_var = mean_network
    std_param_var = std_network

    with tf.variable_scope('std_parameterization'):
        # build the log std with the chosen parameterization
        if self._std_parameterization == 'exp':
            pass
        elif self._std_parameterization == 'softplus':
            # take the log of the softplus std so that tf.exp(std_var)
            # below recovers the std itself
            std_param_var = tf.log(tf.log(1. + tf.exp(std_param_var)))
        else:
            raise NotImplementedError

    with tf.variable_scope('std_limits'):
        # clamp sequentially and compare against None so both limits
        # apply and a limit of 0.0 is not skipped
        std_var = std_param_var
        if self._min_std_param is not None:
            std_var = tf.maximum(std_var, self._min_std_param)
        if self._max_std_param is not None:
            std_var = tf.minimum(std_var, self._max_std_param)

    dist = DiagonalGaussian(action_dim)

    rnd = tf.random.normal(
        shape=mean_var.get_shape().as_list()[1:], seed=ext.get_seed())
    action_var = rnd * tf.exp(std_var) + mean_var

    return action_var, mean_var, std_var, std_param_var, dist
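# A standalone numpy sketch of the reparameterized sampling used above:
# drawing eps ~ N(0, I) and computing mean + eps * std yields a sample from
# N(mean, diag(std**2)) while keeping the mean and std as explicit,
# differentiable inputs (the values below are illustrative only).
import numpy as np

rng = np.random.RandomState(0)
mean = np.array([0.5, -1.0])
log_std = np.array([0.0, -0.5])
eps = rng.normal(size=mean.shape)      # noise, independent of parameters
action = mean + eps * np.exp(log_std)  # reparameterized sample
print(action)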