def __init__(self,
             output_dim,
             hidden_dim=32,
             name=None,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             recurrent_nonlinearity=tf.nn.sigmoid,
             recurrent_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             hidden_state_init=tf.zeros_initializer(),
             hidden_state_init_trainable=False,
             cell_state_init=tf.zeros_initializer(),
             cell_state_init_trainable=False,
             forget_bias=True,
             learn_std=True,
             init_std=1.0,
             std_share_network=False,
             layer_normalization=False):
    super().__init__(name)
    self._output_dim = output_dim
    self._hidden_dim = hidden_dim
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._recurrent_nonlinearity = recurrent_nonlinearity
    self._recurrent_w_init = recurrent_w_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._hidden_state_init = hidden_state_init
    self._hidden_state_init_trainable = hidden_state_init_trainable
    self._cell_state_init = cell_state_init
    self._cell_state_init_trainable = cell_state_init_trainable
    self._forget_bias = forget_bias
    self._layer_normalization = layer_normalization
    self._learn_std = learn_std
    self._std_share_network = std_share_network
    # pylint: disable=assignment-from-no-return
    self._init_std_param = np.log(init_std)

    self._initialize()
def _initialize(self):
    """Build policy to support sampling.

    After build, get_action_*() methods will be available.

    """
    obs_input = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, None, self.obs_dim))
    encoder_input = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, self._encoder.input_dim))
    latent_input = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, self._encoder.output_dim))

    with tf.compat.v1.variable_scope(self._encoder.name):
        encoder_dist = self._encoder.build(encoder_input,
                                           name='encoder').dist

    with tf.compat.v1.variable_scope('concat_obs_latent'):
        obs_latent_input = tf.concat([obs_input, latent_input], -1)
    dist, mean_var, log_std_var = super().build(
        obs_latent_input,
        # Must be named 'default' to match the tf default worker name.
        name='default').outputs

    embed_state_input = tf.concat([
        obs_input,
        encoder_dist.sample(seed=deterministic.get_tf_seed_stream())
    ], -1)
    dist_given_task, mean_g_t, log_std_g_t = super().build(
        embed_state_input, name='given_task').outputs

    self._f_dist_obs_latent = tf.compat.v1.get_default_session(
    ).make_callable([
        dist.sample(seed=deterministic.get_tf_seed_stream()), mean_var,
        log_std_var
    ],
                    feed_list=[obs_input, latent_input])
    self._f_dist_obs_task = tf.compat.v1.get_default_session(
    ).make_callable([
        dist_given_task.sample(seed=deterministic.get_tf_seed_stream()),
        mean_g_t, log_std_g_t
    ],
                    feed_list=[obs_input, encoder_input])
def __init__(self,
             env_spec,
             name='CategoricalMLPPolicy',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=tf.nn.softmax,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             layer_normalization=False):
    if not isinstance(env_spec.action_space, akro.Discrete):
        raise ValueError('CategoricalMLPPolicy only works '
                         'with akro.Discrete action space.')
    self._env_spec = env_spec
    self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.n
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization

    self._f_prob = None

    super().__init__(output_dim=self._action_dim,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     layer_normalization=layer_normalization,
                     name=name)

    self._initialize()
def __init__(self,
             env_spec,
             name='ContinuousMLPQFunction',
             hidden_sizes=(32, 32),
             action_merge_layer=-2,
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             layer_normalization=False):
    self._env_spec = env_spec
    self._hidden_sizes = hidden_sizes
    self._action_merge_layer = action_merge_layer
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization
    self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.flat_dim

    super().__init__(name=name,
                     output_dim=1,
                     hidden_sizes=hidden_sizes,
                     concat_layer=self._action_merge_layer,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     layer_normalization=layer_normalization)

    self._network = None

    self._initialize()
def __init__(self,
             env_spec,
             num_seq_inputs=1,
             name='ContinuousMLPBaseline',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             normalize_inputs=True):
    self._env_spec = env_spec
    self._normalize_inputs = normalize_inputs
    self._name = name

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        self._optimizer = make_optimizer(LbfgsOptimizer, **optimizer_args)
    else:
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

    super().__init__(input_shape=(env_spec.observation_space.flat_dim *
                                  num_seq_inputs, ),
                     output_dim=1,
                     name=name,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init)

    self._x_mean = None
    self._x_std = None
    self._y_hat = None

    self._initialize()
def cnn(input_var,
        filters,
        strides,
        name,
        padding,
        hidden_nonlinearity=tf.nn.relu,
        hidden_w_init=tf.initializers.glorot_uniform(
            seed=deterministic.get_tf_seed_stream()),
        hidden_b_init=tf.zeros_initializer()):
    """Convolutional neural network (CNN).

    Note:
        Based on 'NHWC' data format: [batch, height, width, channel].

    Args:
        input_var (tf.Tensor): Input tf.Tensor to the CNN.
        filters (Tuple[Tuple[int, Tuple[int, int]], ...]): Number and
            dimension of filters. For example, ((3, (3, 5)), (32, (3, 3)))
            means there are two convolutional layers. The filter for the
            first layer has 3 channels and its shape is (3 x 5), while the
            filter for the second layer has 32 channels and its shape is
            (3 x 3).
        strides (tuple[int]): The stride of the sliding window. For
            example, (1, 2) means there are two convolutional layers. The
            stride of the filter for the first layer is 1 and that of the
            second layer is 2.
        name (str): Network name, also the variable scope.
        padding (str): The type of padding algorithm to use, either 'SAME'
            or 'VALID'.
        hidden_nonlinearity (callable): Activation function for
            intermediate dense layer(s). It should return a tf.Tensor. Set
            it to None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight of
            intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias of
            intermediate dense layer(s). The function should return a
            tf.Tensor.

    Return:
        tf.Tensor: The output tf.Tensor of the CNN.

    """
    with tf.compat.v1.variable_scope(name):
        h = input_var
        for index, (filter_iter, stride) in enumerate(zip(filters, strides)):
            _stride = [1, stride, stride, 1]
            h = _conv(h, 'h{}'.format(index), filter_iter[1], filter_iter[0],
                      _stride, hidden_w_init, hidden_b_init, padding)
            if hidden_nonlinearity is not None:
                h = hidden_nonlinearity(h)

        # flatten
        dim = tf.reduce_prod(h.get_shape()[1:].as_list())
        return tf.reshape(h, [-1, dim])
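A minimal usage sketch for cnn (not from the source): the placeholder shape,
filter spec, and scope name below are illustrative assumptions, and it relies
on the `_conv` helper and `deterministic` module from the surrounding file.
Two conv layers are built with strides 1 and 2, then the output is flattened.

# Usage sketch (illustrative; shapes and names are assumptions).
input_ph = tf.compat.v1.placeholder(tf.float32,
                                    shape=(None, 84, 84, 3),
                                    name='obs_input')
flat_features = cnn(input_var=input_ph,
                    filters=((32, (3, 3)), (64, (3, 3))),
                    strides=(1, 2),
                    name='example_cnn',
                    padding='SAME')
# flat_features has shape (batch, height * width * channels).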
def _initialize_policy(self):
    """Initialize policy."""
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  self._obs_dim))
    dist, mean, log_std = self.build(state_input).outputs
    self._f_dist = tf.compat.v1.get_default_session().make_callable(
        [
            dist.sample(seed=deterministic.get_tf_seed_stream()), mean,
            log_std
        ],
        feed_list=[state_input])
def __init__(self,
             output_dim,
             hidden_dim,
             name=None,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             recurrent_nonlinearity=tf.nn.sigmoid,
             recurrent_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_nonlinearity=tf.nn.softmax,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             hidden_state_init=tf.zeros_initializer(),
             hidden_state_init_trainable=False,
             cell_state_init=tf.zeros_initializer(),
             cell_state_init_trainable=False,
             forget_bias=True,
             layer_normalization=False):
    super().__init__(
        output_dim=output_dim,
        hidden_dim=hidden_dim,
        name=name,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        recurrent_nonlinearity=recurrent_nonlinearity,
        recurrent_w_init=recurrent_w_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        hidden_state_init=hidden_state_init,
        hidden_state_init_trainable=hidden_state_init_trainable,
        cell_state_init=cell_state_init,
        cell_state_init_trainable=cell_state_init_trainable,
        forget_bias=forget_bias,
        layer_normalization=layer_normalization)
def __init__(self,
             output_dim,
             name='MLPModel',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             layer_normalization=False):
    super().__init__(name)
    self._output_dim = output_dim
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization
def __init__(self,
             input_dim,
             output_dim,
             filters,
             strides,
             padding,
             name=None,
             is_image=True,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=tf.nn.softmax,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             layer_normalization=False):
    super().__init__(name)
    self._is_image = is_image

    self._cnn_model = CNNModel(input_dim=input_dim,
                               filters=filters,
                               strides=strides,
                               padding=padding,
                               hidden_nonlinearity=hidden_nonlinearity,
                               name='CNNModel')
    self._mlp_model = CategoricalMLPModel(
        output_dim=output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        layer_normalization=layer_normalization,
        name='MLPModel')
def __init__(self,
             env_spec,
             name='ContinuousMLPPolicy',
             hidden_sizes=(64, 64),
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=tf.nn.tanh,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             layer_normalization=False):
    self._env_spec = env_spec
    action_dim = env_spec.action_space.flat_dim
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization
    self._obs_dim = env_spec.observation_space.flat_dim

    super().__init__(output_dim=action_dim,
                     name=name,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     layer_normalization=layer_normalization)

    self._initialize()
def _initialize(self):
    """Initialize encoder."""
    embedding_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      self._input_dim),
                                               name='default_encoder')
    with tf.compat.v1.variable_scope(self._name) as vs:
        self._variable_scope = vs
        self._network = self.model.build(embedding_input)
    self._f_dist = tf.compat.v1.get_default_session().make_callable(
        [
            self._network.dist.sample(
                seed=deterministic.get_tf_seed_stream()),
            self._network.mean, self._network.log_std
        ],
        feed_list=[embedding_input])
def __init__(self,
             filters,
             strides,
             padding,
             name=None,
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer()):
    super().__init__(name)
    self._filters = filters
    self._strides = strides
    self._padding = padding
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
def _initialize(self):
    """Initialize policy."""
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None) +
                                           self._obs_dim)
    if isinstance(self.env_spec.observation_space, akro.Image):
        augmented_state_input = tf.cast(state_input, tf.float32)
        augmented_state_input /= 255.0
    else:
        augmented_state_input = state_input
    dist = self.build(augmented_state_input).outputs
    self._f_prob = tf.compat.v1.get_default_session().make_callable(
        [
            tf.argmax(dist.sample(seed=deterministic.get_tf_seed_stream()),
                      -1), dist.probs
        ],
        feed_list=[state_input])
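The akro.Image branch above rescales uint8 pixel observations into [0, 1]
before they reach the network. A standalone numpy illustration of the same
rescaling (example values are assumptions):

# Illustrative only: mirrors the cast-and-divide above outside of TF.
import numpy as np

obs = np.array([[0, 127, 255]], dtype=np.uint8)  # raw pixel observations
scaled = obs.astype(np.float32) / 255.0          # -> [0.0, 0.498..., 1.0]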
def __init__(self,
             input_dim,
             filters,
             strides,
             name=None,
             padding='SAME',
             pool_strides=(2, 2),
             pool_shapes=(2, 2),
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer()):
    super().__init__(name)
    self._input_dim = input_dim
    self._filters = filters
    self._strides = strides
    self._padding = padding
    self._pool_strides = pool_strides
    self._pool_shapes = pool_shapes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
def _build(self, state_input, name=None):
    """Build model given input placeholder(s).

    Args:
        state_input (tf.Tensor): Placeholder for state input.
        name (str): Inner model name, also the variable scope of the
            inner model, if it exists. One example is
            garage.tf.models.Sequential.

    Return:
        tf.Tensor: Sampled action.
        tf.Tensor: Mean.
        tf.Tensor: Parameterized log_std.
        tf.Tensor: log_std.
        tfp.distributions.MultivariateNormalDiag: Distribution.

    """
    del name
    action_dim = self._output_dim

    with tf.compat.v1.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share a CNN
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ],
                               axis=0)  # yapf: disable

            mean_std_conv = cnn(
                input_var=state_input,
                filters=self._filters,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                strides=self._strides,
                padding=self._padding,
                name='mean_std_cnn')
            mean_std_network = mlp(
                mean_std_conv,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=tf.constant_initializer(b),
                name='mean_std_network',
                layer_normalization=self._layer_normalization)
            with tf.compat.v1.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.compat.v1.variable_scope('log_std_network'):
                log_std_network = mean_std_network[..., action_dim:]

        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_conv = cnn(input_var=state_input,
                            filters=self._filters,
                            hidden_nonlinearity=self._hidden_nonlinearity,
                            hidden_w_init=self._hidden_w_init,
                            hidden_b_init=self._hidden_b_init,
                            strides=self._strides,
                            padding=self._padding,
                            name='mean_cnn')
            mean_network = mlp(
                mean_conv,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=self._output_b_init,
                name='mean_network',
                layer_normalization=self._layer_normalization)

            # std network
            if self._adaptive_std:
                log_std_conv = cnn(
                    input_var=state_input,
                    filters=self._std_filters,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    strides=self._std_strides,
                    padding=self._std_padding,
                    name='log_std_cnn')
                log_std_network = mlp(
                    log_std_conv,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_w_init=self._std_output_w_init,
                    output_b_init=tf.constant_initializer(
                        self._init_std_param),
                    name='log_std_network',
                    layer_normalization=self._layer_normalization)
            else:
                log_std_network = parameter(
                    input_var=state_input,
                    length=action_dim,
                    initializer=tf.constant_initializer(
                        self._init_std_param),
                    trainable=self._learn_std,
                    name='log_std_network')

    mean_var = mean_network
    std_param = log_std_network

    with tf.compat.v1.variable_scope('std_limits'):
        if self._min_std_param is not None:
            std_param = tf.maximum(std_param, self._min_std_param)
        if self._max_std_param is not None:
            std_param = tf.minimum(std_param, self._max_std_param)

    with tf.compat.v1.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            log_std_var = std_param
        else:  # we know it must be softplus here
            log_std_var = tf.math.log(tf.math.log(1. + tf.exp(std_param)))

    dist = tfp.distributions.MultivariateNormalDiag(
        loc=mean_var, scale_diag=tf.exp(log_std_var))
    rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                           seed=deterministic.get_tf_seed_stream())
    action_var = rnd * tf.exp(log_std_var) + mean_var

    return action_var, mean_var, log_std_var, std_param, dist
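The sampling line above (rnd * tf.exp(log_std_var) + mean_var) is the
standard location-scale reparameterization of a diagonal Gaussian: a sample
is mu + sigma * eps with eps ~ N(0, I). A numpy sketch of the same
computation (values and seed are illustrative assumptions):

# Reparameterization sketch, numpy only (illustrative values).
import numpy as np

rng = np.random.default_rng(0)
mean = np.array([0.5, -0.5])
log_std = np.array([0.0, -1.0])
eps = rng.standard_normal(mean.shape)   # eps ~ N(0, I)
action = eps * np.exp(log_std) + mean   # same form as action_var above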
def __init__(self,
             env_spec,
             name='CategoricalLSTMPolicy',
             hidden_dim=32,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             recurrent_nonlinearity=tf.nn.sigmoid,
             recurrent_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_nonlinearity=tf.nn.softmax,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             hidden_state_init=tf.zeros_initializer(),
             hidden_state_init_trainable=False,
             cell_state_init=tf.zeros_initializer(),
             cell_state_init_trainable=False,
             state_include_action=True,
             forget_bias=True,
             layer_normalization=False):
    if not isinstance(env_spec.action_space, akro.Discrete):
        raise ValueError('CategoricalLSTMPolicy only works '
                         'with akro.Discrete action space.')

    self._env_spec = env_spec
    self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.n
    self._hidden_dim = hidden_dim
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._recurrent_nonlinearity = recurrent_nonlinearity
    self._recurrent_w_init = recurrent_w_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._hidden_state_init = hidden_state_init
    self._hidden_state_init_trainable = hidden_state_init_trainable
    self._cell_state_init = cell_state_init
    self._cell_state_init_trainable = cell_state_init_trainable
    self._forget_bias = forget_bias
    self._layer_normalization = layer_normalization
    self._state_include_action = state_include_action

    if state_include_action:
        self._input_dim = self._obs_dim + self._action_dim
    else:
        self._input_dim = self._obs_dim

    self._f_step_prob = None

    super().__init__(
        output_dim=self._action_dim,
        hidden_dim=self._hidden_dim,
        name=name,
        forget_bias=forget_bias,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        recurrent_nonlinearity=recurrent_nonlinearity,
        recurrent_w_init=recurrent_w_init,
        hidden_state_init=hidden_state_init,
        hidden_state_init_trainable=hidden_state_init_trainable,
        cell_state_init=cell_state_init,
        cell_state_init_trainable=cell_state_init_trainable,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        layer_normalization=layer_normalization)

    self._prev_actions = None
    self._prev_hiddens = None
    self._prev_cells = None
    self._init_hidden = None
    self._init_cell = None

    self._initialize_policy()
def __init__(self,
             env_spec,
             hidden_dim=32,
             name='GaussianGRUPolicy',
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             recurrent_nonlinearity=tf.nn.sigmoid,
             recurrent_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             hidden_state_init=tf.zeros_initializer(),
             hidden_state_init_trainable=False,
             learn_std=True,
             std_share_network=False,
             init_std=1.0,
             layer_normalization=False,
             state_include_action=True):
    if not isinstance(env_spec.action_space, akro.Box):
        raise ValueError('GaussianGRUPolicy only works with '
                         'akro.Box action space, but not {}'.format(
                             env_spec.action_space))
    self._env_spec = env_spec
    self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.flat_dim
    self._hidden_dim = hidden_dim
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._recurrent_nonlinearity = recurrent_nonlinearity
    self._recurrent_w_init = recurrent_w_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._hidden_state_init = hidden_state_init
    self._hidden_state_init_trainable = hidden_state_init_trainable
    self._learn_std = learn_std
    self._std_share_network = std_share_network
    self._init_std = init_std
    self._layer_normalization = layer_normalization
    self._state_include_action = state_include_action

    if state_include_action:
        self._input_dim = self._obs_dim + self._action_dim
    else:
        self._input_dim = self._obs_dim

    self._f_step_mean_std = None

    super().__init__(
        output_dim=self._action_dim,
        hidden_dim=hidden_dim,
        name=name,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        recurrent_nonlinearity=recurrent_nonlinearity,
        recurrent_w_init=recurrent_w_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        hidden_state_init=hidden_state_init,
        hidden_state_init_trainable=hidden_state_init_trainable,
        layer_normalization=layer_normalization,
        learn_std=learn_std,
        std_share_network=std_share_network,
        init_std=init_std)

    self._prev_actions = None
    self._prev_hiddens = None
    self._init_hidden = None

    self._initialize_policy()
def _build_entropy_terms(self, i):
    """Build policy entropy tensor.

    Args:
        i (namedtuple): Collection of variables to compute policy loss.

    Returns:
        tf.Tensor: Policy entropy.

    """
    pol_dist = self._policy_network.dist
    infer_dist = self._infer_network.dist
    enc_dist = self._encoder_network.dist
    with tf.name_scope('entropy_terms'):
        # 1. Encoder distribution total entropy
        with tf.name_scope('encoder_entropy'):
            encoder_dist, _, _ = self.policy.encoder.build(
                i.task_var, name='encoder_entropy').outputs
            encoder_all_task_entropies = -encoder_dist.log_prob(
                i.latent_var)

            encoder_entropy = encoder_all_task_entropies
            if self._use_softplus_entropy:
                encoder_entropy = tf.nn.softplus(encoder_entropy)

            encoder_entropy = tf.reduce_mean(encoder_entropy,
                                             name='encoder_entropy')
            encoder_entropy = tf.stop_gradient(encoder_entropy)

        # 2. Inference distribution cross-entropy (log-likelihood)
        with tf.name_scope('inference_ce'):
            # Build inference with trajectory windows
            traj_ll = infer_dist.log_prob(
                enc_dist.sample(seed=deterministic.get_tf_seed_stream()),
                name='traj_ll')

            inference_ce_raw = -traj_ll
            inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

            if self._use_softplus_entropy:
                inference_ce = tf.nn.softplus(inference_ce)

            if self._stop_ce_gradient:
                inference_ce = tf.stop_gradient(inference_ce)

        # 3. Policy path entropies
        with tf.name_scope('policy_entropy'):
            policy_entropy = -pol_dist.log_prob(i.action_var,
                                                name='policy_log_likeli')

            # This prevents entropy from becoming negative
            # for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            policy_entropy = tf.stop_gradient(policy_entropy)

    # Diagnostic functions
    self._f_task_entropies = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        encoder_all_task_entropies)
    self._f_encoder_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), encoder_entropy)
    self._f_inference_ce = compile_function(
        flatten_inputs(self._policy_opt_inputs),
        tf.reduce_mean(inference_ce * i.valid_var))
    self._f_policy_entropy = compile_function(
        flatten_inputs(self._policy_opt_inputs), policy_entropy)

    return encoder_entropy, inference_ce, policy_entropy
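Each of the three terms above is a per-sample negative log-probability under
a tfp distribution: the encoder's entropy estimate, the inference network's
cross-entropy against latents sampled from the encoder, and the policy's
path entropy. A self-contained sketch of the pattern (distribution
parameters are illustrative assumptions; runs eagerly under TF2):

# Sketch of the log-prob pattern used above (illustrative values).
import tensorflow_probability as tfp

enc = tfp.distributions.MultivariateNormalDiag(loc=[0., 0.],
                                               scale_diag=[1., 1.])
inf = tfp.distributions.MultivariateNormalDiag(loc=[.1, -.1],
                                               scale_diag=[1., 1.])
z = enc.sample()
encoder_entropy_sample = -enc.log_prob(z)  # term 1: encoder entropy estimate
inference_ce_sample = -inf.log_prob(z)     # term 2: inference cross-entropy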
def __init__(self,
             env_spec,
             filters,
             strides,
             hidden_sizes=(256, ),
             action_merge_layer=-2,
             name=None,
             padding='SAME',
             max_pooling=False,
             pool_strides=(2, 2),
             pool_shapes=(2, 2),
             cnn_hidden_nonlinearity=tf.nn.relu,
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             layer_normalization=False):
    if (not isinstance(env_spec.observation_space, akro.Box)
            or not len(env_spec.observation_space.shape) in (2, 3)):
        raise ValueError(
            '{} can only process 2D, 3D akro.Image or'
            ' akro.Box observations, but received an env_spec with '
            'observation_space of type {} and shape {}'.format(
                type(self).__name__,
                type(env_spec.observation_space).__name__,
                env_spec.observation_space.shape))

    self._env_spec = env_spec
    self._filters = filters
    self._strides = strides
    self._hidden_sizes = hidden_sizes
    self._action_merge_layer = action_merge_layer
    self._padding = padding
    self._max_pooling = max_pooling
    self._pool_strides = pool_strides
    self._pool_shapes = pool_shapes
    self._cnn_hidden_nonlinearity = cnn_hidden_nonlinearity
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization

    self._obs_dim = self._env_spec.observation_space.shape
    self._action_dim = self._env_spec.action_space.shape

    super().__init__(
        name=name,
        filters=self._filters,
        strides=self._strides,
        hidden_sizes=self._hidden_sizes,
        action_merge_layer=self._action_merge_layer,
        padding=self._padding,
        max_pooling=self._max_pooling,
        pool_strides=self._pool_strides,
        pool_shapes=self._pool_shapes,
        cnn_hidden_nonlinearity=self._cnn_hidden_nonlinearity,
        hidden_nonlinearity=self._hidden_nonlinearity,
        hidden_w_init=self._hidden_w_init,
        hidden_b_init=self._hidden_b_init,
        output_nonlinearity=self._output_nonlinearity,
        output_w_init=self._output_w_init,
        output_b_init=self._output_b_init,
        layer_normalization=self._layer_normalization)

    self._initialize()
def __init__(self,
             env_spec,
             name='GaussianMLPPolicy',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             learn_std=True,
             adaptive_std=False,
             std_share_network=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=tf.nn.tanh,
             std_output_nonlinearity=None,
             std_parameterization='exp',
             layer_normalization=False):
    if not isinstance(env_spec.action_space, akro.Box):
        raise ValueError('GaussianMLPPolicy only works with '
                         'akro.Box action space, but not {}'.format(
                             env_spec.action_space))

    self._env_spec = env_spec
    self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.flat_dim
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._learn_std = learn_std
    self._adaptive_std = adaptive_std
    self._std_share_network = std_share_network
    self._init_std = init_std
    self._min_std = min_std
    self._max_std = max_std
    self._std_hidden_sizes = std_hidden_sizes
    self._std_hidden_nonlinearity = std_hidden_nonlinearity
    self._std_output_nonlinearity = std_output_nonlinearity
    self._std_parameterization = std_parameterization
    self._layer_normalization = layer_normalization

    self._f_dist = None

    super().__init__(output_dim=self._action_dim,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=min_std,
                     max_std=max_std,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_hidden_nonlinearity,
                     std_output_nonlinearity=std_output_nonlinearity,
                     std_parameterization=std_parameterization,
                     layer_normalization=layer_normalization,
                     name=name)

    self._initialize_policy()
def __init__(self,
             embedding_spec,
             name='GaussianMLPEncoder',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             learn_std=True,
             adaptive_std=False,
             std_share_network=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=tf.nn.tanh,
             std_output_nonlinearity=None,
             std_parameterization='exp',
             layer_normalization=False):
    super().__init__(name)

    self._embedding_spec = embedding_spec
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._learn_std = learn_std
    self._adaptive_std = adaptive_std
    self._std_share_network = std_share_network
    self._init_std = init_std
    self._min_std = min_std
    self._max_std = max_std
    self._std_hidden_sizes = std_hidden_sizes
    self._std_hidden_nonlinearity = std_hidden_nonlinearity
    self._std_output_nonlinearity = std_output_nonlinearity
    self._std_parameterization = std_parameterization
    self._layer_normalization = layer_normalization

    self._latent_dim = embedding_spec.output_space.flat_dim
    self._input_dim = embedding_spec.input_space.flat_dim
    self._network = None
    self._f_dist = None

    self.model = GaussianMLPModel(
        output_dim=self._latent_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        learn_std=learn_std,
        adaptive_std=adaptive_std,
        std_share_network=std_share_network,
        init_std=init_std,
        min_std=min_std,
        max_std=max_std,
        std_hidden_sizes=std_hidden_sizes,
        std_hidden_nonlinearity=std_hidden_nonlinearity,
        std_output_nonlinearity=std_output_nonlinearity,
        std_parameterization=std_parameterization,
        layer_normalization=layer_normalization,
        name='GaussianMLPModel')

    self._initialize()
def mlp(input_var,
        output_dim,
        hidden_sizes,
        name,
        input_var2=None,
        concat_layer=-2,
        hidden_nonlinearity=tf.nn.relu,
        hidden_w_init=tf.initializers.glorot_uniform(
            seed=deterministic.get_tf_seed_stream()),
        hidden_b_init=tf.zeros_initializer(),
        output_nonlinearity=None,
        output_w_init=tf.initializers.glorot_uniform(
            seed=deterministic.get_tf_seed_stream()),
        output_b_init=tf.zeros_initializer(),
        layer_normalization=False):
    """Multi-layer perceptron (MLP).

    It maps real-valued inputs to real-valued outputs.

    Args:
        input_var (tf.Tensor): Input tf.Tensor to the MLP.
        output_dim (int): Dimension of the network output.
        hidden_sizes (list[int]): Output dimension of dense layer(s).
            For example, (32, 32) means this MLP consists of two
            hidden layers, each with 32 hidden units.
        name (str): Network name, also the variable scope.
        input_var2 (tf.Tensor): Second input tf.Tensor to the MLP if input
            needs to be concatenated with a layer in the model.
        concat_layer (int): The index of layers at which to concatenate
            input_var2 with the network. If input_var2 is not supplied, this
            argument is ignored. The indexing works like standard python
            list indexing: an index of 0 refers to the input layer
            (input_var) while an index of -1 points to the last hidden
            layer. The default parameter points to the second layer from
            the end. If the model has only one layer, input_var2 is
            concatenated with that layer.
        hidden_nonlinearity (callable): Activation function for
            intermediate dense layer(s). It should return a tf.Tensor. Set
            it to None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight of
            intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias of
            intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output
            dense layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight of
            output dense layer(s). The function should return a tf.Tensor.
        output_b_init (callable): Initializer function for the bias of
            output dense layer(s). The function should return a tf.Tensor.
        layer_normalization (bool): Bool for using layer normalization or
            not.

    Return:
        tf.Tensor: The output tf.Tensor of the MLP.

    """
    n_layers = len(hidden_sizes) + 1
    _merge_inputs = False

    if input_var2 is not None:
        _merge_inputs = True
        if n_layers > 1:
            _concat_layer = (concat_layer % n_layers + n_layers) % n_layers
        else:
            _concat_layer = 0

    with tf.compat.v1.variable_scope(name):
        l_hid = input_var
        for idx, hidden_size in enumerate(hidden_sizes):
            if _merge_inputs and idx == _concat_layer:
                l_hid = tf.keras.layers.concatenate([l_hid, input_var2])

            l_hid = tf.compat.v1.layers.dense(inputs=l_hid,
                                              units=hidden_size,
                                              activation=hidden_nonlinearity,
                                              kernel_initializer=hidden_w_init,
                                              bias_initializer=hidden_b_init,
                                              name='hidden_{}'.format(idx))
            if layer_normalization:
                l_hid = tf.keras.layers.LayerNormalization()(l_hid)

        if _merge_inputs and _concat_layer == len(hidden_sizes):
            l_hid = tf.keras.layers.concatenate([l_hid, input_var2])

        l_out = tf.compat.v1.layers.dense(inputs=l_hid,
                                          units=output_dim,
                                          activation=output_nonlinearity,
                                          kernel_initializer=output_w_init,
                                          bias_initializer=output_b_init,
                                          name='output')
    return l_out
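A usage sketch for mlp as a Q-network head (illustrative; placeholder shapes
and names are assumptions). With two hidden layers, n_layers is 3 and
concat_layer=-2 resolves to index 1, so input_var2 is concatenated with the
output of the first hidden layer:

# Usage sketch (illustrative; shapes and names are assumptions).
obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 8), name='obs')
act_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 2), name='act')
q_val = mlp(input_var=obs_ph,
            output_dim=1,
            hidden_sizes=(32, 32),
            name='example_q_mlp',
            input_var2=act_ph,
            concat_layer=-2)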
def __init__(self,
             env_spec,
             filters,
             strides,
             hidden_sizes=(256, ),
             name=None,
             padding='SAME',
             max_pooling=False,
             pool_strides=(2, 2),
             pool_shapes=(2, 2),
             cnn_hidden_nonlinearity=tf.nn.relu,
             hidden_nonlinearity=tf.nn.relu,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             dueling=False,
             layer_normalization=False):
    if (not isinstance(env_spec.observation_space, akro.Box)
            or not len(env_spec.observation_space.shape) in (2, 3)):
        raise ValueError(
            '{} can only process 2D, 3D akro.Image or'
            ' akro.Box observations, but received an env_spec with '
            'observation_space of type {} and shape {}'.format(
                type(self).__name__,
                type(env_spec.observation_space).__name__,
                env_spec.observation_space.shape))

    self._env_spec = env_spec
    self._action_dim = env_spec.action_space.n
    self._filters = filters
    self._strides = strides
    self._hidden_sizes = hidden_sizes
    self._padding = padding
    self._max_pooling = max_pooling
    self._pool_strides = pool_strides
    self._pool_shapes = pool_shapes
    self._cnn_hidden_nonlinearity = cnn_hidden_nonlinearity
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization
    self._dueling = dueling

    self.obs_dim = self._env_spec.observation_space.shape
    action_dim = self._env_spec.action_space.flat_dim

    if not max_pooling:
        cnn_model = CNNModel(input_dim=self.obs_dim,
                             filters=filters,
                             strides=strides,
                             padding=padding,
                             hidden_nonlinearity=cnn_hidden_nonlinearity)
    else:
        cnn_model = CNNModelWithMaxPooling(
            input_dim=self.obs_dim,
            filters=filters,
            strides=strides,
            padding=padding,
            pool_strides=pool_strides,
            pool_shapes=pool_shapes,
            hidden_nonlinearity=cnn_hidden_nonlinearity)
    if not dueling:
        output_model = MLPModel(output_dim=action_dim,
                                hidden_sizes=hidden_sizes,
                                hidden_nonlinearity=hidden_nonlinearity,
                                hidden_w_init=hidden_w_init,
                                hidden_b_init=hidden_b_init,
                                output_nonlinearity=output_nonlinearity,
                                output_w_init=output_w_init,
                                output_b_init=output_b_init,
                                layer_normalization=layer_normalization)
    else:
        output_model = MLPDuelingModel(
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            layer_normalization=layer_normalization)

    super().__init__(cnn_model, output_model, name=name)

    self._network = None
    self._initialize()
def __init__(self,
             env_spec,
             num_seq_inputs=1,
             name='GaussianMLPBaseline',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             optimizer=None,
             optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01,
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             std_nonlinearity=None,
             layer_normalization=False,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.0):
    self._env_spec = env_spec
    self._num_seq_inputs = num_seq_inputs
    self._use_trust_region = use_trust_region
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._subsample_factor = subsample_factor

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        if use_trust_region:
            self._optimizer = make_optimizer(PenaltyLBFGSOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(LBFGSOptimizer,
                                             **optimizer_args)
    else:
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

    super().__init__(name=name,
                     input_shape=(env_spec.observation_space.flat_dim *
                                  num_seq_inputs, ),
                     output_dim=1,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=None,
                     max_std=None,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_nonlinearity,
                     std_output_nonlinearity=None,
                     std_parameterization='exp',
                     layer_normalization=layer_normalization)

    # model for the old distribution, used when the trust region is on
    self._old_model = self.clone_model(name=name + '_old_model')
    self._x_mean = None
    self._x_std = None
    self._y_mean = None
    self._y_std = None
    self._old_network = None

    self._initialize()
def __init__(self,
             output_dim,
             name=None,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             learn_std=True,
             adaptive_std=False,
             std_share_network=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=tf.nn.tanh,
             std_hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             std_hidden_b_init=tf.zeros_initializer(),
             std_output_nonlinearity=None,
             std_output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             std_parameterization='exp',
             layer_normalization=False):
    # Network parameters
    super().__init__(name)
    self._hidden_sizes = hidden_sizes
    self._output_dim = output_dim
    self._learn_std = learn_std
    self._adaptive_std = adaptive_std
    self._std_share_network = std_share_network
    self._std_hidden_sizes = std_hidden_sizes
    self._init_std = init_std
    self._min_std = min_std
    self._max_std = max_std
    self._std_hidden_nonlinearity = std_hidden_nonlinearity
    self._std_hidden_w_init = std_hidden_w_init
    self._std_hidden_b_init = std_hidden_b_init
    self._std_output_nonlinearity = std_output_nonlinearity
    self._std_output_w_init = std_output_w_init
    self._std_parameterization = std_parameterization
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization

    # Transform std arguments to parameterized space
    self._init_std_param = None
    self._min_std_param = None
    self._max_std_param = None
    # pylint: disable=assignment-from-no-return
    if self._std_parameterization == 'exp':
        self._init_std_param = np.log(init_std)
        if min_std is not None:
            self._min_std_param = np.log(min_std)
        if max_std is not None:
            self._max_std_param = np.log(max_std)
    elif self._std_parameterization == 'softplus':
        self._init_std_param = np.log(np.exp(init_std) - 1)
        if min_std is not None:
            self._min_std_param = np.log(np.exp(min_std) - 1)
        if max_std is not None:
            self._max_std_param = np.log(np.exp(max_std) - 1)
    else:
        raise ValueError("std parameterization should be 'exp' or "
                         "'softplus' but got {}".format(
                             self._std_parameterization))
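A quick numpy check of the two transforms above (illustrative): the stored
parameter is the value that the chosen parameterization maps back to
init_std, i.e. log(sigma) for 'exp' and the inverse softplus for 'softplus'.

# Sketch verifying the inverse transforms (illustrative values).
import numpy as np

init_std = 1.0
exp_param = np.log(init_std)             # 'exp': sigma = exp(param)
sp_param = np.log(np.exp(init_std) - 1)  # 'softplus': sigma = log(1 + exp(param))
assert np.isclose(np.exp(exp_param), init_std)
assert np.isclose(np.log1p(np.exp(sp_param)), init_std)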
def __init__(self,
             output_dim,
             filters,
             strides,
             padding,
             hidden_sizes,
             name=None,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             learn_std=True,
             adaptive_std=False,
             std_share_network=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_filters=(),
             std_strides=(),
             std_padding='SAME',
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=tf.nn.tanh,
             std_hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             std_hidden_b_init=tf.zeros_initializer(),
             std_output_nonlinearity=None,
             std_output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             std_parameterization='exp',
             layer_normalization=False):
    # Network parameters
    super().__init__(name)
    self._output_dim = output_dim
    self._filters = filters
    self._strides = strides
    self._padding = padding
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._learn_std = learn_std
    self._adaptive_std = adaptive_std
    self._std_share_network = std_share_network
    self._init_std = init_std
    self._min_std = min_std
    self._max_std = max_std
    self._std_filters = std_filters
    self._std_strides = std_strides
    self._std_padding = std_padding
    self._std_hidden_sizes = std_hidden_sizes
    self._std_hidden_nonlinearity = std_hidden_nonlinearity
    self._std_hidden_w_init = std_hidden_w_init
    self._std_hidden_b_init = std_hidden_b_init
    self._std_output_nonlinearity = std_output_nonlinearity
    self._std_output_w_init = std_output_w_init
    self._std_parameterization = std_parameterization
    self._layer_normalization = layer_normalization

    # Transform std arguments to parameterized space
    self._init_std_param = None
    self._min_std_param = None
    self._max_std_param = None
    if self._std_parameterization == 'exp':
        self._init_std_param = np.log(init_std)
        if min_std is not None:
            self._min_std_param = np.log(min_std)
        if max_std is not None:
            self._max_std_param = np.log(max_std)
    elif self._std_parameterization == 'softplus':
        self._init_std_param = np.log(np.exp(init_std) - 1)
        if min_std is not None:
            self._min_std_param = np.log(np.exp(min_std) - 1)
        if max_std is not None:
            self._max_std_param = np.log(np.exp(max_std) - 1)
    else:
        raise NotImplementedError
def __init__(self,
             env_spec,
             encoder,
             name='GaussianMLPTaskEmbeddingPolicy',
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             learn_std=True,
             adaptive_std=False,
             std_share_network=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=tf.nn.tanh,
             std_output_nonlinearity=None,
             std_parameterization='exp',
             layer_normalization=False):
    assert isinstance(env_spec.action_space, akro.Box)
    assert not isinstance(env_spec.observation_space, akro.Dict)

    self._env_spec = env_spec
    self._name = name
    self._encoder = encoder
    self._augmented_observation_space = akro.concat(
        self._env_spec.observation_space, self.task_space)
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._learn_std = learn_std
    self._adaptive_std = adaptive_std
    self._std_share_network = std_share_network
    self._init_std = init_std
    self._min_std = min_std
    self._max_std = max_std
    self._std_hidden_sizes = std_hidden_sizes
    self._std_hidden_nonlinearity = std_hidden_nonlinearity
    self._std_output_nonlinearity = std_output_nonlinearity
    self._std_parameterization = std_parameterization
    self._layer_normalization = layer_normalization

    self.obs_dim = env_spec.observation_space.flat_dim
    self.action_dim = env_spec.action_space.flat_dim

    super().__init__(output_dim=self.action_dim,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=min_std,
                     max_std=max_std,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_hidden_nonlinearity,
                     std_output_nonlinearity=std_output_nonlinearity,
                     std_parameterization=std_parameterization,
                     layer_normalization=layer_normalization,
                     name=name)

    self._initialize()
def __init__(self,
             env_spec,
             filters,
             strides,
             padding,
             hidden_sizes,
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             name='GaussianCNNBaseline',
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_filters=(),
             std_strides=(),
             std_padding='SAME',
             std_hidden_sizes=(),
             std_hidden_nonlinearity=None,
             std_output_nonlinearity=None,
             layer_normalization=False,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.,
             optimizer=None,
             optimizer_args=None,
             use_trust_region=True,
             max_kl_step=0.01):
    if (not isinstance(env_spec.observation_space, akro.Box)
            or not len(env_spec.observation_space.shape) in (2, 3)):
        raise ValueError(
            '{} can only process 2D, 3D akro.Image or'
            ' akro.Box observations, but received an env_spec with '
            'observation_space of type {} and shape {}'.format(
                type(self).__name__,
                type(env_spec.observation_space).__name__,
                env_spec.observation_space.shape))

    self._env_spec = env_spec
    self._use_trust_region = use_trust_region
    self._subsample_factor = subsample_factor
    self._max_kl_step = max_kl_step
    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs

    if optimizer_args is None:
        optimizer_args = dict()
    if optimizer is None:
        if use_trust_region:
            self._optimizer = make_optimizer(PenaltyLbfgsOptimizer,
                                             **optimizer_args)
        else:
            self._optimizer = make_optimizer(LbfgsOptimizer,
                                             **optimizer_args)
    else:
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

    super().__init__(input_shape=env_spec.observation_space.shape,
                     output_dim=1,
                     filters=filters,
                     strides=strides,
                     padding=padding,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=None,
                     max_std=None,
                     std_filters=std_filters,
                     std_strides=std_strides,
                     std_padding=std_padding,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_hidden_nonlinearity,
                     std_output_nonlinearity=std_output_nonlinearity,
                     std_parameterization='exp',
                     layer_normalization=layer_normalization,
                     name=name)

    # model for the old distribution, used when the trust region is on
    self._old_model = self.clone_model(name=name + '_old_model')
    self._old_network = None

    self._x_mean = None
    self._x_std = None
    self._y_mean = None
    self._y_std = None

    self._initialize()
def __init__(self,
             input_shape,
             output_dim,
             filters,
             strides,
             padding,
             hidden_sizes,
             name='GaussianCNNRegressorModel',
             hidden_nonlinearity=tf.nn.tanh,
             hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             hidden_b_init=tf.zeros_initializer(),
             output_nonlinearity=None,
             output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             output_b_init=tf.zeros_initializer(),
             learn_std=True,
             adaptive_std=False,
             std_share_network=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_filters=(),
             std_strides=(),
             std_padding='SAME',
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=tf.nn.tanh,
             std_hidden_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             std_hidden_b_init=tf.zeros_initializer(),
             std_output_nonlinearity=None,
             std_output_w_init=tf.initializers.glorot_uniform(
                 seed=deterministic.get_tf_seed_stream()),
             std_parameterization='exp',
             layer_normalization=False):
    super().__init__(output_dim=output_dim,
                     filters=filters,
                     strides=strides,
                     padding=padding,
                     hidden_sizes=hidden_sizes,
                     hidden_nonlinearity=hidden_nonlinearity,
                     hidden_w_init=hidden_w_init,
                     hidden_b_init=hidden_b_init,
                     output_nonlinearity=output_nonlinearity,
                     output_w_init=output_w_init,
                     output_b_init=output_b_init,
                     learn_std=learn_std,
                     adaptive_std=adaptive_std,
                     std_share_network=std_share_network,
                     init_std=init_std,
                     min_std=min_std,
                     max_std=max_std,
                     std_filters=std_filters,
                     std_strides=std_strides,
                     std_padding=std_padding,
                     std_hidden_sizes=std_hidden_sizes,
                     std_hidden_nonlinearity=std_hidden_nonlinearity,
                     std_hidden_w_init=std_hidden_w_init,
                     std_hidden_b_init=std_hidden_b_init,
                     std_output_nonlinearity=std_output_nonlinearity,
                     std_output_w_init=std_output_w_init,
                     std_parameterization=std_parameterization,
                     layer_normalization=layer_normalization,
                     name=name)
    self._input_shape = input_shape