def initialize(self, inputs, losses, constraints, target, givens=None, lr_mult=1):
    """Build and compile the two-step synchronous-gradient training functions.

    ``self._f_grad`` computes the local gradient and writes it, flattened,
    into a GPU-resident shared vector; after an external all-reduce over that
    vector, ``self._f_update`` applies the combined gradient to the params.

    :param inputs: symbolic input variables for the loss computation
    :param losses: list of symbolic losses; summed into a single objective
    :param constraints: accepted for interface compatibility; unused here
    :param target: parameterized object exposing ``get_params(trainable=True)``
    :param givens: optional ``givens`` substitutions for the gradient function
    :param lr_mult: learning-rate multiplier (may be a theano shared variable)
    """
    self._target = target
    loss = sum(losses)
    params = target.get_params(trainable=True)
    gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')
    # Phase 1: Compute gradient and save to GPU vector.
    # (flat_grad itself is unused below; only the update into shared_grad is.)
    flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
    self._shared_grad = shared_grad
    # Phase 2: All-reduce gradient in-place in shared_grad (performed outside
    # this graph), then reshape back into per-parameter tensors.
    gradients, avg_factor_var = avg_grads_from_flat(shared_grad, params)
    self._avg_factor_var = avg_factor_var  # (set later as 1 / n_gpu)
    # Phase 3: Apply combined gradient locally.
    gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
    lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
    updates = self._update_method(gradients, params, learning_rate=lr)
    self._f_grad = ext.compile_function(
        inputs=inputs,
        outputs=loss,
        updates=[flat_update],
        givens=givens,
        log_name="gradient",
    )
    self._f_update = ext.compile_function(
        inputs=[],
        outputs=grad_norm,
        updates=updates,
        log_name="update",
    )
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the
     :class:`rllab.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: optional additional symbolic input variables
    :return: No return value.
    """
    self._target = target
    updates = self._update_method(loss, target.get_params(trainable=True))
    # Cast each update expression to its target variable's dtype.
    # Fix: dict.iteritems() does not exist in Python 3; use .items()
    # (consistent with the sibling implementation of this method).
    updates = OrderedDict([(k, v.astype(k.dtype)) for k, v in updates.items()])
    if extra_inputs is None:
        extra_inputs = list()
    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(inputs + extra_inputs, loss),
        f_opt=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            updates=updates,
        ),
    )
def initialize(self, env_spec, **kwargs):
    """Build the CNN policy/value network and compile inference functions.

    Deferred from ``__init__`` so it runs after GPU initialization.
    Compiles ``_f_prob``, ``_f_value``, and ``_f_prob_value`` over the
    shared network output layers.

    :param env_spec: environment spec; action space must be ``Discrete``
    """
    # Wait to do this until GPU is initialized.
    assert isinstance(env_spec.action_space, Discrete)
    # Retrieve constructor arguments stashed on self.
    s = retrieve_args(self)
    network = PgCnn(
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.n,
        conv_filters=s.conv_filters,
        conv_filter_sizes=s.conv_filter_sizes,
        conv_strides=s.conv_strides,
        conv_pads=s.conv_pads,
        hidden_sizes=s.hidden_sizes,
        hidden_nonlinearity=s.hidden_nonlinearity,
        output_pi_nonlinearity=s.output_pi_nonlinearity,
        pixel_scale=s.pixel_scale,
        name="atari_cnn",
    )
    self._l_obs = network.input_layer
    input_var = network.input_layer.input_var
    # network.output_layers presumably yields (pi, V) heads — shared trunk.
    prob, value = L.get_output(network.output_layers)
    self._f_prob = ext.compile_function([input_var], prob)
    self._f_value = ext.compile_function([input_var], value)
    self._f_prob_value = ext.compile_function([input_var], [prob, value])
    self._dist = Categorical(env_spec.action_space.n)
    self._network = network
    super().initialize(env_spec, network=network, **kwargs)
    self.param_short_names = \
        [shorten_param_name(p.name) for p in network.get_params(trainable=True)]
def init_opt(self):
    """Build DDPG training functions for the Q-function and the policy.

    Creates frozen "target" copies of the policy and Q-function (by deep
    copy via pickle), then compiles ``f_train_qf`` (regression of Q toward
    externally computed y targets) and ``f_train_policy`` (maximizes Q of
    the deterministic policy action). Results stored in ``self.opt_info``.
    """
    # First, create "target" policy and Q functions (deep copies; their
    # parameters are updated elsewhere, not by these training functions).
    target_policy = pickle.loads(pickle.dumps(self.policy))
    target_qf = pickle.loads(pickle.dumps(self.qf))
    # y need to be computed first
    obs = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1,
    )
    # The yi values are computed separately as above and then passed to
    # the training functions below
    action = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )
    yvar = TT.vector('ys')
    # L2 weight decay over regularizable Q-function parameters.
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([TT.sum(TT.square(param))
             for param in self.qf.get_params(regularizable=True)])
    qval = self.qf.get_qval_sym(obs, action)
    qf_loss = TT.mean(TT.square(yvar - qval))
    qf_reg_loss = qf_loss + qf_weight_decay_term
    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([TT.sum(TT.square(param))
             for param in self.policy.get_params(regularizable=True)])
    # Policy surrogate: negated mean Q of the policy's (deterministic) action,
    # so minimizing it ascends the Q landscape.
    policy_qval = self.qf.get_qval_sym(
        obs, self.policy.get_action_sym(obs), deterministic=True)
    policy_surr = -TT.mean(policy_qval)
    policy_reg_surr = policy_surr + policy_weight_decay_term
    qf_updates = self.qf_update_method(
        qf_reg_loss, self.qf.get_params(trainable=True))
    policy_updates = self.policy_update_method(
        policy_reg_surr, self.policy.get_params(trainable=True))
    f_train_qf = ext.compile_function(
        inputs=[yvar, obs, action],
        outputs=[qf_loss, qval],
        updates=qf_updates)
    f_train_policy = ext.compile_function(
        inputs=[obs],
        outputs=policy_surr,
        updates=policy_updates)
    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
    )
def initialize(self, inputs, loss, target, priority_expr, givens=None, lr_mult=1):
    """Build the synchronous gradient/update pair for a prioritized learner.

    ``self._f_gradient`` computes the loss/priority outputs and writes the
    flattened gradient into a GPU shared vector; after an external
    all-reduce, ``self._f_update`` applies it to the parameters.

    :param inputs: symbolic inputs to the loss
    :param loss: symbolic scalar loss
    :param target: parameterized object exposing ``get_params(trainable=True)``
    :param priority_expr: symbolic expression returned for priority updates
        (presumably per-sample TD errors — confirm against caller)
    :param givens: optional ``givens`` substitutions
    :param lr_mult: learning-rate multiplier (may be a shared variable)
    """
    self._target = target
    params = target.get_params(trainable=True)
    gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")
    if self._scale_conv_grads:
        # Scale conv-layer gradients by 1/sqrt(2).
        gradients = scale_conv_gradients(params, gradients, scale=2 ** (-1 / 2))
    # Compute gradient and save to GPU vector.
    flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
    self._shared_grad = shared_grad
    # All-reduce gradient in-place in shared_grad, then reshape
    gradients, avg_factor_var = avg_grads_from_flat(shared_grad, params)
    self._avg_factor_var = avg_factor_var
    gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
    lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
    updates = self._update_method(gradients, params, learning_rate=lr)
    self._f_gradient = ext.compile_function(
        inputs=inputs,
        outputs=[priority_expr, loss],
        updates=[flat_update],
        givens=givens,
        log_name="gradient",
    )
    self._f_update = ext.compile_function(
        inputs=[],
        updates=updates,
        log_name="update",
    )
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    """Prepare lazily-compiled loss and optimization functions.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over; must implement
     methods of the :class:`rllab.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: optional additional symbolic input variables
    :return: No return value.
    """
    self._target = target
    raw_updates = self._update_method(loss, target.get_params(trainable=True))
    # Cast every update expression to the dtype of the variable it targets.
    updates = OrderedDict(
        (var, expr.astype(var.dtype)) for var, expr in raw_updates.items()
    )
    extra_inputs = list() if extra_inputs is None else extra_inputs
    all_inputs = inputs + extra_inputs
    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(all_inputs, loss),
        f_opt=lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=loss,
            updates=updates,
        ),
    )
def initialize(self, inputs, losses, constraints, target, givens=None, lr_mult=1):
    """Compile GPU-resident data loading and combined grad/update functions.

    ``self._f_load`` copies the input arrays into shared GPU storage once;
    ``self._f_opt`` then runs gradient computation and the parameter update
    against that stored data (optionally shuffled on-device).

    :param inputs: symbolic input variables for the loss
    :param losses: list of symbolic losses; summed into a single objective
    :param constraints: accepted for interface compatibility; unused here
    :param target: parameterized object exposing ``get_params(trainable=True)``
    :param givens: ignored — overwritten by the shared-input givens below
    :param lr_mult: learning-rate multiplier (may be a shared variable)
    """
    self._target = target
    loss = sum(losses)
    params = target.get_params(trainable=True)
    gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')
    gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
    lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
    updates = self._update_method(gradients, params, learning_rate=lr)
    # Prepare to load data onto GPU (and shuffle indexes there).
    # NOTE: this rebinds `givens`, discarding the caller-supplied argument.
    load_updates, givens, opt_inputs = make_shared_inputs(
        inputs, self._shuffle)
    self._f_load = ext.compile_function(
        inputs=inputs,
        updates=load_updates,
        log_name="load",
    )
    self._f_opt = ext.compile_function(
        inputs=opt_inputs,
        outputs=[loss, grad_norm],
        updates=updates,
        givens=givens,
        log_name="grad_and_update",
    )
def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs):
    """Prepare lazily-compiled loss and (loss, flat-gradient) functions.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over; must implement
     methods of the :class:`rllab.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: optional additional symbolic input variables
    :return: No return value.
    """
    self._target = target
    if extra_inputs is None:
        extra_inputs = list()
    full_inputs = inputs + extra_inputs

    def _loss_and_flat_grad():
        # Cast to float64 — presumably required by the scipy-based outer
        # optimizer; confirm against the consumer of f_opt.
        grad_vec = flatten_tensor_variables(
            theano.grad(loss, target.get_params(trainable=True)))
        return [loss.astype('float64'), grad_vec.astype('float64')]

    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(full_inputs, loss),
        f_opt=lambda: compile_function(
            inputs=full_inputs,
            outputs=_loss_and_flat_grad(),
        ),
    )
def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs):
    """Set up lazily-built loss and optimization functions for `target`.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over; must implement
     methods of the :class:`rllab.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: optional additional symbolic input variables
    :return: No return value.
    """
    self._target = target
    extra_inputs = [] if extra_inputs is None else extra_inputs

    def _opt_outputs():
        # Flattened gradient alongside the loss, both as float64 —
        # presumably what the downstream (scipy) optimizer expects.
        trainables = target.get_params(trainable=True)
        flat = flatten_tensor_variables(theano.grad(loss, trainables))
        return [loss.astype('float64'), flat.astype('float64')]

    combined = inputs + extra_inputs
    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(combined, loss),
        f_opt=lambda: compile_function(
            inputs=combined,
            outputs=_opt_outputs(),
        ),
    )
def initialize(self, env_spec, alternating_sampler=False):
    """Build the CNN-RNN policy/value network and compile step functions.

    Deferred from ``__init__`` so it runs after GPU initialization.
    Compiles single-step functions that take the observation plus previous
    hidden states and return action probabilities, value, and/or new
    hidden states.

    :param env_spec: environment spec; action space must be ``Discrete``
    :param alternating_sampler: forwarded to the superclass initializer
    """
    # Wait to do this until GPU is initialized.
    assert isinstance(env_spec.action_space, Discrete)
    # Retrieve constructor arguments stashed on self.
    s = retrieve_args(self)
    network = PgCnnRnn(
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.n,
        conv_filters=s.conv_filters,
        conv_filter_sizes=s.conv_filter_sizes,
        conv_strides=s.conv_strides,
        conv_pads=s.conv_pads,
        conv_nonlinearity=s.conv_nonlinearity,
        hidden_sizes=s.hidden_sizes,
        hidden_nonlinearity=s.hidden_nonlinearity,
        output_pi_nonlinearity=s.output_pi_nonlinearity,
        pixel_scale=s.pixel_scale,
        name="atari_cnn_rnn",
    )
    self._l_obs = network.input_layer
    input_var = network.input_layer.input_var
    prev_hidden_vars = [lay.input_var for lay in network.prev_hidden_layers]
    # "step" mode: one-timestep outputs (vs. whole-sequence training mode).
    prob, value = L.get_output(network.output_layers, step_or_train="step")
    hidden_vars = L.get_output(network.recurrent_layers, step_or_train="step")
    self._f_act = ext.compile_function(
        inputs=[input_var] + prev_hidden_vars,
        outputs=[prob, value] + hidden_vars,
    )
    self._f_prob_value = ext.compile_function(
        inputs=[input_var] + prev_hidden_vars,
        outputs=[prob, value],
    )
    self._f_prob = ext.compile_function(
        inputs=[input_var] + prev_hidden_vars,
        outputs=prob,
    )
    self._f_value = ext.compile_function(
        inputs=[input_var] + prev_hidden_vars,
        outputs=value,
    )
    self._f_hidden = ext.compile_function(
        inputs=[input_var] + prev_hidden_vars,
        outputs=hidden_vars,
    )
    self._dist = Categorical(env_spec.action_space.n)
    super().initialize(env_spec, network, alternating_sampler)
    self.param_short_names = \
        [shorten_param_name(p.name) for p in network.get_params(trainable=True)]
    self._network = network
    # Keys used to label previous-hidden-state inputs, one per recurrent layer.
    self._hprev_keys = ["hprev_{}".format(i)
                        for i in range(len(network.recurrent_layers))]
    self.hid_init_params = self._network.hid_init_params
def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None,
               constraint_name="constraint", *args, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the :class:`rllab.core.paramerized.Parameterized`
     class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon),
     of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be
     subsampled if needed. It is assumed that the first dimension of these
     inputs should correspond to the number of data points
    :param extra_inputs: A list of symbolic variables as extra inputs which
     should not be subsampled
    :return: No return value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=params)
    flat_grad = ext.flatten_tensor_variables(grads)

    constraint_grads = theano.grad(constraint_term, wrt=params)
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])
    # Hessian-vector product via the Pearlmutter trick: grad of <grad(f), x>.
    # Fix: itertools.izip was removed in Python 3 — builtin zip is lazy there.
    Hx_plain_splits = TT.grad(
        TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
        wrt=params)
    Hx_plain = TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    if self._debug_nan:
        from theano.compile.nanguardmode import NanGuardMode
        mode = NanGuardMode(nan_is_error=True, inf_is_error=True,
                            big_is_error=True)
    else:
        mode = None

    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
            mode=mode,
        ),
        f_grad=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
            mode=mode,
        ),
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + extra_inputs + xs,
            outputs=Hx_plain,
            log_name="f_Hx_plain",
            mode=mode,
        ),
        f_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
            mode=mode,
        ),
        f_loss_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
            mode=mode,
        ),
    )
def initialize(self, inputs, losses, constraints, target, lr_mult=1):
    """Compile the three-phase (gradient / central-update / copy-back) functions.

    Phase 1 (``_f_gradient``) computes and flattens the local gradient into a
    shared GPU vector; Phase 2 applies the update rule (e.g. rmsprop) to the
    central parameters, either whole or in chunks; Phase 3 (``_f_copy``)
    copies the new parameter values from the flat vector back into the
    per-parameter variables.

    :param inputs: symbolic inputs to the loss
    :param losses: list of symbolic losses; summed into a single objective
    :param constraints: accepted for interface compatibility; unused here
    :param target: parameterized object exposing ``get_params(trainable=True)``
    :param lr_mult: learning-rate multiplier
    """
    self._target = target
    loss = sum(losses)
    params = target.get_params(trainable=True)
    gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')
    gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
    # Phase 1: Compute gradient and save to GPU vector
    flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
    # Phase 2: apply gradient chunks to update central params; e.g. rmsprop
    lr = self._learning_rate * lr_mult
    if self.n_update_chunks > 1:
        updates_args = (shared_grad, lr, self.n_update_chunks,
                        self._update_method_args)
        chunk_inputs, outputs_list, updates, idxs = \
            chunked_updates(self._update_method_name, updates_args)
        self._chunk_idxs = idxs
    else:
        whole_inputs, whole_outputs, whole_updates = \
            whole_update(self._update_method_name, shared_grad, lr,
                         self._update_method_args)
    # Phase 3: copy new param values from shared_grad to params
    copy_updates = copy_params_from_flat(params, shared_grad)
    # Phase 1
    self._f_gradient = ext.compile_function(
        inputs=inputs,
        outputs=[loss, grad_norm],
        updates=[flat_update],
        log_name="gradient",
    )
    # Phase 2 (one compiled function per chunk when chunking is enabled)
    if self.n_update_chunks > 1:
        f_update_chunks = list()
        for i, (outputs, update) in enumerate(zip(outputs_list, updates)):
            f_update_chunks.append(ext.compile_function(
                inputs=chunk_inputs,
                outputs=outputs,
                updates=[update],
                log_name="update_chunk_{}".format(i))
            )
        self._f_update_chunks = f_update_chunks
    else:
        self._f_update = ext.compile_function(
            inputs=whole_inputs,
            outputs=whole_outputs,
            updates=whole_updates,
            log_name="update",
        )
    # Phase 3
    self._f_copy = ext.compile_function(
        inputs=[],
        updates=copy_updates,
        log_name="copy_params",
    )
def __init__(self, _p, inputs, s, costs, h=None, ha=None):
    '''Constructs and compiles the necessary Theano functions.

    p : list of Theano shared variables
        Parameters of the model to be optimized.
    inputs : list of Theano variables
        Symbolic variables that are inputs to your graph (they should also
        include your model 'output'). Your training examples must fit these.
    s : Theano variable
        Symbolic variable with respect to which the Hessian of the objective
        is positive-definite, implicitly defining the Gauss-Newton matrix.
        Typically, it is the activation of the output layer.
    costs : list of Theano variables
        Monitoring costs, the first of which will be the optimized objective.
    h: Theano variable or None
        Structural damping is applied to this variable (typically the hidden
        units of an RNN).
    ha: Theano variable or None
        Symbolic variable that implicitly defines the Gauss-Newton matrix
        for the structural damping term (typically the activation of the
        hidden layer). If None, it will be set to `h`.'''
    self.p = _p
    self.shapes = [i.get_value().shape for i in _p]
    # Fix: map() returns an iterator in Python 3; materialize to a list so
    # that `[0] + self.sizes` below concatenates instead of raising.
    self.sizes = list(map(numpy.prod, self.shapes))
    self.positions = numpy.cumsum([0] + self.sizes)[:-1]

    g = T.grad(costs[0], _p)
    g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
    self.f_gc = compile_function(inputs, g + costs)  # during gradient computation
    self.f_cost = compile_function(inputs, costs)  # for quick cost evaluation

    symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4
    v = [symbolic_types[len(i)]() for i in self.shapes]
    Gv = gauss_newton_product(costs[0], _p, v, s)

    coefficient = T.scalar()  # this is lambda*mu
    if h is not None:  # structural damping with cross-entropy
        # T.Rop does not support `consider_constant` yet, so use `givens`
        h_constant = symbolic_types[h.ndim]()
        structural_damping = coefficient * (
            -h_constant * T.log(h + 1e-10)
            - (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
        if ha is None:
            ha = h
        Gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
        Gv = [a + b for a, b in zip(Gv, Gv_damping)]
        givens = {h_constant: h}
    else:
        givens = {}

    self.function_Gv = compile_function(inputs + v + [coefficient], Gv,
                                        givens=givens)
def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None,
               constraint_name="constraint", *args, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the :class:`rllab.core.paramerized.Parameterized`
     class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon),
     of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be
     subsampled if needed. It is assumed that the first dimension of these
     inputs should correspond to the number of data points
    :param extra_inputs: A list of symbolic variables as extra inputs which
     should not be subsampled
    :return: No return value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
    flat_grad = ext.flatten_tensor_variables(grads)

    # Delegate Hessian-vector-product setup for the constraint (e.g. KL)
    # to the configured approximation strategy.
    self._hvp_approach.update_opt(f=constraint_term, target=target,
                                  inputs=inputs + extra_inputs,
                                  reg_coeff=self._reg_coeff)

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
        ),
        f_grad=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        f_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
        ),
        f_loss_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
        ),
    )
def __init__(
        self,
        env_spec,
        hidden_sizes=(),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        neat_output_dim=20,
        neat_network=None,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param num_seq_inputs: number of observations concatenated as input
    :param neat_output_dim: output dimension of the NEAT feature network
    :param neat_network: manually specified feature network (optional)
    :param prob_network: manually specified network for this policy, other
     network params are ignored
    :return:
    """
    # quick_init must see the raw constructor locals for serialization.
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    # create random NEAT MLP: maps observations to an intermediate feature
    # vector that the probability network consumes.
    if neat_network is None:
        neat_network = MLP(
            input_shape=(
                env_spec.observation_space.flat_dim * num_seq_inputs,),
            output_dim=neat_output_dim,
            hidden_sizes=(12, 12),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.identity,
        )

    if prob_network is None:
        prob_network = MLP(
            input_shape=(
                L.get_output_shape(neat_network.output_layer)[1],),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._phi = neat_network.output_layer
    self._obs = neat_network.input_layer
    self._neat_output = ext.compile_function(
        [neat_network.input_layer.input_var],
        L.get_output(neat_network.output_layer))

    self.prob_network = prob_network
    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))

    self._dist = Categorical(env_spec.action_space.n)

    super(PowerGradientPolicy, self).__init__(env_spec)
    # NOTE(review): only prob_network's layers are registered here, so the
    # NEAT network's parameters are not exposed — confirm this is intended.
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(self, _p, inputs, s, costs, h=None, ha=None):
    '''Constructs and compiles the necessary Theano functions.

    p : list of Theano shared variables
        Parameters of the model to be optimized.
    inputs : list of Theano variables
        Symbolic variables that are inputs to your graph (they should also
        include your model 'output'). Your training examples must fit these.
    s : Theano variable
        Symbolic variable with respect to which the Hessian of the objective
        is positive-definite, implicitly defining the Gauss-Newton matrix.
        Typically, it is the activation of the output layer.
    costs : list of Theano variables
        Monitoring costs, the first of which will be the optimized objective.
    h: Theano variable or None
        Structural damping is applied to this variable (typically the hidden
        units of an RNN).
    ha: Theano variable or None
        Symbolic variable that implicitly defines the Gauss-Newton matrix
        for the structural damping term (typically the activation of the
        hidden layer). If None, it will be set to `h`.'''
    self.p = _p
    self.shapes = [i.get_value().shape for i in _p]
    # list() needed: Python 3 map() is lazy and `[0] + sizes` needs a list.
    self.sizes = list(map(numpy.prod, self.shapes))
    self.positions = numpy.cumsum([0] + self.sizes)[:-1]

    g = T.grad(costs[0], _p)
    g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
    self.f_gc = compile_function(inputs, g + costs)  # during gradient computation
    self.f_cost = compile_function(inputs, costs)  # for quick cost evaluation

    symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4
    v = [symbolic_types[len(i)]() for i in self.shapes]
    Gv = gauss_newton_product(costs[0], _p, v, s)

    coefficient = T.scalar()  # this is lambda*mu
    if h is not None:  # structural damping with cross-entropy
        # T.Rop does not support `consider_constant` yet, so use `givens`
        h_constant = symbolic_types[h.ndim]()
        structural_damping = coefficient * (
            -h_constant * T.log(h + 1e-10)
            - (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
        if ha is None:
            ha = h
        Gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
        Gv = [a + b
              for a, b in zip(Gv, Gv_damping)]
        givens = {h_constant: h}
    else:
        givens = {}

    self.function_Gv = compile_function(inputs + v + [coefficient], Gv,
                                        givens=givens)
def update_opt(self, loss, target, leq_constraint, inputs,
               constraint_name="constraint", *args, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the :class:`rllab.core.paramerized.Parameterized`
     class.
    :param leq_constraint: constraints provided as a list of tuples
     (f, epsilon), each of the form f(*inputs) <= epsilon; their terms are
     summed into one penalty.
    :param inputs: A list of symbolic variables as inputs
    :return: No return value.
    """
    constraint_terms = [c[0] for c in leq_constraint]
    constraint_values = [c[1] for c in leq_constraint]
    penalty_var = TT.scalar("penalty")
    # Sum all constraint terms into a single penalty expression.
    penalty_loss = constraint_terms[0]
    for i in range(1, len(constraint_terms)):
        penalty_loss += constraint_terms[i]
    penalized_loss = loss + penalty_var * penalty_loss

    self._target = target
    self._max_constraint_vals = np.array(constraint_values)
    self._constraint_name = constraint_name

    def get_opt_output():
        # Loss and flattened gradient as float64, presumably for the
        # scipy-based inner optimizer — confirm against caller.
        flat_grad = flatten_tensor_variables(
            theano.grad(penalized_loss, target.get_params(trainable=True),
                        disconnected_inputs='ignore'))
        return [
            penalized_loss.astype('float64'),
            flat_grad.astype('float64')
        ]

    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(inputs, loss, log_name="f_loss"),
        f_constraint=lambda: compile_function(
            inputs, penalty_loss, log_name="f_constraint"),
        f_penalized_loss=lambda: compile_function(
            inputs=inputs + [penalty_var],
            outputs=[penalized_loss, loss] + constraint_terms,
            log_name="f_penalized_loss",
        ),
        f_opt=lambda: compile_function(
            inputs=inputs + [penalty_var],
            outputs=get_opt_output(),
            log_name="f_opt"))
def update_opt(self, loss, target, inputs, extra_inputs=None,
               diagnostic_vars=None, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the :class:`rllab.core.paramerized.Parameterized`
     class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: optional additional symbolic input variables
    :param diagnostic_vars: optional OrderedDict of named symbolic
     expressions to evaluate alongside the loss
    :return: No return value.
    """
    self.target = target

    if diagnostic_vars is None:
        diagnostic_vars = OrderedDict()

    params = target.get_params(trainable=True)
    gradients = theano.grad(loss, params, disconnected_inputs='ignore')
    if self.gradient_clipping is not None:
        # Element-wise clipping (not norm clipping).
        gradients = [
            TT.clip(g, -self.gradient_clipping, self.gradient_clipping)
            for g in gradients
        ]

    updates = self._update_method(gradients, target.get_params(trainable=True))
    # Cast each update to the dtype of the variable it targets.
    updates = OrderedDict([(k, v.astype(k.dtype)) for k, v in updates.items()])

    if extra_inputs is None:
        extra_inputs = list()

    self.input_vars = inputs + extra_inputs
    self.f_train = ext.compile_function(
        inputs=self.input_vars,
        outputs=[loss] + list(diagnostic_vars.values()),
        updates=updates,
    )
    # NOTE(review): "diagostics" looks like a typo of "diagnostics"; left
    # unchanged because external callers may reference this attribute name.
    self.f_loss_diagostics = ext.compile_function(
        inputs=self.input_vars,
        outputs=[loss] + list(diagnostic_vars.values()),
    )
    self.diagnostic_vars = diagnostic_vars
def update_opt(self, f, target, inputs, reg_coeff):
    """Prepare the lazily-compiled Hessian-vector-product function for `f`.

    :param f: symbolic expression whose Hessian is to be multiplied
    :param target: parameterized object providing the trainable params
    :param inputs: symbolic input variables for evaluating `f`
    :param reg_coeff: regularization coefficient, stored for later use
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    grads_of_f = theano.grad(f, wrt=params, disconnected_inputs='warn')
    xs = tuple(ext.new_tensor_like("%s x" % p.name, p) for p in params)

    def build_hx():
        # Pearlmutter trick: differentiating <grad(f), x> w.r.t. the params
        # yields the Hessian-vector product H x without forming H.
        inner = TT.sum([TT.sum(g * x) for g, x in zip(grads_of_f, xs)])
        splits = TT.grad(inner, wrt=params, disconnected_inputs='warn')
        return TT.concatenate([TT.flatten(piece) for piece in splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + xs,
            outputs=build_hx(),
            log_name="f_Hx_plain",
        ),
    )
def __init__(self, env_spec, hidden_sizes=(32, 32),
             hidden_nonlinearity=NL.tanh, prob_network=None):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
     network params are ignored
    :return:
    """
    # quick_init must see the raw constructor locals for serialization.
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    if prob_network is None:
        prob_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim,),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    # Compiled forward pass: observation batch -> action probabilities.
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def initialize(self, inputs, loss, target, priority_expr, givens=None, lr_mult=1):
    """Compile a single combined gradient-and-update training function.

    :param inputs: symbolic inputs to the loss
    :param loss: symbolic scalar loss
    :param target: parameterized object exposing get_params(trainable=True)
    :param priority_expr: symbolic expression returned for priority updates
    :param givens: optional `givens` substitutions for the compiled function
    :param lr_mult: learning-rate multiplier (may be a shared variable)
    """
    self._target = target
    trainable_params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=trainable_params,
                        disconnected_inputs="ignore")
    if self._scale_conv_grads:
        # (for dueling network architecture)
        grads = scale_conv_gradients(trainable_params, grads,
                                     scale=2 ** (-1 / 2))
    grads, grad_norm = apply_grad_norm_clip(grads, self._grad_norm_clip)
    # lr_mult may be a shared variable, so the rate can be changed later.
    effective_lr = self._learning_rate * lr_mult
    param_updates = self._update_method(grads, trainable_params,
                                        learning_rate=effective_lr)
    self._f_opt = ext.compile_function(
        inputs=inputs,
        outputs=[priority_expr, loss],
        updates=param_updates,
        givens=givens,
        log_name="grad_and_update",
    )
def init_opt(self):
    """Build the policy-gradient surrogate loss and KL diagnostic function.

    Constructs symbolic variables for observations, actions, and advantages
    (with an extra leading dimension and a `valid` mask when the policy is
    recurrent), hands the surrogate objective to ``self.optimizer``, and
    compiles ``f_kl`` for mean/max KL diagnostics in ``self.opt_info``.
    """
    is_recurrent = int(self.policy.recurrent)

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k,
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        ) for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        # Masked averages: only `valid` timesteps contribute.
        surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var)
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
    else:
        surr_obj = - TT.mean(logli * advantage_var)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

    input_list = [obs_var, action_var, advantage_var]
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list)

    f_kl = ext.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(
        f_kl=f_kl,
    )
def init_opt(self):
    """Build the policy-gradient surrogate loss and KL diagnostic function.

    Mirrors the standard VPG setup: symbolic obs/action/advantage inputs
    (plus a `valid` mask for recurrent policies), surrogate objective handed
    to ``self.optimizer``, and a compiled ``f_kl`` stored in ``self.opt_info``.
    """
    is_recurrent = int(self.policy.recurrent)

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor('advantage',
                                   ndim=1 + is_recurrent,
                                   dtype=theano.config.floatX)
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: ext.new_tensor('old_%s' % k,
                          ndim=2 + is_recurrent,
                          dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        # Masked averages: only `valid` timesteps contribute.
        surr_obj = -TT.sum(
            logli * advantage_var * valid_var) / TT.sum(valid_var)
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
    else:
        surr_obj = -TT.mean(logli * advantage_var)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

    input_list = [obs_var, action_var, advantage_var]
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list)

    f_kl = ext.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl, )
def initialize(self, env_spec, **kwargs):
    """Build DQN policy and target networks and compile helper functions.

    Deferred from ``__init__`` so it runs after GPU initialization. Creates
    two identically-shaped networks ("policy" and "target"), compiles
    Q-value / greedy-action / target-Q functions plus a hard target-sync
    function, and immediately syncs the target to the policy parameters.

    :param env_spec: environment spec; action space must be ``Discrete``
    """
    # Wait to do this until GPU is initialized
    assert isinstance(env_spec.action_space, Discrete)
    # Retrieve constructor arguments stashed on self.
    s = retrieve_args(self)
    network_args = dict(
        input_shape=env_spec.observation_space.shape,
        output_dim=env_spec.action_space.n,
        conv_filters=s.conv_filters,
        conv_filter_sizes=s.conv_filter_sizes,
        conv_strides=s.conv_strides,
        conv_pads=s.conv_pads,
        hidden_sizes=s.hidden_sizes,
        hidden_nonlinearity=s.hidden_nonlinearity,
        pixel_scale=s.pixel_scale,
        dueling=s.dueling,
        shared_last_bias=s.shared_last_bias,
    )
    policy_network = DqnCnn(name="policy", **network_args)
    target_network = DqnCnn(name="target", **network_args)
    self._l_obs = policy_network.input_layer
    self._l_target_obs = target_network.input_layer
    self._policy_network = policy_network
    self._target_network = target_network
    input_var = policy_network.input_layer.input_var
    q = L.get_output(policy_network.output_layer)
    self._f_q = ext.compile_function([input_var], q)
    greedy_action = T.argmax(q, axis=1)
    self._f_a = ext.compile_function([input_var], greedy_action)
    target_input_var = target_network.input_layer.input_var
    target_q = L.get_output(target_network.output_layer)
    self._f_target_q = ext.compile_function([target_input_var], target_q)
    policy_params = policy_network.get_params(trainable=True)
    target_params = target_network.get_params(trainable=True)
    # Hard sync: assign each policy parameter's value to its target twin.
    updates = [(t, p) for p, t in zip(policy_params, target_params)]
    self._f_update_target = ext.compile_function(inputs=[], updates=updates)
    self._f_update_target()  # start with target == policy
    super().initialize(env_spec, network=policy_network, **kwargs)
    self._epsilon = s.epsilon
    self.param_short_names = \
        [shorten_param_name(p.name) for p in policy_params]
def __init__(self, wrapped_constraint, env_spec, yield_zeros_until=1,
             optimizer=None, hidden_sizes=(32,),
             hidden_nonlinearity=NL.sigmoid, lag_time=10, coeff=1.,
             filter_bonuses=False, max_epochs=25, *args, **kwargs):
    """Bonus predictor that wraps another constraint object.

    Trains a small MLP (sigmoid output) by squared-error regression to
    predict targets derived from ``wrapped_constraint``; ``self._f_predict``
    evaluates the trained network on raw observations.

    :param wrapped_constraint: underlying constraint being approximated.
    :param env_spec: env spec; only ``observation_space.flat_dim`` is read here.
    :param yield_zeros_until: stored; presumably gates when bonuses start —
        semantics live in methods outside this view (TODO confirm).
    :param lag_time: stored for later use by the caller-side logic.
    :param coeff: stored scaling coefficient for the bonus.
    :param filter_bonuses: stored flag.
    :param max_epochs: epochs for the default FirstOrderOptimizer.
    """
    Serializable.quick_init(self, locals())
    self._wrapped_constraint = wrapped_constraint
    self._env_spec = env_spec
    self._filter_bonuses = filter_bonuses
    self._yield_zeros_until = yield_zeros_until
    self._hidden_sizes = hidden_sizes
    self._lag_time = lag_time
    self._coeff = coeff
    self._max_epochs = max_epochs
    self.use_bonus = True

    if optimizer is None:
        #optimizer = LbfgsOptimizer()
        optimizer = FirstOrderOptimizer(max_epochs=max_epochs,
                                        batch_size=None)
    self._optimizer = optimizer

    obs_dim = env_spec.observation_space.flat_dim
    # Single-output MLP; final sigmoid keeps predictions in (0, 1).
    predictor_network = MLP(1, hidden_sizes, hidden_nonlinearity, NL.sigmoid,
                            input_shape=(obs_dim,))
    LasagnePowered.__init__(self, [predictor_network.output_layer])

    x_var = predictor_network.input_layer.input_var
    y_var = TT.matrix("ys")
    out_var = L.get_output(predictor_network.output_layer,
                           {predictor_network.input_layer: x_var})

    # Plain mean-squared-error regression loss.
    regression_loss = TT.mean(TT.square(y_var - out_var))
    optimizer_args = dict(
        loss=regression_loss,
        target=self,
        inputs=[x_var, y_var],
    )
    self._optimizer.update_opt(**optimizer_args)

    self._f_predict = compile_function([x_var], out_var)
    self._fit_steps = 0

    # Pass through the wrapped constraint's baseline, if it has one.
    self.has_baseline = self._wrapped_constraint.has_baseline
    if self.has_baseline:
        self.baseline = self._wrapped_constraint.baseline
def __init__(self, env_spec, hidden_sizes=(32, ), state_include_action=True,
             hidden_nonlinearity=NL.tanh, output_b_init=None,
             weight_signal=1.0, weight_nonsignal=1.0, weight_smc=1.0):
    """
    :param env_spec: A spec for the env.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_b_init: initial output biases; if None, computed from the
        action names and the three weight_* arguments via
        compute_output_b_init.
    :return:
    """
    assert isinstance(env_spec.action_space, Discrete)
    Serializable.quick_init(self, locals())
    super(InitCategoricalGRUPolicy, self).__init__(env_spec)
    # Only a single recurrent layer is supported.
    assert len(hidden_sizes) == 1

    output_b_init = compute_output_b_init(env_spec.action_space.names,
                                          output_b_init, weight_signal,
                                          weight_nonsignal, weight_smc)

    # Optionally concatenate the (flattened) previous action to the
    # observation as network input.
    if state_include_action:
        input_shape = (env_spec.observation_space.flat_dim +
                       env_spec.action_space.flat_dim, )
    else:
        input_shape = (env_spec.observation_space.flat_dim, )

    prob_network = InitGRUNetwork(
        input_shape=input_shape,
        output_dim=env_spec.action_space.n,
        hidden_dim=hidden_sizes[0],
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=NL.softmax,
        output_b_init=output_b_init)

    self._prob_network = prob_network
    self._state_include_action = state_include_action

    # Single-step function: (observation, previous hidden state) ->
    # (action probabilities, new hidden state). Used when sampling.
    self._f_step_prob = ext.compile_function(
        [
            prob_network.step_input_layer.input_var,
            prob_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            prob_network.step_output_layer,
            prob_network.step_hidden_layer
        ]))

    self._prev_action = None
    self._prev_hidden = None
    self._hidden_sizes = hidden_sizes
    self._dist = RecurrentCategorical(env_spec.action_space.n)

    self.reset()

    LasagnePowered.__init__(self, [prob_network.output_layer])
def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None,
               constraint_name="constraint", *args, **kwargs):
    """Set up the constrained problem: minimize loss s.t. constraint <= value.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over (provides
        get_params).
    :param leq_constraint: tuple (constraint_term, constraint_value) for the
        inequality constraint_term <= constraint_value.
    :param inputs: symbolic input variables required by loss/constraint.
    :param extra_inputs: additional symbolic inputs appended to every
        compiled function.
    :param constraint_name: label used for logging/reporting elsewhere.
    :return: No return value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
    flat_grad = ext.flatten_tensor_variables(grads)

    # Hessian-vector-product machinery built around the constraint term.
    self._hvp_approach.update_opt(f=constraint_term, target=target,
                                  inputs=inputs + extra_inputs,
                                  reg_coeff=self._reg_coeff)

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    # Functions compile lazily on first use.
    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
        ),
        f_grad=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        f_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
        ),
        f_loss_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
        ),
    )
def __init__(self, env_spec, tau=0.9):
    """Scalar baseline tracked as an exponential moving average of returns.

    Each call to the compiled updater blends the stored baseline with a new
    empirical mean return: baseline <- tau * baseline + (1 - tau) * mean.

    :param env_spec: env spec (unused here; kept for interface uniformity).
    :param tau: smoothing factor; larger values change the baseline slower.
    """
    self._baseline = theano.shared(0.0, name="baseline")
    mean_return_var = T.scalar("empirical_return_mean")
    blended = (
        tau * self._baseline
        + (1 - tau) * mean_return_var
    )
    self._update_baseline = compile_function(
        inputs=[mean_return_var],
        updates={self._baseline: blended})
def incorporate_z(self, z):
    """Store the distributional atom support and compile the action function.

    Called from the algo while initializing. ``z`` is a Theano shared
    variable (1-D tensor) whose length must equal ``self.n_atoms``.
    """
    self.z = z  # (a Theano shared variable, 1-D tensor)
    assert len(z.get_value()) == self.n_atoms
    action_sym = self.actions_sym()
    self._f_a = ext.compile_function(
        inputs=[self._l_obs.input_var],
        outputs=action_sym,
    )
def __init__(
        self,
        env_spec,
        hidden_sizes=(32,),
        state_include_action=True,
        hidden_nonlinearity=NL.tanh):
    """
    :param env_spec: A spec for the env.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param state_include_action: if True, the flattened previous action is
        concatenated to the observation as the network input.
    :return:
    """
    assert isinstance(env_spec.action_space, Discrete)
    Serializable.quick_init(self, locals())
    super(CategoricalGRUPolicy, self).__init__(env_spec)
    # Only a single recurrent layer is supported.
    assert len(hidden_sizes) == 1

    if state_include_action:
        input_shape = (env_spec.observation_space.flat_dim +
                       env_spec.action_space.flat_dim,)
    else:
        input_shape = (env_spec.observation_space.flat_dim,)

    prob_network = GRUNetwork(
        input_shape=input_shape,
        output_dim=env_spec.action_space.n,
        hidden_dim=hidden_sizes[0],
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=NL.softmax,
    )

    self._prob_network = prob_network
    self._state_include_action = state_include_action

    # Single-step function used during sampling:
    # (observation, previous hidden) -> (action probs, new hidden).
    self._f_step_prob = ext.compile_function(
        [
            prob_network.step_input_layer.input_var,
            prob_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            prob_network.step_output_layer,
            prob_network.step_hidden_layer
        ])
    )

    self._prev_action = None
    self._prev_hidden = None
    self._hidden_sizes = hidden_sizes
    self._dist = RecurrentCategorical(env_spec.action_space.n)

    self.reset()

    LasagnePowered.__init__(self, [prob_network.output_layer])
def test_gru_network():
    """GRU network maps (batch, time, *input_shape) to (batch, time, output_dim)."""
    import numpy as np
    import lasagne.layers as L
    from rllab.core.network import GRUNetwork
    from rllab.misc import ext

    net = GRUNetwork(input_shape=(2, 3), output_dim=5, hidden_dim=4)
    out_sym = L.get_output(net.output_layer)
    f_output = ext.compile_function(
        inputs=[net.input_layer.input_var], outputs=out_sym)
    batch = np.zeros((6, 8, 2, 3))  # 6 sequences of 8 timesteps
    assert f_output(batch).shape == (6, 8, 5)
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        hidden_W_init=LI.HeUniform(),
        hidden_b_init=LI.Constant(0.),
        output_nonlinearity=NL.tanh,
        output_W_init=LI.Uniform(-3e-3, 3e-3),
        output_b_init=LI.Uniform(-3e-3, 3e-3),
        bn=False):
    """Deterministic MLP policy mapping flat observations to actions.

    Builds a stack of dense layers (optionally wrapped in batch norm) and
    compiles ``self._f_actions`` to evaluate actions from observations.

    :param env_spec: env spec giving observation/action flat dimensions.
    :param hidden_sizes: units per hidden layer.
    :param bn: if True, insert a batch-norm layer before each dense layer
        and after the last hidden layer.
    """
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim))

    layer = l_obs
    if bn:
        layer = batch_norm(layer)
    for idx, size in enumerate(hidden_sizes):
        layer = L.DenseLayer(
            layer,
            num_units=size,
            W=hidden_W_init,
            b=hidden_b_init,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % idx
        )
        if bn:
            layer = batch_norm(layer)
    l_output = L.DenseLayer(
        layer,
        num_units=env_spec.action_space.flat_dim,
        W=output_W_init,
        b=output_b_init,
        nonlinearity=output_nonlinearity,
        name="output"
    )

    # Note the deterministic=True argument. It makes sure that when getting
    # actions from single observations, we do not update params in the
    # batch normalization layers.
    action_var = L.get_output(l_output, deterministic=True)
    self._output_layer = l_output
    self._f_actions = ext.compile_function([l_obs.input_var], action_var)

    super(DeterministicMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [l_output])
def __init__(
        self,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=(),  # FIX: was a mutable default argument ([])
        hidden_nonlinearity=NL.rectify,
        output_nonlinearity=NL.softmax,
        prob_network=None,
        name=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param conv_filters: number of filters per conv layer.
    :param conv_filter_sizes: filter size per conv layer.
    :param conv_strides: stride per conv layer.
    :param conv_pads: padding per conv layer.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity of the output (probability) layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :param name: name for the auto-constructed network (defaults below)
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)
    self._env_spec = env_spec
    if prob_network is None:
        if not name:
            name = "categorical_conv_prob_network"
        prob_network = ConvNetwork(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name=name,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer

    # Compiled function: observation batch -> action probabilities.
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(
        self,
        env_spec,
        latent_dim=0,  # all this is fake
        latent_name='categorical',
        bilinear_integration=False,
        resample=False,  # until here
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param latent_dim: placeholder latent-variable args kept only so this
        class matches the interface of latent-aware policies; they do not
        affect the network built here.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    # Placeholder latent attributes (see note above); stored so get_action
    # and related methods can read them uniformly.
    self.latent_dim = latent_dim  # NOTE(review): could this self-storage be avoided for get_action?
    self.latent_name = latent_name
    self.bilinear_integration = bilinear_integration
    self.resample = resample
    self._set_std_to_0 = False
    # self._set_std_to_0 = True

    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    if prob_network is None:
        prob_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim, ),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer

    # Compiled function: observation batch -> action probabilities.
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))

    self._dist = Categorical(env_spec.action_space.n)

    self._layers = prob_network.layers  # Rui: added layers for function get_params()

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def update_opt(self, loss, target, leq_constraint, inputs,
               constraint_name="constraint", *args, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the
     :class:`rllab.core.paramerized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon),
     of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs
    :return: No return value.
    """
    constraint_term, constraint_value = leq_constraint

    # Penalty-method formulation: the constraint enters the objective
    # scaled by a runtime penalty coefficient.
    penalty_var = TT.scalar("penalty")
    penalized_loss = loss + penalty_var * constraint_term

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    def get_opt_output():
        # L-BFGS consumers expect float64 loss and flattened gradient.
        flat_grad = flatten_tensor_variables(theano.grad(
            penalized_loss, target.get_params(trainable=True),
            disconnected_inputs='ignore'
        ))
        return [penalized_loss.astype('float64'),
                flat_grad.astype('float64')]

    # All functions compile lazily on first access.
    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(inputs, loss, log_name="f_loss"),
        f_constraint=lambda: compile_function(inputs, constraint_term,
                                              log_name="f_constraint"),
        f_penalized_loss=lambda: compile_function(
            inputs=inputs + [penalty_var],
            outputs=[penalized_loss, loss, constraint_term],
            log_name="f_penalized_loss",
        ),
        f_opt=lambda: compile_function(
            inputs=inputs + [penalty_var],
            outputs=get_opt_output(),
            log_name="f_opt"
        )
    )
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        prob_network=None,
):
    """Categorical policy over a Product action space (one softmax head per
    action component).

    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param num_seq_inputs: number of stacked observations fed as input.
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Product)
    # Per-component action dimensions and their offsets into the flat output.
    self._Da = env_spec.action_space.comp_dim
    self._actnum = len(self._Da)
    self._slice = [0] + np.cumsum(self._Da).tolist()

    if prob_network is None:
        prob_network = MLP2(
            input_shape=(
                env_spec.observation_space.flat_dim * num_seq_inputs, ),
            output_sizes=self._Da,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    #self._l_prob = prob_network.output_layer
    self._l_prob = prob_network.output_layer
    #self._logits = [prob_network.output[:, self._slice[i]:self._slice[i + 1]] for i in range(self._actnum)]
    #l_probs = [NL.softmax(logit) for logit in self._logits]
    #self._l_prob = it.product(l_probs)
    self._l_obs = prob_network.input_layer

    # Compiled function: observation batch -> concatenated per-component
    # probabilities.
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))
    # self._f_prob = ext.compile_function([prob_network.input_layer.input_var], L.get_output(
    #     self._l_probs))

    self._dist = Categorical2(self._Da)

    super(CategoricalMLPPolicy2, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def update_opt(self, loss, target, inputs, extra_inputs=None, gradients=None,
               **kwargs):
    """Prepare lazily-compiled loss and flattened-gradient functions.

    :param loss: symbolic loss expression.
    :param target: parameterized object whose trainable params are
        differentiated against when ``gradients`` is not supplied.
    :param inputs: symbolic inputs for the compiled functions.
    :param extra_inputs: optional extra symbolic inputs, appended after
        ``inputs``.
    :param gradients: optional pre-computed gradient expressions; when None,
        taken from theano.grad (disconnected inputs ignored).
    """
    self._target = target

    if extra_inputs is None:
        extra_inputs = list()

    if gradients is None:
        trainable_params = target.get_params(trainable=True)
        gradients = theano.grad(loss, trainable_params,
                                disconnected_inputs='ignore')
    flat_grad = ext.flatten_tensor_variables(gradients)

    all_inputs = inputs + extra_inputs
    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            all_inputs, loss, log_name=self._name + "_f_loss"),
        f_grad=lambda: ext.compile_function(
            inputs=all_inputs,
            outputs=flat_grad,
            log_name=self._name + "_f_grad"),
    )
def test_gru_network():
    """Output of a GRU network has shape (batch, time, output_dim)."""
    from rllab.core.network import GRUNetwork
    import lasagne.layers as L
    from rllab.misc import ext
    import numpy as np

    network = GRUNetwork(
        input_shape=(2, 3),
        output_dim=5,
        hidden_dim=4,
    )
    compiled = ext.compile_function(
        inputs=[network.input_layer.input_var],
        outputs=L.get_output(network.output_layer),
    )
    result = compiled(np.zeros((6, 8, 2, 3)))
    expected_shape = (6, 8, 5)
    assert result.shape == expected_shape
def init_opt(self): """ Initialize the optimization procedure. If using theano, this may include declaring all the variables and compiling functions. """ # specify target policy target_policy = pickle.loads(pickle.dumps(self.policy)) target_var = TT.vector('target', dtype=theano.config.floatX) # building network obs_var = self.env.observation_space.new_tensor_variable('obs', extra_dims=1) action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1) _, qval_var_all_dict = self.policy.get_action_sym(obs_var) qval_var_all = qval_var_all_dict["action_values"] qval_var = qval_var_all[TT.arange(qval_var_all.shape[0]), action_var] #gradient clipping diff = target_var - qval_var if self.clip_gradient > 0: quadratic_part = TT.minimum(abs(diff), self.clip_gradient) linear_part = abs(diff) - quadratic_part loss = 0.5 * TT.square(quadratic_part) + self.clip_gradient * linear_part else: loss = 0.5 * TT.square(diff) loss_var = TT.mean(loss) params = self.policy.get_params(trainable=True) updates = self.update_method(loss_var, params) # debugging functions # also uncomment mode=theano.compile.MonitorMode(pre_func=inspect_inputs, post_func=inspect_outputs) # def inspect_inputs(i, node, fn): # print(i, node, "input(s) shape(s):", [input[0].shape for input in fn.inputs],end='') # def inspect_outputs(i, node, fn): # print(" output(s) shape(s):", [output[0].shape for output in fn.outputs]) f_train_policy = ext.compile_function( inputs=[obs_var, action_var, target_var], outputs=[loss_var, qval_var], updates=updates, name='f_train_policy', # mode=theano.compile.MonitorMode(pre_func=inspect_inputs, post_func=inspect_outputs) ) self.opt_info = dict( f_train_policy=f_train_policy, target_policy=target_policy )
def update_opt(self, f, target, inputs, reg_coeff):
    """Set up finite-difference Hessian-vector products for ``f``.

    :param f: symbolic scalar whose Hessian-vector products are required.
    :param target: parameterized object; products are taken w.r.t. its
        trainable parameters.
    :param inputs: symbolic inputs needed to evaluate the gradient of f.
    :param reg_coeff: regularization coefficient (stored for callers).
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    flat_grad = ext.flatten_tensor_variables(constraint_grads)

    def f_Hx_plain(*args):
        # args = (*inputs, *xs): xs are the per-parameter pieces of the
        # vector to multiply by the Hessian.
        inputs_ = args[:len(inputs)]
        xs = args[len(inputs):]
        flat_xs = np.concatenate([np.reshape(x, (-1,)) for x in xs])
        param_val = self.target.get_param_values(trainable=True)
        # Perturbation size scaled inversely with the parameter norm.
        eps = np.cast['float32'](
            self.base_eps / (np.linalg.norm(param_val) + 1e-8))
        self.target.set_param_values(
            param_val + eps * flat_xs, trainable=True)
        flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
        if self.symmetric:
            # Central difference: (g(p + eps·x) - g(p - eps·x)) / (2·eps)
            self.target.set_param_values(
                param_val - eps * flat_xs, trainable=True)
            flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
            hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
            self.target.set_param_values(param_val, trainable=True)
        else:
            # Forward difference: (g(p + eps·x) - g(p)) / eps.
            # Parameters are restored first so the base gradient is
            # evaluated at the unperturbed point.
            self.target.set_param_values(param_val, trainable=True)
            flat_grad = self.opt_fun["f_grad"](*inputs_)
            hx = (flat_grad_dvplus - flat_grad) / eps
        return hx

    self.opt_fun = ext.lazydict(
        f_grad=lambda: ext.compile_function(
            inputs=inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        f_Hx_plain=lambda: f_Hx_plain,
    )
def update_opt(self, loss, target, inputs, network_outputs,
               extra_inputs=None):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
     implement methods of the
     :class:`rllab.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param network_outputs: symbolic network outputs handed to the HF
     optimizer (its ``s`` argument).
    :return: No return value.
    """
    self._target = target

    if extra_inputs is None:
        extra_inputs = list()

    # Hessian-free optimizer works directly on the symbolic cost and
    # network outputs rather than on a compiled gradient.
    self._hf_optimizer = hf_optimizer(
        _p=target.get_params(trainable=True),
        inputs=(inputs + extra_inputs),
        s=network_outputs,
        costs=[loss],
    )

    # Loss function compiles lazily on first access.
    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(inputs + extra_inputs, loss),
    )
def __init__(
        self,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
     network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the
     mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer()
        else:
            optimizer = LbfgsOptimizer()

    self.output_dim = output_dim
    self._optimizer = optimizer

    if prob_network is None:
        prob_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    l_prob = prob_network.output_layer

    LasagnePowered.__init__(self, [l_prob])

    xs_var = prob_network.input_layer.input_var
    ys_var = TT.imatrix("ys")
    old_prob_var = TT.matrix("old_prob")

    # Shared input-normalization statistics, broadcast over the batch axis.
    x_mean_var = theano.shared(
        np.zeros((1,) + input_shape),
        name="x_mean",
        broadcastable=(True,) + (False, ) * len(input_shape)
    )
    x_std_var = theano.shared(
        np.ones((1,) + input_shape),
        name="x_std",
        broadcastable=(True,) + (False, ) * len(input_shape)
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var

    prob_var = L.get_output(
        l_prob, {prob_network.input_layer: normalized_xs_var})

    old_info_vars = dict(prob=old_prob_var)
    info_vars = dict(prob=prob_var)

    dist = self._dist = Categorical(output_dim)

    # Mean KL to the previous predictions; used as the trust-region term.
    mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

    # Negative log-likelihood of the one-hot targets.
    loss = - TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

    # Hard predictions as one-hot vectors of the argmax class.
    predicted = special.to_onehot_sym(
        TT.argmax(prob_var, axis=1), output_dim)

    self._f_predict = ext.compile_function([xs_var], predicted)
    self._f_prob = ext.compile_function([xs_var], prob_var)

    self._l_prob = l_prob

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[prob_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        hidden_W_init=lasagne.init.HeUniform(),
        hidden_b_init=lasagne.init.Constant(0.),
        action_merge_layer=-2,
        output_nonlinearity=None,
        output_W_init=lasagne.init.Uniform(-3e-3, 3e-3),
        output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
        bn=False):
    """Q-function MLP over (observation, action) pairs.

    The observation is fed through dense layers; the action is concatenated
    in at layer index ``action_merge_layer`` (negative indices count from
    the output). The final layer is a single linear Q-value.
    """
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(
        shape=(None, env_spec.observation_space.flat_dim), name="obs")
    l_action = L.InputLayer(
        shape=(None, env_spec.action_space.flat_dim), name="actions")

    n_layers = len(hidden_sizes) + 1

    if n_layers > 1:
        # Normalize possibly-negative merge index into [0, n_layers).
        action_merge_layer = \
            (action_merge_layer % n_layers + n_layers) % n_layers
    else:
        action_merge_layer = 1

    l_hidden = l_obs

    for idx, size in enumerate(hidden_sizes):
        if bn:
            l_hidden = batch_norm(l_hidden)

        if idx == action_merge_layer:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_hidden = L.DenseLayer(
            l_hidden,
            num_units=size,
            W=hidden_W_init,
            b=hidden_b_init,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % (idx + 1)
        )

    # Merge the action just before the output layer if requested.
    if action_merge_layer == n_layers:
        l_hidden = L.ConcatLayer([l_hidden, l_action])

    l_output = L.DenseLayer(
        l_hidden,
        num_units=1,
        W=output_W_init,
        b=output_b_init,
        nonlinearity=output_nonlinearity,
        name="output"
    )

    # deterministic=True: batch-norm layers use stored statistics here.
    output_var = L.get_output(l_output, deterministic=True).flatten()

    self._f_qval = ext.compile_function(
        [l_obs.input_var, l_action.input_var], output_var)
    self._output_layer = l_output
    self._obs_layer = l_obs
    self._action_layer = l_action
    self._output_nonlinearity = output_nonlinearity

    LasagnePowered.__init__(self, [l_output])
def __init__(
        self,
        env_spec,
        hidden_sizes=(32,),
        state_include_action=True,
        hidden_nonlinearity=NL.tanh,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
):
    """
    :param env_spec: A spec for the env.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param learn_std: whether the (state-independent) log-std params are
        trainable.
    :param init_std: initial standard deviation of the action distribution.
    :param output_nonlinearity: nonlinearity applied to the mean output
        (default None, i.e. linear — appropriate for a Gaussian mean).
    :return:
    """
    Serializable.quick_init(self, locals())
    super(GaussianGRUPolicy, self).__init__(env_spec)
    # Only a single recurrent layer is supported.
    assert len(hidden_sizes) == 1

    if state_include_action:
        obs_dim = env_spec.observation_space.flat_dim + \
            env_spec.action_space.flat_dim
    else:
        obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    mean_network = GRUNetwork(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_dim=hidden_sizes[0],
        hidden_nonlinearity=hidden_nonlinearity,
        # BUG FIX: previously hard-coded NL.softmax, which ignored the
        # output_nonlinearity parameter and squashed the Gaussian mean
        # through a softmax.
        output_nonlinearity=output_nonlinearity,
    )

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_var

    # State-independent log-std parameters, shared between the full-sequence
    # layer and the single-step layer (param=l_log_std.param).
    l_log_std = ParamLayer(
        mean_network.input_layer,
        num_units=action_dim,
        param=lasagne.init.Constant(np.log(init_std)),
        name="output_log_std",
        trainable=learn_std,
    )

    l_step_log_std = ParamLayer(
        mean_network.step_input_layer,
        num_units=action_dim,
        param=l_log_std.param,
        name="step_output_log_std",
        trainable=learn_std,
    )

    self._mean_network = mean_network
    self._l_log_std = l_log_std
    self._state_include_action = state_include_action

    # Single-step function used during sampling:
    # (observation, previous hidden) -> (mean, log_std, new hidden).
    self._f_step_mean_std = ext.compile_function(
        [
            mean_network.step_input_layer.input_var,
            mean_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            mean_network.step_output_layer,
            l_step_log_std,
            mean_network.step_hidden_layer
        ])
    )

    self._prev_action = None
    self._prev_hidden = None
    self._hidden_sizes = hidden_sizes
    self._dist = RecurrentDiagonalGaussian(action_dim)

    self.reset()

    LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        hidden_sizes,
        conv_filters, conv_filter_sizes, conv_strides, conv_pads,
        hidden_nonlinearity=NL.rectify,
        mean_network=None,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        subsample_factor=1.0,
        batchsize=None,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        # FIX: mutable default arguments ([]) replaced with tuples.
        std_conv_filters=(), std_conv_filters_sizes=(),
        std_conv_strides=(), std_conv_pads=(),
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
):
    """
    :param input_shape: usually for images of the form
     (width,height,channel)
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
     network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the
     mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param learn_std: Whether to learn the standard deviations. Only
     effective if adaptive_std is False. If adaptive_std is True, this
     parameter is ignored, and the weights for the std network are always
     learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean.
    :param std_hidden_sizes: Number of hidden units of each layer of the
     std network. Only used if `std_share_network` is False. It defaults to
     the same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std
     network. Only used if `std_share_network` is False. It defaults to the
     same non-linearity as the mean.
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer("optimizer")
        else:
            optimizer = LbfgsOptimizer("optimizer")

    self._optimizer = optimizer
    self.input_shape = input_shape
    if mean_network is None:
        mean_network = ConvNetwork(
            name="mean_network",
            input_shape=input_shape,
            output_dim=output_dim,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=None,
        )
    l_mean = mean_network.output_layer

    if adaptive_std:
        l_log_std = ConvNetwork(
            name="log_std_network",
            input_shape=input_shape,
            input_var=mean_network.input_layer.input_var,
            output_dim=output_dim,
            conv_filters=std_conv_filters,
            # BUG FIX: referenced undefined name std_conv_filter_sizes,
            # which raised NameError whenever adaptive_std=True; the
            # declared parameter is std_conv_filters_sizes.
            conv_filter_sizes=std_conv_filters_sizes,
            conv_strides=std_conv_strides,
            conv_pads=std_conv_pads,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_nonlinearity,
            output_nonlinearity=None,
        ).output_layer
    else:
        # State-independent log-std parameters.
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=output_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    LasagnePowered.__init__(self, [l_mean, l_log_std])

    xs_var = mean_network.input_layer.input_var
    ys_var = TT.matrix("ys")
    old_means_var = TT.matrix("old_means")
    old_log_stds_var = TT.matrix("old_log_stds")

    # Shared normalization statistics for inputs and outputs, broadcast
    # over the batch axis.
    x_mean_var = theano.shared(
        np.zeros((1, np.prod(input_shape)), dtype=theano.config.floatX),
        name="x_mean",
        broadcastable=(True, False),
    )
    x_std_var = theano.shared(
        np.ones((1, np.prod(input_shape)), dtype=theano.config.floatX),
        name="x_std",
        broadcastable=(True, False),
    )
    y_mean_var = theano.shared(
        np.zeros((1, output_dim), dtype=theano.config.floatX),
        name="y_mean",
        broadcastable=(True, False)
    )
    y_std_var = theano.shared(
        np.ones((1, output_dim), dtype=theano.config.floatX),
        name="y_std",
        broadcastable=(True, False)
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var
    normalized_ys_var = (ys_var - y_mean_var) / y_std_var

    normalized_means_var = L.get_output(
        l_mean, {mean_network.input_layer: normalized_xs_var})
    normalized_log_stds_var = L.get_output(
        l_log_std, {mean_network.input_layer: normalized_xs_var})

    # De-normalize the network outputs back to the target scale.
    means_var = normalized_means_var * y_std_var + y_mean_var
    log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

    normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
    normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

    dist = self._dist = DiagonalGaussian(output_dim)

    normalized_dist_info_vars = dict(
        mean=normalized_means_var, log_std=normalized_log_stds_var)

    # Trust-region term: KL against previous (normalized) predictions.
    mean_kl = TT.mean(dist.kl_sym(
        dict(mean=normalized_old_means_var,
             log_std=normalized_old_log_stds_var),
        normalized_dist_info_vars,
    ))

    loss = - \
        TT.mean(dist.log_likelihood_sym(
            normalized_ys_var, normalized_dist_info_vars))

    self._f_predict = compile_function([xs_var], means_var)
    self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
    self._l_mean = l_mean
    self._l_log_std = l_log_std

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[normalized_means_var, normalized_log_stds_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [
            xs_var, ys_var, old_means_var, old_log_stds_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._mean_network = mean_network
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
    self._y_mean_var = y_mean_var
    self._y_std_var = y_std_var
    self._subsample_factor = subsample_factor
    self._batchsize = batchsize
def build_model(self):
    """Compile prediction, training, and surprise functions for the BNN.

    When ``self.second_order_update`` is set, ``train_update_fn`` computes a
    closed-form fast KL value instead of applying updates (the update-based
    variant is left commented out below).
    """
    # Prepare Theano variables for inputs and targets
    # Same input for classification as regression.
    input_var = T.matrix('inputs',
                         dtype=theano.config.floatX)  # @UndefinedVariable
    target_var = T.matrix('targets',
                          dtype=theano.config.floatX)  # @UndefinedVariable

    # Loss function.
    loss = self.loss(input_var, target_var)
    loss_only_last_sample = self.loss_last_sample(input_var, target_var)

    # Create update methods.
    params = lasagne.layers.get_all_params(self.network, trainable=True)
    updates = lasagne.updates.adam(
        loss, params, learning_rate=self.learning_rate)

    # Train/val fn.
    self.pred_fn = ext.compile_function(
        [input_var], self.pred_sym(input_var), log_name='pred_fn')
    self.train_fn = ext.compile_function(
        [input_var, target_var], loss, updates=updates,
        log_name='train_fn')

    if self.second_order_update:
        oldparams = lasagne.layers.get_all_params(
            self.network, oldparam=True)
        step_size = T.scalar('step_size',
                             dtype=theano.config.floatX)  # @UndefinedVariable

        def second_order_update(loss_or_grads, params, oldparams, step_size):
            """Second-order update method for optimizing loss_last_sample,
            so basically, KL term (new params || old params) + NLL of latest
            sample. The Hessian is evaluated at the origin and provides
            curvature information to make a more informed step in the
            correct descent direction."""
            grads = T.grad(loss_or_grads, params)
            updates = OrderedDict()

            for i in xrange(len(params)):
                param = params[i]
                grad = grads[i]

                # NOTE: mean params ('mu'/'b_mu') pair with the rho param
                # stored at index i + 1 of oldparams; the inverse Hessian
                # is derived from the softplus std parameterization.
                if param.name == 'mu' or param.name == 'b_mu':
                    oldparam_rho = oldparams[i + 1]
                    invH = T.square(T.log(1 + T.exp(oldparam_rho)))
                else:
                    oldparam_rho = oldparams[i]
                    p = param
                    H = 2. * (T.exp(2 * p)) / \
                        (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2)
                    invH = 1. / H
                updates[param] = param - step_size * invH * grad

            return updates

        def fast_kl_div(loss, params, oldparams, step_size):
            # Closed-form KL contribution per parameter, using the same
            # inverse-Hessian expressions as second_order_update.
            grads = T.grad(loss, params)

            kl_component = []
            for i in xrange(len(params)):
                param = params[i]
                grad = grads[i]

                if param.name == 'mu' or param.name == 'b_mu':
                    oldparam_rho = oldparams[i + 1]
                    invH = T.square(T.log(1 + T.exp(oldparam_rho)))
                else:
                    oldparam_rho = oldparams[i]
                    p = param
                    H = 2. * (T.exp(2 * p)) / \
                        (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2)
                    invH = 1. / H

                kl_component.append(
                    T.sum(T.square(step_size) * T.square(grad) * invH))

            return sum(kl_component)

        compute_fast_kl_div = fast_kl_div(
            loss_only_last_sample, params, oldparams, step_size)

        self.train_update_fn = ext.compile_function(
            [input_var, target_var, step_size], compute_fast_kl_div,
            log_name='f_compute_fast_kl_div')

        # updates_kl = second_order_update(
        #     loss_only_last_sample, params, oldparams, step_size)
        #
        # self.train_update_fn = ext.compile_function(
        #     [input_var, target_var, step_size], loss_only_last_sample, updates=updates_kl, log_name='train_update_fn')
    else:
        self.train_update_fn = ext.compile_function(
            [input_var, target_var], loss_only_last_sample,
            updates=updates, log_name='train_update_fn')

    # called kl div closed form but should be called surprise
    self.f_kl_div_closed_form = ext.compile_function(
        [], self.surprise(), log_name='kl_div_fn')
def init_opt(self):
    """Build the symbolic losses and compile the optimization functions.

    Constructs the (REPS-style) policy loss with its exp-weighted advantage
    term and the corresponding dual over the temperature ``eta`` and the
    linear value-feature weights ``v``, then compiles them into
    ``self.opt_info``:
      f_loss / f_loss_grad : policy loss and its gradient w.r.t. policy params
      f_dual / f_dual_grad : dual objective and its gradient w.r.t. (eta, v)
      f_kl                 : mean KL between old and new policy distributions
    """
    is_recurrent = int(self.policy.recurrent)

    # Init dual param values.
    self.param_eta = 15.
    # Adjust for linear feature vector.
    self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 + 4)

    # Theano vars.
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    rewards = ext.new_tensor(
        'rewards',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX,
    )
    # Feature difference variable representing the difference in feature
    # value of the next observation and the current observation \phi(s') -
    # \phi(s).
    feat_diff = ext.new_tensor(
        'feat_diff',
        ndim=2 + is_recurrent,
        dtype=theano.config.floatX
    )
    param_v = TT.vector('param_v')
    param_eta = TT.scalar('eta')
    valid_var = TT.matrix('valid')

    state_info_vars = {
        k: ext.new_tensor(
            k,
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        ) for k in self.policy.state_info_keys
    }
    state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

    # Policy-related symbolics.
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    dist = self.policy.distribution
    # Log of the policy dist.
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)

    # Symbolic sample Bellman error.
    delta_v = rewards + TT.dot(feat_diff, param_v)

    # Policy loss (negative because we minimize). The max is subtracted
    # inside exp() for numerical stability of the exponentiated advantage.
    if is_recurrent:
        loss = - TT.sum(logli * TT.exp(
            delta_v / param_eta - TT.max(delta_v / param_eta)
        ) * valid_var) / TT.sum(valid_var)
    else:
        loss = - TT.mean(logli * TT.exp(
            delta_v / param_eta - TT.max(delta_v / param_eta)
        ))

    # Add regularization to loss.
    reg_params = self.policy.get_params(regularizable=True)
    loss += self.L2_reg_loss * TT.sum(
        [TT.mean(TT.square(param)) for param in reg_params]
    ) / len(reg_params)

    # Policy loss gradient.
    loss_grad = TT.grad(
        loss, self.policy.get_params(trainable=True))

    if is_recurrent:
        recurrent_vars = [valid_var]
    else:
        recurrent_vars = []

    # Renamed from `input` to avoid shadowing the builtin.
    input_list = [rewards, obs_var, feat_diff, action_var] + \
        state_info_vars_list + recurrent_vars + [param_eta, param_v]

    f_loss = ext.compile_function(
        inputs=input_list,
        outputs=loss,
    )
    f_loss_grad = ext.compile_function(
        inputs=input_list,
        outputs=loss_grad,
    )

    # Debug prints: KL between distributions before and after an update.
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k,
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        ) for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    if is_recurrent:
        mean_kl = TT.sum(dist.kl_sym(old_dist_info_vars, dist_info_vars) * valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

    f_kl = ext.compile_function(
        inputs=[obs_var, action_var] + state_info_vars_list + old_dist_info_vars_list + recurrent_vars,
        outputs=mean_kl,
    )

    # Dual-related symbolics.
    # Symbolic dual: eta * epsilon + eta * log E[exp(delta_v / eta)],
    # written with the max subtracted/re-added for numerical stability.
    if is_recurrent:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.sum(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    ) * valid_var
                ) / TT.sum(valid_var)
            ) + param_eta * TT.max(delta_v / param_eta)
    else:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.mean(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    )
                )
            ) + param_eta * TT.max(delta_v / param_eta)
    # Add L2 regularization (penalizes both large and tiny eta).
    dual += self.L2_reg_dual * \
        (TT.square(param_eta) + TT.square(1 / param_eta))

    # Symbolic dual gradient.
    dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

    # Eval functions.
    f_dual = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v],
        outputs=dual
    )
    f_dual_grad = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v],
        outputs=dual_grad
    )

    self.opt_info = dict(
        f_loss_grad=f_loss_grad,
        f_loss=f_loss,
        f_dual=f_dual,
        f_dual_grad=f_dual_grad,
        f_kl=f_kl
    )
def __init__(
        self,
        env_spec,
        hidden_dim=32,
        feature_network=None,
        state_include_action=True,
        hidden_nonlinearity=NL.tanh):
    """
    Recurrent (GRU) categorical policy over a discrete action space.

    :param env_spec: A spec for the env.
    :param hidden_dim: dimension of hidden layer
    :param feature_network: optional network mapping the flat input to a
        feature vector fed to the GRU; if None, the raw input is used.
    :param state_include_action: if True, the previous action is
        concatenated to the observation as the recurrent input.
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    assert isinstance(env_spec.action_space, Discrete)
    # quick_init captures the constructor args (locals()) for serialization.
    Serializable.quick_init(self, locals())
    super(CategoricalGRUPolicy, self).__init__(env_spec)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    # Recurrent input is the observation, optionally concatenated with the
    # previous action.
    if state_include_action:
        input_dim = obs_dim + action_dim
    else:
        input_dim = obs_dim

    # (batch, time, input_dim) input for the recurrent network.
    l_input = L.InputLayer(
        shape=(None, None, input_dim),
        name="input"
    )

    if feature_network is None:
        feature_dim = input_dim
        l_flat_feature = None
        l_feature = l_input
    else:
        # The feature network operates on flattened (batch*time) inputs;
        # reshape its output back to (batch, time, feature_dim).
        feature_dim = feature_network.output_layer.output_shape[-1]
        l_flat_feature = feature_network.output_layer
        l_feature = OpLayer(
            l_flat_feature,
            extras=[l_input],
            name="reshape_feature",
            op=lambda flat_feature, input: TT.reshape(
                flat_feature,
                [input.shape[0], input.shape[1], feature_dim]
            ),
            shape_op=lambda _, input_shape: (input_shape[0], input_shape[1], feature_dim)
        )

    # GRU producing softmax action probabilities per time step.
    prob_network = GRUNetwork(
        input_shape=(feature_dim,),
        input_layer=l_feature,
        output_dim=env_spec.action_space.n,
        hidden_dim=hidden_dim,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=TT.nnet.softmax,
        name="prob_network"
    )

    self.prob_network = prob_network
    self.feature_network = feature_network
    self.l_input = l_input
    self.state_include_action = state_include_action

    # Single-step input: a flat (non-time-expanded) batch of inputs.
    flat_input_var = TT.matrix("flat_input")
    if feature_network is None:
        feature_var = flat_input_var
    else:
        feature_var = L.get_output(l_flat_feature, {feature_network.input_layer: flat_input_var})

    # One-step function: (flat input, previous hidden state) ->
    # (action probabilities, next hidden state).
    self.f_step_prob = ext.compile_function(
        [
            flat_input_var,
            prob_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            prob_network.step_output_layer,
            prob_network.step_hidden_layer
        ], {prob_network.step_input_layer: feature_var})
    )

    self.input_dim = input_dim
    self.action_dim = action_dim
    self.hidden_dim = hidden_dim

    # Recurrent state carried between steps (presumably reset per rollout
    # elsewhere — confirm against the stepping/reset methods).
    self.prev_action = None
    self.prev_hidden = None
    self.dist = RecurrentCategorical(env_spec.action_space.n)

    # Register all parameterized output layers for (de)serialization.
    out_layers = [prob_network.output_layer]
    if feature_network is not None:
        out_layers.append(feature_network.output_layer)

    LasagnePowered.__init__(self, out_layers)
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
):
    """
    Gaussian MLP policy for continuous (Box) action spaces: an MLP outputs
    the action mean, and the log-std is either a learned parameter vector
    or (if adaptive_std) a second MLP head.

    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std: if True, log-std is produced by its own MLP
        conditioned on the observation instead of a single parameter vector
    :param std_share_network: NOTE(review): accepted but not referenced in
        this constructor — confirm whether it should affect the std network
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :return:
    """
    # quick_init captures the constructor args (locals()) for serialization.
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    # create network (outputs the action mean)
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
    )
    self._mean_network = mean_network

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_var

    if adaptive_std:
        # Separate MLP head (sharing the mean network's input layer)
        # predicts a state-dependent log-std.
        std_network = MLP(
            input_shape=(obs_dim,),
            input_layer=mean_network.input_layer,
            output_dim=action_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_hidden_nonlinearity,
            output_nonlinearity=None,
        )
        l_log_std = std_network.output_layer
    else:
        # State-independent log-std: one (optionally trainable) parameter
        # per action dimension, initialized to log(init_std).
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    self.min_std = min_std

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        # Clip the log-std from below to avoid numerical issues.
        log_std_var = TT.maximum(log_std_var, np.log(min_std))

    self._mean_var, self._log_std_var = mean_var, log_std_var

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = DiagonalGaussian()

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy, self).__init__(env_spec)

    # Compiled forward pass: observation -> (mean, clipped log-std).
    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )
def init_opt(self):
    """Build and compile the DDPG training functions.

    Creates "target" copies of the policy and Q-function, builds the
    Q-function regression loss (against caller-supplied targets y) and the
    deterministic policy objective, and stores the compiled functions and
    target networks in ``self.opt_info``.
    """
    # First, create "target" policy and Q functions
    # (pickle round-trip acts as a deep copy of the parameterized objects).
    target_policy = pickle.loads(pickle.dumps(self.policy))
    target_qf = pickle.loads(pickle.dumps(self.qf))

    # y need to be computed first
    obs = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1,
    )

    # The yi values are computed separately as above and then passed to
    # the training functions below
    action = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )
    # Regression targets y_i for the Q-function, supplied by the caller.
    yvar = TT.vector('ys')

    # L2 weight decay over the Q-function's regularizable parameters.
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([TT.sum(TT.square(param)) for param in
             self.qf.get_params(regularizable=True)])

    qval = self.qf.get_qval_sym(obs, action)

    # Mean squared Bellman error; the decayed version is what is optimized.
    qf_loss = TT.mean(TT.square(yvar - qval))
    qf_reg_loss = qf_loss + qf_weight_decay_term

    # L2 weight decay over the policy's regularizable parameters.
    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([TT.sum(TT.square(param))
             for param in self.policy.get_params(regularizable=True)])
    # Policy objective: maximize Q(s, pi(s)) (hence the negated mean below).
    policy_qval = self.qf.get_qval_sym(
        obs, self.policy.get_action_sym(obs),
        deterministic=True
    )
    policy_surr = -TT.mean(policy_qval)

    policy_reg_surr = policy_surr + policy_weight_decay_term

    qf_updates = self.qf_update_method(
        qf_reg_loss, self.qf.get_params(trainable=True))
    policy_updates = self.policy_update_method(
        policy_reg_surr, self.policy.get_params(trainable=True))

    # One gradient step on the Q-function; returns the un-regularized loss
    # and the predicted Q-values for diagnostics.
    f_train_qf = ext.compile_function(
        inputs=[yvar, obs, action],
        outputs=[qf_loss, qval],
        updates=qf_updates
    )

    # One gradient step on the policy; returns the surrogate objective.
    f_train_policy = ext.compile_function(
        inputs=[obs],
        outputs=policy_surr,
        updates=policy_updates
    )

    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
    )