Example #1
0
    def __init__(self, env_spec, hidden_sizes=(32, 32), hidden_nonlinearity=NL.tanh, prob_network=None):
        """
        Categorical policy: a softmax MLP mapping observations to a
        distribution over discrete actions.

        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
            network params are ignored
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        # Build the default softmax MLP when no network was supplied.
        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim,),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        out_layer = prob_network.output_layer
        in_layer = prob_network.input_layer
        self._l_prob = out_layer
        self._l_obs = in_layer
        # Compiled forward pass: flat observation -> action probabilities.
        self._f_prob = ext.compile_function([in_layer.input_var],
                                            L.get_output(out_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [out_layer])
Example #2
0
    def __init__(self, wrapped_constraint,
                 env_spec,
                 yield_zeros_until=1,
                 optimizer=None,
                 hidden_sizes=(32,),
                 hidden_nonlinearity=NL.sigmoid,
                 lag_time=10,
                 coeff=1.,
                 filter_bonuses=False,
                 max_epochs=25,
                 *args, **kwargs):
        """
        Wraps another constraint and fits an MLP regressor that predicts it
        from observations (used to produce bonuses).
        """
        Serializable.quick_init(self, locals())

        # Plain configuration state.
        self._wrapped_constraint = wrapped_constraint
        self._env_spec = env_spec
        self._filter_bonuses = filter_bonuses
        self._yield_zeros_until = yield_zeros_until
        self._hidden_sizes = hidden_sizes
        self._lag_time = lag_time
        self._coeff = coeff
        self._max_epochs = max_epochs
        self.use_bonus = True

        # Default to a first-order optimizer when none was supplied.
        if optimizer is None:
            optimizer = FirstOrderOptimizer(max_epochs=max_epochs, batch_size=None)
        self._optimizer = optimizer

        # Single-output sigmoid MLP over the flattened observation.
        net = MLP(1, hidden_sizes, hidden_nonlinearity, NL.sigmoid,
                  input_shape=(env_spec.observation_space.flat_dim,))

        LasagnePowered.__init__(self, [net.output_layer])

        obs_var = net.input_layer.input_var
        target_var = TT.matrix("ys")
        prediction = L.get_output(net.output_layer,
                                  {net.input_layer: obs_var})

        # Mean squared error between targets and predictions.
        loss = TT.mean(TT.square(target_var - prediction))

        self._optimizer.update_opt(loss=loss,
                                   target=self,
                                   inputs=[obs_var, target_var])
        self._f_predict = compile_function([obs_var], prediction)

        self._fit_steps = 0

        # Mirror the wrapped constraint's baseline, if it has one.
        self.has_baseline = self._wrapped_constraint.has_baseline
        if self.has_baseline:
            self.baseline = self._wrapped_constraint.baseline
Example #3
0
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, ),
                 state_include_action=True,
                 hidden_nonlinearity=NL.tanh,
                 output_b_init=None,
                 weight_signal=1.0,
                 weight_nonsignal=1.0,
                 weight_smc=1.0):
        """
        Recurrent (GRU) categorical policy whose output bias is initialized
        from per-class weights.

        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(InitCategoricalGRUPolicy, self).__init__(env_spec)

        # Exactly one recurrent layer is supported.
        assert len(hidden_sizes) == 1

        # Derive the output bias from action names and class weights.
        output_b_init = compute_output_b_init(env_spec.action_space.names,
                                              output_b_init, weight_signal,
                                              weight_nonsignal, weight_smc)

        # Optionally feed the previous action alongside the observation.
        obs_dim = env_spec.observation_space.flat_dim
        if state_include_action:
            input_shape = (obs_dim + env_spec.action_space.flat_dim, )
        else:
            input_shape = (obs_dim, )

        network = InitGRUNetwork(input_shape=input_shape,
                                 output_dim=env_spec.action_space.n,
                                 hidden_dim=hidden_sizes[0],
                                 hidden_nonlinearity=hidden_nonlinearity,
                                 output_nonlinearity=NL.softmax,
                                 output_b_init=output_b_init)

        self._prob_network = network
        self._state_include_action = state_include_action

        # One-step forward: (input, prev hidden) -> (action probs, new hidden).
        self._f_step_prob = ext.compile_function(
            [
                network.step_input_layer.input_var,
                network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                network.step_output_layer, network.step_hidden_layer
            ]))

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentCategorical(env_spec.action_space.n)

        self.reset()

        LasagnePowered.__init__(self, [network.output_layer])
Example #4
0
    def __init__(self,
                 output_dim,
                 hidden_sizes,
                 hidden_nonlinearity,
                 output_nonlinearity,
                 hidden_W_init=LI.GlorotUniform(),
                 hidden_b_init=LI.Constant(0.),
                 output_W_init=LI.GlorotUniform(),
                 output_b_init=LI.Constant(0.),
                 name=None,
                 input_var=None,
                 input_layer=None,
                 input_shape=None,
                 batch_norm=False):
        """
        Fully connected feed-forward network; each hidden layer can be
        followed by a batch-norm layer.
        """
        Serializable.quick_init(self, locals())

        # Layer names are prefixed with the network name, when given.
        prefix = "" if name is None else name + "_"

        # Reuse the provided input layer or create a fresh one.
        if input_layer is not None:
            in_layer = input_layer
        else:
            in_layer = L.InputLayer(shape=(None, ) + input_shape,
                                    input_var=input_var)
        self._layers = [in_layer]

        layer = in_layer
        for idx, n_units in enumerate(hidden_sizes):
            layer = L.DenseLayer(
                layer,
                num_units=n_units,
                nonlinearity=hidden_nonlinearity,
                name="%shidden_%d" % (prefix, idx),
                W=hidden_W_init,
                b=hidden_b_init,
            )
            if batch_norm:
                layer = L.batch_norm(layer)
            self._layers.append(layer)

        out_layer = L.DenseLayer(
            layer,
            num_units=output_dim,
            nonlinearity=output_nonlinearity,
            name="%soutput" % (prefix, ),
            W=output_W_init,
            b=output_b_init,
        )
        self._layers.append(out_layer)
        self._l_in = in_layer
        self._l_out = out_layer
        # Symbolic output expression of the whole network.
        self._output = L.get_output(out_layer)
        LasagnePowered.__init__(self, [out_layer])
Example #5
0
 def initialize(self, env_spec, network):
     """Call this in subclass's initialize, after building the network.

     Registers the env spec via StochasticPolicy and the network's output
     layers via LasagnePowered, then optionally restores parameter values
     from ``self.initial_param_values``.
     """
     StochasticPolicy.__init__(self, env_spec)
     LasagnePowered.__init__(self, network.output_layers)
     # Restore pre-set parameter values when the subclass provided them.
     if self.initial_param_values is not None:
         print("\nSetting initial NN param values")
         # Print a few parameter values before/after so the load is
         # visible in the logs.
         print("param values before loading initial values: ",
             self.get_param_values()[:5])
         self.set_param_values(self.initial_param_values)
         print("param values after loading: ",
             self.get_param_values()[:5])
    def __init__(
            self,
            env_spec,
            hidden_sizes=(),
            hidden_nonlinearity=NL.tanh,
            num_seq_inputs=1,
            neat_output_dim=20,
            neat_network=None,
            prob_network=None,
    ):
        """
        Categorical policy whose observations are first transformed by a
        separate (NEAT) feature network feeding a softmax MLP.

        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
            network params are ignored
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        # Default feature extractor: fixed (12, 12) MLP with identity output.
        if neat_network is None:
            neat_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,),
                output_dim=neat_output_dim,
                hidden_sizes=(12, 12),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.identity,
            )

        # Default probability head: softmax MLP over the feature output.
        if prob_network is None:
            prob_network = MLP(
                input_shape=(L.get_output_shape(neat_network.output_layer)[1],),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._phi = neat_network.output_layer
        self._obs = neat_network.input_layer
        # Compiled feature pass: observation -> NEAT features.
        self._neat_output = ext.compile_function(
            [neat_network.input_layer.input_var],
            L.get_output(neat_network.output_layer))

        self.prob_network = prob_network
        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled probability pass: features -> action probabilities.
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(PowerGradientPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32,),
            state_include_action=True,
            hidden_nonlinearity=NL.tanh):
        """
        Recurrent categorical policy backed by a single-layer GRU.

        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        # Exactly one recurrent layer is supported.
        assert len(hidden_sizes) == 1

        # Optionally feed the previous action alongside the observation.
        obs_dim = env_spec.observation_space.flat_dim
        if state_include_action:
            input_shape = (obs_dim + env_spec.action_space.flat_dim,)
        else:
            input_shape = (obs_dim,)

        network = GRUNetwork(
            input_shape=input_shape,
            output_dim=env_spec.action_space.n,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

        self._prob_network = network
        self._state_include_action = state_include_action

        # One-step forward: (input, prev hidden) -> (action probs, new hidden).
        self._f_step_prob = ext.compile_function(
            [
                network.step_input_layer.input_var,
                network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                network.step_output_layer,
                network.step_hidden_layer
            ])
        )

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentCategorical(env_spec.action_space.n)

        self.reset()

        LasagnePowered.__init__(self, [network.output_layer])
Example #8
0
    def __init__(
        self,
        n_in,
        n_hidden,
        n_out,
        layers_type,
        n_batches,
        trans_func=lasagne.nonlinearities.rectify,
        out_func=lasagne.nonlinearities.linear,
        batch_size=100,
        n_samples=10,
        prior_sd=0.5,
        use_reverse_kl_reg=False,
        reverse_kl_reg_factor=0.1,
        likelihood_sd=5.0,
        second_order_update=False,
        learning_rate=0.0001,
        compression=False,
        information_gain=True,
    ):
        """
        Store the configuration, build the network architecture, and
        compile the training model.
        """
        Serializable.quick_init(self, locals())
        # One layer type per hidden layer plus one for the output layer.
        assert len(layers_type) == len(n_hidden) + 1

        # Architecture configuration.
        self.n_in = n_in
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.layers_type = layers_type
        self.transf = trans_func
        self.outf = out_func

        # Training / regularization configuration.
        self.batch_size = batch_size
        self.n_batches = n_batches
        self.n_samples = n_samples
        self.prior_sd = prior_sd
        self.use_reverse_kl_reg = use_reverse_kl_reg
        self.reverse_kl_reg_factor = reverse_kl_reg_factor
        self.likelihood_sd = likelihood_sd
        self.second_order_update = second_order_update
        self.learning_rate = learning_rate
        self.compression = compression
        self.information_gain = information_gain

        # At least one intrinsic-reward component must be enabled.
        assert self.information_gain or self.compression

        # Build network architecture.
        self.build_network()

        # Build model might depend on this.
        LasagnePowered.__init__(self, [self.network])

        # Compile theano functions.
        self.build_model()
Example #9
0
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            hidden_W_init=LI.HeUniform(),
            hidden_b_init=LI.Constant(0.),
            output_nonlinearity=NL.tanh,
            output_W_init=LI.Uniform(-3e-3, 3e-3),
            output_b_init=LI.Uniform(-3e-3, 3e-3),
            bn=False):
        """
        Deterministic MLP policy mapping observations directly to actions,
        with optional batch normalization between layers.
        """
        Serializable.quick_init(self, locals())

        obs_layer = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim))

        layer = obs_layer
        if bn:
            layer = batch_norm(layer)

        # Stack of fully connected hidden layers.
        for i, n_units in enumerate(hidden_sizes):
            layer = L.DenseLayer(
                layer,
                num_units=n_units,
                W=hidden_W_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % i
            )
            if bn:
                layer = batch_norm(layer)

        out_layer = L.DenseLayer(
            layer,
            num_units=env_spec.action_space.flat_dim,
            W=output_W_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        # deterministic=True ensures that getting actions from single
        # observations does not update batch-norm statistics.
        action_var = L.get_output(out_layer, deterministic=True)
        self._output_layer = out_layer

        self._f_actions = ext.compile_function([obs_layer.input_var], action_var)

        super(DeterministicMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [out_layer])
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            hidden_W_init=LI.HeUniform(),
            hidden_b_init=LI.Constant(0.),
            output_nonlinearity=NL.tanh,
            output_W_init=LI.Uniform(-3e-3, 3e-3),
            output_b_init=LI.Uniform(-3e-3, 3e-3),
            bn=False):
        """
        Deterministic policy: observations flow through an MLP (optionally
        batch-normalized) straight to continuous actions.
        """
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim))

        cur = batch_norm(l_obs) if bn else l_obs

        for idx, size in enumerate(hidden_sizes):
            dense = L.DenseLayer(
                cur,
                num_units=size,
                W=hidden_W_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % idx
            )
            cur = batch_norm(dense) if bn else dense

        l_output = L.DenseLayer(
            cur,
            num_units=env_spec.action_space.flat_dim,
            W=output_W_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        # deterministic=True keeps single-observation action queries from
        # updating batch-norm statistics.
        action_var = L.get_output(l_output, deterministic=True)
        self._output_layer = l_output

        self._f_actions = ext.compile_function([l_obs.input_var], action_var)

        super(DeterministicMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [l_output])
    def __init__(
            self,
            env_spec,
            conv_filters, conv_filter_sizes, conv_strides, conv_pads,
            hidden_sizes=None,
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax,
            prob_network=None,
            name=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden
            layers after the convolutions; defaults to no hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        # Fix: None sentinel instead of a shared mutable [] default argument.
        if hidden_sizes is None:
            hidden_sizes = []

        assert isinstance(env_spec.action_space, Discrete)

        self._env_spec = env_spec

        if prob_network is None:
            if not name:
                name = "categorical_conv_prob_network"
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name=name,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled forward pass: observation -> action probabilities.
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer)
        )

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalConvPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
    def __init__(
        self,
        env_spec,
        latent_dim=0,  # latent args are placeholders kept for interface compat
        latent_name='categorical',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
            network params are ignored
        """
        # Latent settings are stored so get_action sees a uniform interface,
        # even though this MLP policy does not use them itself.
        self.latent_dim = latent_dim
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        # Std is never forced to zero for this categorical policy.
        self._set_std_to_0 = False

        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        # Build the default softmax MLP when no network was supplied.
        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim, ),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled forward pass: observation -> action probabilities.
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)
        # Expose the network layers so get_params() can find them.
        self._layers = prob_network.layers

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #13
0
    def __init__(self, n_in,
                 n_hidden,
                 n_out,
                 layers_type,
                 n_batches,
                 trans_func=lasagne.nonlinearities.rectify,
                 out_func=lasagne.nonlinearities.linear,
                 batch_size=100,
                 n_samples=10,
                 prior_sd=0.5,
                 use_reverse_kl_reg=False,
                 reverse_kl_reg_factor=0.1,
                 likelihood_sd=5.0,
                 second_order_update=False,
                 learning_rate=0.0001,
                 compression=False,
                 information_gain=True,
                 ):
        """
        Record the configuration, then build the network and compile the
        training model.
        """
        Serializable.quick_init(self, locals())
        # Every hidden layer plus the output layer needs a layer type.
        assert len(layers_type) == len(n_hidden) + 1

        # Architecture settings.
        self.n_in = n_in
        self.n_hidden = n_hidden
        self.n_out = n_out
        self.layers_type = layers_type
        self.transf = trans_func
        self.outf = out_func

        # Training / regularization settings.
        self.batch_size = batch_size
        self.n_batches = n_batches
        self.n_samples = n_samples
        self.prior_sd = prior_sd
        self.use_reverse_kl_reg = use_reverse_kl_reg
        self.reverse_kl_reg_factor = reverse_kl_reg_factor
        self.likelihood_sd = likelihood_sd
        self.second_order_update = second_order_update
        self.learning_rate = learning_rate
        self.compression = compression
        self.information_gain = information_gain

        # At least one intrinsic-reward component must be enabled.
        assert self.information_gain or self.compression

        self.build_network()

        # LasagnePowered must be set up first: building the model may rely
        # on the registered output layers.
        LasagnePowered.__init__(self, [self.network])

        self.build_model()
Example #14
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        prob_network=None,
    ):
        """
        Factored categorical MLP policy for product action spaces: one
        softmax head per action component.

        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
            network params are ignored
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Product)
        # Per-component action dimensions, and the slice boundaries between
        # components in the concatenated output.
        self._Da = env_spec.action_space.comp_dim
        self._actnum = len(self._Da)
        self._slice = [0] + np.cumsum(self._Da).tolist()

        if prob_network is None:
            prob_network = MLP2(
                input_shape=(env_spec.observation_space.flat_dim *
                             num_seq_inputs, ),
                output_sizes=self._Da,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled forward pass: observation -> concatenated per-component
        # action probabilities.
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical2(self._Da)

        super(CategoricalMLPPolicy2, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.tanh,
            num_seq_inputs=1,
    ):
        """
        Q-value policy: an MLP producing one linear Q-value per discrete
        action.

        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        self._env_spec = env_spec

        # Linear outputs: raw Q-values rather than probabilities.
        q_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.linear,
            name=name
        )

        out_layer = q_network.output_layer
        in_layer = q_network.input_layer
        self._l_q = out_layer
        self._l_obs = in_layer
        # Compiled forward pass: observation -> per-action Q-values.
        self._f_q = ext.compile_function([in_layer.input_var],
                                         L.get_output(out_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalMlpQPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [out_layer])
Example #16
0
    def __init__(self, output_dim, hidden_sizes, hidden_nonlinearity,
                 output_nonlinearity, hidden_W_init=LI.GlorotUniform(), hidden_b_init=LI.Constant(0.),
                 output_W_init=LI.GlorotUniform(), output_b_init=LI.Constant(0.),
                 name=None, input_var=None, input_layer=None, input_shape=None, batch_norm=False):
        """
        Plain fully connected network; each hidden layer is optionally
        batch-normalized when batch_norm is True.
        """
        Serializable.quick_init(self, locals())

        # Layer names carry the network name as a prefix, when given.
        net_prefix = (name + "_") if name is not None else ""

        if input_layer is None:
            l_in = L.InputLayer(shape=(None,) + input_shape, input_var=input_var)
        else:
            l_in = input_layer

        self._layers = [l_in]
        cur = l_in
        for i, width in enumerate(hidden_sizes):
            cur = L.DenseLayer(
                cur,
                num_units=width,
                nonlinearity=hidden_nonlinearity,
                name="%shidden_%d" % (net_prefix, i),
                W=hidden_W_init,
                b=hidden_b_init,
            )
            if batch_norm:
                cur = L.batch_norm(cur)
            self._layers.append(cur)

        l_out = L.DenseLayer(
            cur,
            num_units=output_dim,
            nonlinearity=output_nonlinearity,
            name="%soutput" % (net_prefix,),
            W=output_W_init,
            b=output_b_init,
        )
        self._layers.append(l_out)

        self._l_in = l_in
        self._l_out = l_out
        # Symbolic output expression of the whole network.
        self._output = L.get_output(l_out)
        LasagnePowered.__init__(self, [l_out])
Example #17
0
    def __init__(
            self,
            name,
            env_spec,
            policies,
            mean_network=None,

    ):
        """
        Aggregate policy over a list of sub-policies.

        :param name: base tf variable name
        :param env_spec: A spec for the mdp.
        :param policies: list of policies
        """
        Serializable.quick_init(self, locals())

        self.policies = policies
        self.n = len(policies)
        # Starts in training mode; toggled off for evaluation.
        self.training = True
        self.env_spec = env_spec
        super(MultiMLPPolicy, self).__init__(env_spec)
        # Register the first sub-policy's mean/log-std layers as the
        # trainable parameters.
        first = self.policies[0]
        LasagnePowered.__init__(self, [first._l_mean, first._l_log_std])
Example #18
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
            network params are ignored
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        obs_dim = env_spec.observation_space.flat_dim
        n_actions = env_spec.action_space.n

        # Build the default softmax MLP when no network was supplied.
        if prob_network is None:
            prob_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=n_actions,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        # Compiled forward pass: observation -> action probabilities.
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(n_actions)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #19
0
    def __init__(self,
                 env_spec,
                 conv_filters,
                 conv_filter_sizes,
                 conv_strides,
                 conv_pads,
                 hidden_sizes=None,
                 hidden_nonlinearity=NL.rectify,
                 output_nonlinearity=NL.linear,
                 network=None):
        """
        Convolutional state-value function: a conv network with one scalar
        output per observation.

        :param env_spec: A spec for the env (provides the observation shape).
        :param hidden_sizes: sizes of the fully connected layers after the
            convolutions; defaults to no hidden layers
        :param network: optional pre-built network; when given, the other
            network parameters are ignored
        """
        Serializable.quick_init(self, locals())

        # Fix: None sentinel instead of a shared mutable [] default argument.
        if hidden_sizes is None:
            hidden_sizes = []

        if network is None:
            network = ConvNetwork(input_shape=env_spec.observation_space.shape,
                                  output_dim=1,
                                  conv_filters=conv_filters,
                                  conv_filter_sizes=conv_filter_sizes,
                                  conv_strides=conv_strides,
                                  conv_pads=conv_pads,
                                  hidden_sizes=hidden_sizes,
                                  hidden_nonlinearity=hidden_nonlinearity,
                                  output_nonlinearity=output_nonlinearity,
                                  name="continuous_conv_v_function")

        # Note the deterministic=True argument. It makes sure that when getting
        # values from single observations, we do not update params in the
        # batch normalization layers.
        self._env_spec = env_spec
        self._output_layer = network.output_layer
        self._input_layer = network.input_layer
        self._f_values = ext.compile_function(inputs=[network.input_var],
                                              outputs=L.get_output(
                                                  network.output_layer,
                                                  deterministic=True))

        LasagnePowered.__init__(self, [network.output_layer])
Example #20
0
    def __init__(
            self,
            disc_window,
            disc_joints_dim,
            iteration,
            a_max=0.7,
            a_min=0.0,
            batch_size=64,
            iter_per_train=10,
            decent_portion=0.8,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=NL.tanh,
            disc_network=None,
    ):
        """
        MLP discriminator over a window of joint observations.

        :param disc_window: number of timesteps in the discriminator window
        :param disc_joints_dim: per-timestep joint dimensionality
        :param iteration: total training iterations; with decent_portion this
            fixes the length of the annealing schedule
        :param a_max: upper end of the annealing coefficient range
        :param a_min: lower end of the annealing coefficient range
        :param batch_size: minibatch size used during training
        :param iter_per_train: gradient steps per training call
        :param disc_network: optional pre-built network; other network params
            are ignored when given
        """
        # Fix: honor the batch_size/iter_per_train arguments instead of
        # shadowing them with hard-coded 64 and 10 (which equal the
        # defaults, so default behavior is unchanged).
        self.batch_size = batch_size
        self.iter_per_train = iter_per_train
        self.disc_window = disc_window
        self.disc_joints_dim = disc_joints_dim
        self.disc_dim = self.disc_window*self.disc_joints_dim
        self.end_iter = int(iteration*decent_portion)
        self.iter_count = 0
        out_dim = 1
        target_var = TT.ivector('targets')

        # Build the default discriminator network when none was supplied.
        if disc_network is None:
            disc_network = MLP(
                input_shape=(self.disc_dim,),
                output_dim=out_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )

        self._disc_network = disc_network

        disc_reward = disc_network.output_layer
        obs_var = disc_network.input_layer.input_var

        disc_var, = L.get_output([disc_reward])

        self._disc_var = disc_var

        LasagnePowered.__init__(self, [disc_reward])
        # Forward pass: observation window -> discriminator score.
        self._f_disc = ext.compile_function(
            inputs=[obs_var],
            outputs=[disc_var],
            log_name="f_discriminate_forward",
        )

        # NOTE(review): lasagne's get_all_params expects a layer (or list of
        # layers); passing the MLP wrapper looks suspicious — confirm whether
        # this should be disc_network.output_layer.
        params = L.get_all_params(disc_network, trainable=True)
        # NOTE(review): categorical_crossentropy over a single-unit output
        # with integer targets may be intended as binary_crossentropy —
        # verify against the training data format.
        loss = lasagne.objectives.categorical_crossentropy(disc_var, target_var).mean()
        updates = lasagne.updates.adam(loss, params, learning_rate=0.01)
        self._f_disc_train = ext.compile_function(
            inputs=[obs_var, target_var],
            outputs=[loss],
            updates=updates,
            log_name="f_discriminate_train"
        )

        self.data = self.load_data()
        # Annealing coefficients, linearly spaced over the training run.
        self.a = np.linspace(a_min, a_max, self.end_iter)
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            hidden_W_init=lasagne.init.HeUniform(),
            hidden_b_init=lasagne.init.Constant(0.),
            action_merge_layer=-2,
            output_nonlinearity=None,
            output_W_init=lasagne.init.Uniform(-3e-3, 3e-3),
            output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
            bn=False):
        """
        Q-function network: an MLP over observations where the action input is
        concatenated in at the hidden layer selected by `action_merge_layer`
        (negative indices count from the output end; with no hidden layers the
        actions are merged just before the output layer).

        :param env_spec: environment spec providing observation/action flat dims
        :param hidden_sizes: sizes of the fully connected hidden layers
        :param action_merge_layer: index of the layer at which actions are merged
        :param bn: if True, apply batch normalization before each hidden layer
        """
        Serializable.quick_init(self, locals())

        obs_in = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs")
        action_in = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions")

        n_layers = len(hidden_sizes) + 1

        # Normalize the merge index into [0, n_layers); degenerate single-layer
        # networks always merge right before the output.
        if n_layers > 1:
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        net = obs_in
        for layer_idx, num_units in enumerate(hidden_sizes):
            if bn:
                net = batch_norm(net)

            if layer_idx == action_merge_layer:
                net = L.ConcatLayer([net, action_in])

            net = L.DenseLayer(
                net,
                num_units=num_units,
                W=hidden_W_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % (layer_idx + 1)
            )

        # Merge actions after the last hidden layer if that is where they belong.
        if action_merge_layer == n_layers:
            net = L.ConcatLayer([net, action_in])

        q_out = L.DenseLayer(
            net,
            num_units=1,
            W=output_W_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        # deterministic=True keeps batch-norm statistics frozen at query time.
        qval_var = L.get_output(q_out, deterministic=True).flatten()

        self._f_qval = ext.compile_function([obs_in.input_var, action_in.input_var], qval_var)
        self._output_layer = q_out
        self._obs_layer = obs_in
        self._action_layer = action_in
        self._output_nonlinearity = output_nonlinearity

        LasagnePowered.__init__(self, [q_out])
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            hidden_sizes,
            conv_filters,conv_filter_sizes,conv_strides,conv_pads,
            hidden_nonlinearity=NL.rectify,
            mean_network=None,

            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            subsample_factor=1.0,
            batchsize=None,

            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_conv_filters=[],std_conv_filters_sizes=[],std_conv_strides=[],std_conv_pads=[],
            std_hidden_sizes=(32, 32),
            std_nonlinearity=None,
            normalize_inputs=True,
            normalize_outputs=True,
    ):
        """
        Gaussian convolutional regressor: fits y ~ N(mean(x), std(x)^2) where
        the mean (and optionally the std) is produced by a ConvNetwork.

        :param input_shape: usually for images of the form (width,height,channel)
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        :param std_conv_filters / std_conv_filters_sizes / std_conv_strides / std_conv_pads:
        conv architecture for the std network; only used when `adaptive_std` is True.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: L-BFGS, with a KL penalty when trust region is on.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer

        self.input_shape = input_shape
        if mean_network is None:
            mean_network = ConvNetwork(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            # State-dependent log-std produced by its own conv network, sharing
            # the mean network's input variable.
            l_log_std = ConvNetwork(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                conv_filters=std_conv_filters,
                # Bug fix: previously referenced the undefined name
                # `std_conv_filter_sizes` (parameter is `std_conv_filters_sizes`),
                # raising NameError whenever adaptive_std=True.
                conv_filter_sizes=std_conv_filters_sizes,
                conv_strides=std_conv_strides,
                conv_pads=std_conv_pads,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            # Single (optionally learnable) log-std parameter, independent of x.
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        # Input/output whitening statistics; broadcastable over the batch axis.
        x_mean_var = theano.shared(
            np.zeros((1,np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_mean",
            broadcastable=(True,False),
        )
        x_std_var = theano.shared(
            np.ones((1,np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_std",
            broadcastable=(True,False),
        )
        y_mean_var = theano.shared(
            np.zeros((1, output_dim), dtype=theano.config.floatX),
            name="y_mean",
            broadcastable=(True, False)
        )
        y_std_var = theano.shared(
            np.ones((1, output_dim), dtype=theano.config.floatX),
            name="y_std",
            broadcastable=(True, False)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        # Un-normalize predictions back to the original output scale.
        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(
            mean=normalized_means_var, log_std=normalized_log_stds_var)

        # KL between old and new predictive distributions (in normalized space),
        # used as the trust-region constraint.
        mean_kl = TT.mean(dist.kl_sym(
            dict(mean=normalized_old_means_var,
                 log_std=normalized_old_log_stds_var),
            normalized_dist_info_vars,
        ))

        # Negative log-likelihood of the normalized targets.
        loss = - \
            TT.mean(dist.log_likelihood_sym(
                normalized_ys_var, normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
        self._subsample_factor = subsample_factor
        self._batchsize = batchsize
Beispiel #23
0
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        Categorical regressor: fits p(y | x) with an MLP ending in a softmax.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param prob_network: Optionally specify the probability network directly;
        the other network parameters are then ignored.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether to whiten inputs by shared mean/std.
        :param name: Optional name for the regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: L-BFGS, with a KL penalty when trust region is on.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        # Symbolic inputs: raw xs, one-hot integer targets, and the previous
        # predicted probabilities (for the KL trust region).
        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        # Input whitening statistics; presumably updated during fit() when
        # normalize_inputs is True — confirm against the rest of the class.
        x_mean_var = theano.shared(
            np.zeros((1,) + input_shape),
            name="x_mean",
            broadcastable=(True,) + (False,) * len(input_shape)
        )
        x_std_var = theano.shared(
            np.ones((1,) + input_shape),
            name="x_std",
            broadcastable=(True,) + (False,) * len(input_shape)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        # KL between old and new predictive distributions (trust-region metric).
        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood of the targets.
        loss = - TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard prediction: one-hot of the argmax class.
        predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Beispiel #24
0
    def __init__(
        self,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        name=None,
    ):
        """
        Gaussian MLP regressor: fits y ~ N(mean(x), std(x)^2).

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: L-BFGS, with a KL penalty when trust region is on.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self._optimizer = optimizer

        if mean_network is None:
            mean_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            # State-dependent log-std network sharing the mean network's input.
            l_log_std = MLP(
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            # Single (optionally learnable) log-std parameter, independent of x.
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        # Input/output whitening statistics; presumably updated during fit()
        # when normalization is enabled — confirm against the rest of the class.
        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))
        y_mean_var = theano.shared(np.zeros((1, output_dim)),
                                   name="y_mean",
                                   broadcastable=(True, False))
        y_std_var = theano.shared(np.ones((1, output_dim)),
                                  name="y_std",
                                  broadcastable=(True, False))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        # Un-normalize predictions back to the original output scale.
        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        # NOTE(review): DiagonalGaussian is constructed without a dimension
        # argument here, unlike sibling regressors that pass output_dim —
        # confirm the constructor's default handles this correctly.
        dist = self._dist = DiagonalGaussian()

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        # KL between old and new predictive distributions (trust-region metric).
        mean_kl = TT.mean(
            dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        # Negative log-likelihood of the normalized targets.
        loss = -TT.mean(
            dist.log_likelihood_sym(normalized_ys_var,
                                    normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
Beispiel #25
0
    def __init__(
        self,
        env_spec,
        env,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
        pkl_path=None,
    ):
        """
        Gaussian MLP policy whose input is the robot observation concatenated
        (or bilinearly integrated) with a sampled latent variable; optionally
        warm-started from a pickled policy.

        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or whenever
        agent is reset, which can happen several times along the rollout with rollout in utils_snn)
        :param min_std: lower clamp applied to the std for numerical stability
        :param pkl_path: if given, hyperparameters and weights are restored
        from this pickle (relative to config.PROJECT_PATH)
        """
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        self.pre_fix_latent = np.array(
            []
        )  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array(
            [])  # this will hold the latents variable sampled in reset()
        self._set_std_to_0 = False

        self.pkl_path = pkl_path

        # Warm start: override the latent/architecture hyperparameters with
        # those of the previously trained policy stored in the pickle.
        if self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this could not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        # Latent prior: fixed parameters of the chosen latent distribution.
        if latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions from env!
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        # Network input: robot obs + latent, plus the outer product terms when
        # bilinear integration is enabled.
        if self.bilinear_integration:
            obs_dim = self.obs_robot_dim + self.latent_dim +\
                      self.obs_robot_dim * self.latent_dim
        else:
            obs_dim = self.obs_robot_dim + self.latent_dim  # here only if concat.

        action_dim = env_spec.action_space.flat_dim

        # for _ in range(10):
        #     print("OK!")
        # print(obs_dim)
        # print(env_spec.observation_space.flat_dim)
        # print(self.latent_dim)

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            # State-dependent log-std network sharing the mean network's input.
            log_std_network = MLP(input_shape=(obs_dim, ),
                                  input_var=obs_var,
                                  output_dim=action_dim,
                                  hidden_sizes=std_hidden_sizes,
                                  hidden_nonlinearity=std_hidden_nonlinearity,
                                  output_nonlinearity=None,
                                  name="log_stdMLP")
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            # Single (optionally learnable) log-std parameter.
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        # NOTE(review): the pickle is loaded a second time here just for the
        # weights; the earlier load could be reused.
        if self.pkl_path:  # restore from pkl file
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        # Clamp the log-std from below for numerical stability.
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_restorable, self).__init__(env_spec)

        # Forward pass: distribution parameters for a batch of (obs + latent).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        Categorical regressor: fits p(y | x) with an MLP ending in a softmax.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param prob_network: Optionally specify the probability network directly;
        the other network parameters are then ignored.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Whether to whiten inputs by shared mean/std.
        :param name: Optional name for the regressor.
        """
        Serializable.quick_init(self, locals())

        # Default optimizer: L-BFGS, with a KL penalty when trust region is on.
        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        # Symbolic inputs: raw xs, one-hot integer targets, and the previous
        # predicted probabilities (for the KL trust region).
        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        # Input whitening statistics; presumably updated during fit() when
        # normalize_inputs is True — confirm against the rest of the class.
        x_mean_var = theano.shared(
            np.zeros((1,) + input_shape),
            name="x_mean",
            broadcastable=(True,) + (False, ) * len(input_shape)
        )
        x_std_var = theano.shared(
            np.ones((1,) + input_shape),
            name="x_std",
            broadcastable=(True,) + (False, ) * len(input_shape)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        # KL between old and new predictive distributions (trust-region metric).
        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood of the targets.
        loss = - TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard prediction: one-hot of the argmax class.
        predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
 def get_param_values(self, **tags):
     """Return the flattened parameter values, delegating to LasagnePowered."""
     values = LasagnePowered.get_param_values(self, **tags)
     return values
Beispiel #28
0
 def set_param_values(self, flattened_params, **tags):
     """Load parameters from a flat vector, delegating to LasagnePowered."""
     result = LasagnePowered.set_param_values(self, flattened_params, **tags)
     return result
Beispiel #29
0
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            dist_cls=DiagonalGaussian,
    ):
        """
        Gaussian MLP policy with a hard-coded observation dimension.

        :param env_spec: environment spec; the action space must be Discrete here
            (NOTE(review): unusual for a Gaussian policy -- the other variants in
            this file assert Box; confirm this pairing is intentional).
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: if True, the log-std is produced by its own MLP;
            otherwise it is a single state-independent parameter vector
        :param std_share_network: currently unused in this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity: nonlinearity for the std network's hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param dist_cls: distribution class used for the action distribution
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Discrete)

        #obs_dim = env_spec.observation_space.flat_dim
        # NOTE(review): observation dim is hard-coded to 6400 (presumably a
        # flattened 80x80 pixel observation) instead of using the env spec's
        # flat_dim above -- confirm this is deliberate for this experiment.
        obs_dim = 6400
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent log-std: a separate MLP sharing the input layer.
                std_network = MLP(
                    input_shape=(obs_dim,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                # State-independent log-std: one trainable parameter per action dim.
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            # Clamp the log-std from below for numerical stability.
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Compiled function: observations -> (mean, log std).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, ),
        state_include_action=True,
        hidden_nonlinearity=NL.tanh,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
    ):
        """
        Gaussian policy whose mean is produced by a single-layer GRU network,
        with a state-independent, trainable log standard deviation.

        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers;
            must contain exactly one entry (a single GRU layer is supported).
        :param state_include_action: if True, the previous action is concatenated
            to the observation as the recurrent input.
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param learn_std: whether the log-std parameter is trainable
        :param init_std: initial standard deviation
        :param output_nonlinearity: nonlinearity applied to the mean output
        :return:
        """
        Serializable.quick_init(self, locals())
        super(GaussianGRUPolicy, self).__init__(env_spec)

        # Only a single recurrent hidden layer is supported.
        assert len(hidden_sizes) == 1

        if state_include_action:
            obs_dim = env_spec.observation_space.flat_dim + env_spec.action_space.flat_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        mean_network = GRUNetwork(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        # State-independent log-std for full-sequence outputs ...
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        # ... shared (same underlying parameter) with the single-step layer.
        l_step_log_std = ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._l_log_std = l_log_std
        self._state_include_action = state_include_action

        # One-step function: (input, prev hidden) -> (mean, log std, next hidden).
        self._f_step_mean_std = ext.compile_function(
            [
                mean_network.step_input_layer.input_var,
                mean_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer, l_step_log_std,
                mean_network.step_hidden_layer
            ]))

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentDiagonalGaussian(action_dim)

        self.reset()
        self.set_greedy(False)
        LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
Beispiel #31
0
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32,),
            state_include_action=True,
            hidden_nonlinearity=NL.tanh,
            learn_std=True,
            init_std=1.0,
            output_nonlinearity=None,
    ):
        """
        Gaussian GRU policy variant whose mean network ends in a softmax.

        NOTE(review): a softmax-constrained mean combined with a diagonal
        Gaussian distribution is unusual -- confirm this output nonlinearity
        is intentional (the sibling variant above uses `output_nonlinearity`).

        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers;
            must contain exactly one entry (single GRU layer).
        :param state_include_action: if True, the previous action is concatenated
            to the observation as the recurrent input.
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param learn_std: whether the log-std parameter is trainable
        :param init_std: initial standard deviation
        :param output_nonlinearity: accepted but not used by the mean network
            (softmax is hard-coded below)
        :return:
        """
        Serializable.quick_init(self, locals())
        super(GaussianGRUPolicy, self).__init__(env_spec)

        # Only a single recurrent hidden layer is supported.
        assert len(hidden_sizes) == 1

        if state_include_action:
            obs_dim = env_spec.observation_space.flat_dim + env_spec.action_space.flat_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        mean_network = GRUNetwork(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

        # NOTE(review): l_mean and obs_var below are assigned but unused here.
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        # State-independent log-std for full-sequence outputs ...
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        # ... shared (same underlying parameter) with the single-step layer.
        l_step_log_std = ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._l_log_std = l_log_std
        self._state_include_action = state_include_action

        # One-step function: (input, prev hidden) -> (mean, log std, next hidden).
        self._f_step_mean_std = ext.compile_function(
            [
                mean_network.step_input_layer.input_var,
                mean_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer,
                l_step_log_std,
                mean_network.step_hidden_layer
            ])
        )

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentDiagonalGaussian(action_dim)

        self.reset()

        LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
Beispiel #32
0
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
    ):
        """
        Gaussian MLP policy for continuous (Box) action spaces.

        :param env_spec: environment spec; the action space must be a Box
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: if True, the log-std is produced by its own MLP;
            otherwise it is a single state-independent parameter vector
        :param std_share_network: currently unused in this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity: nonlinearity for the std network's hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        mean_network = MLP(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        if adaptive_std:
            # State-dependent log-std: a separate MLP sharing the input layer.
            std_network = MLP(
                input_shape=(obs_dim,),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            # State-independent log-std: one trainable parameter per action dim.
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            # Clamp the log-std from below for numerical stability.
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        # BUG FIX: DiagonalGaussian requires the distribution dimension; it was
        # previously constructed with no argument, unlike every other use in
        # this file (e.g. `dist_cls(action_dim)`).
        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Compiled function: observations -> (mean, log std).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
    def __init__(
        self,
        input_shape,
        output_dim,
        predict_all=False,  # CF
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        Recurrent categorical regressor built on a GRU network, trained by
        minimizing the negative log-likelihood (optionally under a KL trust
        region constraint).

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param predict_all: if True, predict at every timestep; otherwise only
            the last timestep's prediction is used (tiled across time).
        :param prob_network: custom probability network; if None a GRUNetwork
            with a softmax output is created.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: whether inputs are normalized by running mean/std
        :param name: optional name for this regressor
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = GRUNetwork(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_dim=hidden_sizes[0],  # this gives 32 by default
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        # Symbolic inputs: 3D (trajectory, time, feature) tensors.
        xs_var = prob_network.input_layer.input_var
        ys_var = TT.itensor3("ys")
        old_prob_var = TT.tensor3("old_prob")

        # Running input statistics, broadcastable over trajectory and time axes.
        x_mean_var = theano.shared(
            np.zeros(
                (
                    1,
                    1,
                ) + input_shape
            ),  # this syntax makes the shape (1,1,*input_shape,). The first is traj
            name="x_mean",
            broadcastable=(
                True,
                True,
            ) + (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((
            1,
            1,
        ) + input_shape),
                                  name="x_std",
                                  broadcastable=(
                                      True,
                                      True,
                                  ) + (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var_all = L.get_output(
            l_prob, {prob_network.input_layer: normalized_xs_var})

        if predict_all:
            prob_var = prob_var_all
        else:
            # take only the last timestep but keep the 3D shape
            prob_var_last = TT.reshape(
                prob_var_all[:, -1, :],
                (TT.shape(prob_var_all)[0], 1, TT.shape(prob_var_all)[2]))
            # pad along the time dimension to obtain the same shape as before
            padded_prob_var = TT.tile(prob_var_last,
                                      (1, TT.shape(prob_var_all)[1], 1))
            # give it the standard name
            prob_var = padded_prob_var

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        # Mean KL between old and new predicted distributions (trust region).
        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood training loss.
        loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard (one-hot) predictions, reshaped back to the 3D prob shape.
        predicted_flat = special.to_onehot_sym(
            TT.flatten(TT.argmax(prob_var, axis=-1)), output_dim)
        predicted = TT.reshape(predicted_flat, TT.shape(prob_var))

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
    def __init__(
            self,
            env_spec,
            hidden_dim=32,
            feature_network=None,
            state_include_action=True,
            hidden_nonlinearity=NL.tanh):
        """
        Categorical policy with a GRU over (optionally feature-extracted)
        observation/action inputs.

        :param env_spec: A spec for the env; the action space must be Discrete.
        :param hidden_dim: dimension of hidden layer
        :param feature_network: optional network producing flat features from
            the raw input before the GRU; if None the raw input is used.
        :param state_include_action: if True, the previous action is
            concatenated to the observation as the recurrent input.
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        # 3D input: (batch, time, input_dim), with variable batch/time.
        l_input = L.InputLayer(
            shape=(None, None, input_dim),
            name="input"
        )

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            # Reshape the feature network's flat (batch*time, feat) output back
            # into the (batch, time, feat) layout the GRU expects.
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            l_feature = OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: TT.reshape(
                    flat_feature,
                    [input.shape[0], input.shape[1], feature_dim]
                ),
                shape_op=lambda _, input_shape: (input_shape[0], input_shape[1], feature_dim)
            )

        prob_network = GRUNetwork(
            input_shape=(feature_dim,),
            input_layer=l_feature,
            output_dim=env_spec.action_space.n,
            hidden_dim=hidden_dim,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=TT.nnet.softmax,
            name="prob_network"
        )

        self.prob_network = prob_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        flat_input_var = TT.matrix("flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(l_flat_feature, {feature_network.input_layer: flat_input_var})

        # One-step function: (flat input, prev hidden) -> (action probs, next hidden).
        self.f_step_prob = ext.compile_function(
            [
                flat_input_var,
                prob_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                prob_network.step_output_layer,
                prob_network.step_hidden_layer
            ], {prob_network.step_input_layer: feature_var})
        )

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.prev_action = None
        self.prev_hidden = None
        self.dist = RecurrentCategorical(env_spec.action_space.n)

        # Register the feature network's parameters too, when present.
        out_layers = [prob_network.output_layer]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LasagnePowered.__init__(self, out_layers)
Beispiel #35
0
 def get_param_values(self, **tags):
     """Return the flattened parameter values, delegating to LasagnePowered."""
     return LasagnePowered.get_param_values(self, **tags)
Beispiel #36
0
    def __init__(
        self,
        env_spec,
        zero_gradient_cutoff,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        adversarial=True,
        eps=0.1,
        probability=0.0,
        use_dynamics=False,
        random=False,
        observable_noise=False,
        use_max_norm=True,
        record_traj=False,
        set_dynamics=None,
        mask_augmentation=False,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param dist_cls: defines probability distribution over actions

        The following parameters are specific to the AdversarialPolicy Class.

        :param adversarial: whether the policy should incorporate adversarial states during rollout
        :param eps: the strength of the adversarial perturbation
        :param probability: frequency of adversarial updates. If 0, do exactly one update at the beginning of
                            every episode
        :param use_dynamics: if True, generate adversarial dynamics updates, otherwise do adversarial state updates
        :param random: if True, use a random perturbation instead of an adversarial perturbation
        :param observable_noise: if True, don't set adversarial state in the environment, treat it as noise
                                 on observation
        :param zero_gradient_cutoff: determines cutoff index for zero-ing out gradients - this is useful when doing
                                     adversarial dynamics vs. adversarial states, when we only want to compute
                                     gradients for one section of the augmented state vector. We also use this to
                                     determine what the original, non-augmented state size is.
        :param use_max_norm: if True, use Fast Gradient Sign Method (FGSM) to generate adversarial perturbations, else
                             use full gradient ascent
        :param record_traj: if True, rollout dictionaries will contain qpos and qvel trajectories. This is useful for
                            plotting trajectories.
        :param set_dynamics: if provided, the next rollout initializes the environment to the passed dynamics.
        :param mask_augmentation: if True, don't augment the state (even though the environment augments the state with 
                                  the dynamics parameters, the policy will ignore these dimensions)

        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # TODO: make a more elegant solution to this
        # This is here because we assume the original, unaugmented state size is provided.
        assert (zero_gradient_cutoff is not None)

        # if we're ignoring state augmentation, modify observation size / network size accordingly
        if mask_augmentation:
            obs_dim = zero_gradient_cutoff

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent log-std: a separate MLP sharing the input layer.
                std_network = MLP(
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                # State-independent log-std: one trainable parameter per action dim.
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            # Clamp the log-std from below for numerical stability.
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        # take exponential for the actual standard dev
        self._tru_std_var = TT.exp(self._log_std_var)

        # take gradients of mean network, exponential of std network wrt L2 norm
        # (disconnected_inputs='warn' because a non-adaptive std does not depend
        # on the observation at all)
        self._mean_grad = theano.grad(self._mean_var.norm(2), obs_var)
        self._std_grad = theano.grad(self._tru_std_var.norm(2),
                                     obs_var,
                                     disconnected_inputs='warn')

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(AdversarialPolicy, self).__init__(env_spec)

        # Compiled function: observations -> (mean, log std).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        # function to get gradients
        self._f_grad_dist = ext.compile_function(
            inputs=[obs_var], outputs=[self._mean_grad, self._std_grad])

        # initialize adversarial parameters
        self.adversarial = adversarial
        self.eps = eps
        self.probability = probability
        self.use_dynamics = use_dynamics
        self.random = random
        self.observable_noise = observable_noise
        self.zero_gradient_cutoff = zero_gradient_cutoff
        self.use_max_norm = use_max_norm
        self.record_traj = record_traj
        self.set_dynamics = set_dynamics
        self.mask_augmentation = mask_augmentation
Beispiel #37
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        hlc_output_dim=0,
        subnet_split1=[2, 3, 4, 11, 12, 13],
        subnet_split2=[5, 6, 7, 14, 15, 16],
        sub_out_dim=3,
        option_dim=4,
    ):
        """
        Hierarchical Gaussian MLP policy whose mean is produced by an
        HMLPPhase network (high-level controller plus per-leg sub-networks).

        :param env_spec: environment spec; the action space must be a Box
        :param hidden_sizes: sizes of the fully-connected hidden layers
        :param learn_std: whether the log-std parameter is trainable
        :param init_std: initial standard deviation
        :param adaptive_std: if True, the log-std is produced by its own MLP;
            otherwise it is a single state-independent parameter vector
        :param std_share_network: currently unused in this constructor
        :param std_hidden_sizes: hidden-layer sizes for the std network
        :param min_std: lower bound on std to avoid numerical issues
        :param std_hidden_nonlinearity: nonlinearity for the std network
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param dist_cls: distribution class used for the action distribution
        :param hlc_output_dim: output dim of the high-level controller
        :param subnet_split1: observation indices routed to sub-network 1
            (note: mutable default -- callers must not mutate it)
        :param subnet_split2: observation indices routed to sub-network 2
            (note: mutable default -- callers must not mutate it)
        :param sub_out_dim: output dim of each sub-network
        :param option_dim: number of high-level options
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = HMLPPhase(
                hidden_sizes,
                hidden_nonlinearity,
                input_shape=(obs_dim, ),
                subnet_split1=subnet_split1,
                subnet_split2=subnet_split2,
                hlc_output_dim=hlc_output_dim,
                sub_out_dim=sub_out_dim,
                option_dim=option_dim,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent log-std: a separate MLP sharing the input layer.
                std_network = MLP(
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                # State-independent log-std: one trainable parameter per action dim.
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            # Clamp the log-std from below for numerical stability.
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Compiled function: observations -> (mean, log std).
        # BUG FIX: this function was previously compiled twice back-to-back;
        # the redundant second compilation has been removed.
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        # Diagnostic function exposing the hierarchical controller's internal
        # signals (high-level commands and per-leg outputs).
        self.hidden_signals = ext.compile_function(
            inputs=[obs_var],
            outputs=[
                mean_network.hlc_signal1, mean_network.hlc_signal2,
                mean_network.leg1_part, mean_network.leg2_part
            ])
Beispiel #38
0
def run_task(vv, log_dir=None, exp_name=None):
    """Build environment, policy, baseline and algorithm for one experiment
    variant and run training on the circle-following task.

    Uses the module-level globals ``policy`` and ``baseline`` so pre-trained
    networks survive across successive calls (the ``else`` branch below
    re-enables exploration on a reused policy).

    :param vv: variant dict; keys read here: 'model_type', 'robot_type',
        'use_ros', 'target_velocity', 'radius', 'dt', 'eps', 'algo'.
    :param log_dir: unused in this body; presumably consumed by the
        experiment launcher -- TODO confirm
    :param exp_name: unused in this body; presumably consumed by the
        experiment launcher -- TODO confirm
    """
    global policy
    global baseline

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Check if variant is available
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment. The ROS-backed variant is imported lazily so the
    # simulation-only path carries no ROS dependency.
    if not vv['use_ros']:
        env = CircleEnv(target_velocity=vv['target_velocity'],
                        radius=vv['radius'],
                        dt=vv['dt'],
                        model_type=vv['model_type'],
                        robot_type=vv['robot_type'])
    else:
        from aa_simulation.envs.circle.circle_env_ros import CircleEnvROS
        env = CircleEnvROS(target_velocity=vv['target_velocity'],
                           radius=vv['radius'],
                           dt=vv['dt'],
                           model_type=vv['model_type'],
                           robot_type=vv['robot_type'])

    # Save variant information for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Set variance for each action component separately for exploration
    # Note: We set the variance manually because we are not scaling our
    #       action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]

    # Build policy and baseline networks
    # Note: Mean of policy network set to analytically computed values for
    #       faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        wheelbase = 0.257
        # Steering angle that tracks a circle of the given radius for a
        # bicycle model with this wheelbase (counter-clockwise).
        target_velocity = vv['target_velocity']
        target_steering = np.arctan(wheelbase / vv['radius'])  # CCW
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In mean network, allow output b values to dominate final output
        # value by constraining the magnitude of the output W matrix. This is
        # to allow faster learning. These numbers are arbitrarily chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)

        mean_network = MLP(input_shape=(env.spec.observation_space.flat_dim, ),
                           output_dim=env.spec.action_space.flat_dim,
                           hidden_sizes=hidden_sizes,
                           hidden_nonlinearity=LN.tanh,
                           output_nonlinearity=None,
                           output_W_init=LI.GlorotUniform(gain=W_gain),
                           output_b_init=output_mean)
        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=hidden_sizes,
                                   init_std=init_std,
                                   mean_network=mean_network)
        baseline = LinearFeatureBaseline(env_spec=env.spec,
                                         target_key='returns')

    # Reset variance to re-enable exploration when using pre-trained networks:
    # rebuild the constant log-std layer, re-register the policy's output
    # layers and recompile its distribution function.
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True)
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output(
            [policy._l_mean, policy._l_log_std])
        # NOTE(review): only _log_std_var is refreshed here; policy._mean_var
        # keeps its old symbolic value -- confirm that nothing downstream
        # reads the stale attribute.
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(inputs=[obs_var],
                                              outputs=[mean_var, log_std_var])

    # Separate baseline for the safety signal used by the CPO constraint.
    safety_baseline = LinearFeatureBaseline(env_spec=env.spec,
                                            target_key='safety_returns')

    safety_constraint = CircleSafetyConstraint(max_value=1.0,
                                               eps=vv['eps'],
                                               baseline=safety_baseline)

    if vv['algo'] == 'TRPO':
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=600,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
        )
    else:
        algo = CPO(env=env,
                   policy=policy,
                   baseline=baseline,
                   safety_constraint=safety_constraint,
                   batch_size=600,
                   max_path_length=env.horizon,
                   n_itr=600,
                   discount=0.99,
                   step_size=trpo_stepsize,
                   gae_lambda=0.95,
                   safety_gae_lambda=1,
                   optimizer_args={'subsample_factor': trpo_subsample_factor},
                   plot=False)
    algo.train()
 def set_param_values(self, flattened_params, **tags):
     """Set all network parameters from a single flat vector.

     Delegates to ``LasagnePowered.set_param_values``, which unflattens
     *flattened_params* across the registered output layers' parameters;
     *tags* filter which parameters are touched (e.g. ``trainable=True``).
     """
     return LasagnePowered.set_param_values(self, flattened_params, **tags)
Beispiel #40
0
    def __init__(
            self,
            env,
            obs_net_params,
            name,
            fusion_net_params=None,
            obs_indx=None,
            obs_shapes=None,
            action_dims=None,
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_net_parameters=None,
            std_fusion_net_params=None,
            std_share_network=False,
            min_std=1e-6,
            mean_network=None,
            std_network=None,
            use_flat_obs=True,
            dist_cls=DiagonalGaussian
    ):
        """
        Gaussian policy over multiple observation streams: per-observation
        networks are fused into one mean network; the log-std is either a
        separate network (adaptive) or a free constant parameter.

        :param env:
        :param obs_net_params: (list of dict) parameters for the observation networks
        :param name: (str) name is essential and should be consistent everywhere. Policy uses it to access relevant data
        :param fusion_net_params: (dict) parameters of the fusion network. If a single observation type is used you may set it to None
        :param obs_indx: (list of int) if obs is provided as a tuple, which indices in the tuple to use. If None is given, tries to use all indices
        :param obs_shapes: (list of tuples of int) observation shapes for manual assignment (in case some other observations might be used)
        :param action_dims: (list of int or just int) number of actions. If None is provided, env actions are used. Providing a list of action dims also allows different nonlinearities for every subset of actions
        :param std_net_parameters: (dict) parameters for the std network
        :param learn_std: is std trainable (does not need std network parameters)
        :param init_std: initial std
        :param adaptive_std: (bool) should std be learnable. If True, specify std_net_parameters
        :param std_share_network:
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param mean_network: custom network for the output mean
        :param use_flat_obs: if set then one must provide observations in flat form even if they are images (inside they are reshaped). For some reason it breaks if they are not flat, so use True for now
        :param std_network: custom network for the output log std
        :return:
        """
        self.env_observation_space = copy.deepcopy(env.observation_space)
        self.name = name
        self.obs_indx = obs_indx
        # Update observation indices and shapes
        self.obs_shapes = self.checkListOfTuples(obs_shapes)
        self.use_flat_obs = use_flat_obs

        # Capture constructor arguments for serialization; must run while
        # locals() still mirrors the argument list.
        Serializable.quick_init(self, locals())
        assert isinstance(env.action_space, Box)

        # If not specified - use action space of the environment
        pf.print_sec0('MULTIOBSERVATION POLICY:')
        if action_dims is None:
            assert env is not None, "ERROR: getNetwork(): Either provide env or output_dim"
            action_dims = np.prod(env.action_space.shape)

        # create network
        if mean_network is None:
            mean_network = self.getNetwork(obs_net_params=obs_net_params,
                                           fusion_net_params=fusion_net_params,
                                           obs_shapes=self.obs_shapes,
                                           output_dim=action_dims,
                                           name=name + '_mean_net',
                                           use_flat_obs=self.use_flat_obs,
                                           env=env)
            ## Typical parameters
            # hidden_sizes=(32, 32),
            # hidden_nonlinearity=NL.tanh,
            # output_nonlinearity=None,

        self._mean_network = mean_network
        l_mean = mean_network.output_layer

        # One symbolic input variable per observation stream.
        self.input_vars = []
        for layer in mean_network.input_layers:
            self.input_vars.append(layer.input_var)

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent log-std network sharing the mean network's
                # input layers.
                std_network = self.getNetwork(obs_net_params=std_net_parameters,
                                              fusion_net_params=std_fusion_net_params,
                                              obs_layers=mean_network.input_layers,
                                              output_dim=action_dims,
                                              env=env,
                                              use_flat_obs=self.use_flat_obs,
                                              name=name + '_std_net')
                ## Typical parameters:
                # input_shape=(obs_dim,)
                # std_hidden_sizes = (32,32)
                # std_hidden_nonlinearity = NL.tanh
                # output_nonlinearity=None
                l_log_std = std_network.output_layer
            else:
                # State-independent log-std: one free parameter per action
                # unit, initialized to log(init_std).
                action_nums = np.sum(action_dims)
                l_log_std = ParamMultiInLayer(
                    mean_network.input_layers,
                    num_units=action_nums,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self._std_network = std_network
        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        # Clamp the log-std from below to avoid numerical issues.
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dims)
        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMultiObsPolicy, self).__init__(env)

        # Compiled function: observations -> (mean, log_std).
        self._f_dist = ext.compile_function(
            inputs=self.input_vars,
            outputs=[mean_var, log_std_var],
        )
    def __init__(
            self,
            name,
            env_spec,
            conv_filters, conv_filter_sizes, conv_strides, conv_pads,
            hidden_sizes=[],
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax,
            prob_network=None,
            feature_layer_index=-2,
            eps=0,
    ):
        """
        The policy consists of several convolution layers followed by fc layers and softmax
        :param name: name of the policy; NOTE(review): captured by quick_init
        but otherwise unused in this body -- confirm intended use
        :param env_spec: A spec for the mdp.
        :param conv_filters, conv_filter_sizes, conv_strides, conv_pads: specify the convolutional layers. See rllab.core.network.ConvNetwork for details.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        (defaults to softmax so the outputs form a probability distribution)
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :param feature_layer_index: index of the feature layer. Default -2 means the last layer before fc-softmax
        :param eps: mixture weight on uniform distribution; useful to force exploration
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        self._env_spec = env_spec

        if prob_network is None:
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                # Fix: honor the output_nonlinearity argument instead of
                # hard-coding NL.softmax; the default value is still
                # NL.softmax, so default behavior is unchanged.
                output_nonlinearity=output_nonlinearity,
                name="prob_network",
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer

        # mix in uniform distribution: final = (1-eps)*net + eps*uniform.
        # eps lives in a shared variable so it can be annealed without
        # recompiling.
        n_actions = env_spec.action_space.n
        uniform_prob = np.ones(n_actions,dtype=theano.config.floatX) / n_actions
        eps_var = theano.shared(
            eps,
            name="eps",
        )
        nn_prob = L.get_output(prob_network.output_layer)
        final_prob = (1-eps_var) * nn_prob + eps_var * uniform_prob
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            final_prob,
        )
        self._eps_var = eps_var

        # Expose the activations of an intermediate layer as "features".
        self._feature_layer_index = feature_layer_index
        feature_layer = L.get_all_layers(prob_network.output_layer)[feature_layer_index] # layer before fc-softmax
        self._f_feature = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(feature_layer)
        )
        self._feature_shape = L.get_output_shape(feature_layer)[1:]

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalConvPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Beispiel #42
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        split_masks=None,
        dist_cls=DiagonalGaussian,
        mp_dim=0,
        mp_sel_hid_dim=0,
        mp_sel_num=0,
        mp_projection_dim=2,
        net_mode=0,  # 0: vanilla, 1: append mp to second layer, 2: project mp to lower space, 3: mp selection blending, 4: mp selection discrete
        split_init_net=None,
        split_units=None,
        wc_net_path=None,
        learn_segment=False,
        split_num=1,
        split_layer=[0],
        split_std=False,
        task_id=0,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param net_mode: selects the mean-network architecture; modes 0-4 are
            described inline above. Modes 5-9 use the MLP_Split /
            MLP_SplitAct / MLP_SoftSplit / MLP_MaskedSplit /
            MLP_MaskedSplitCont variants, most of which are initialized from
            ``split_init_net`` -- semantics depend on those classes,
            TODO confirm against their definitions.
        :param split_init_net: previously trained policy whose mean network
            (and, for modes 6-9, log-std parameter) seeds the new network
        :param split_masks, split_units, split_num, split_layer, split_std,
            task_id, wc_net_path, learn_segment: extra configuration consumed
            only by the corresponding split / selection net modes
        :param mp_dim, mp_sel_hid_dim, mp_sel_num, mp_projection_dim:
            model-parameter (mp) dimensions used by modes 1-4
        :return:
        """
        # Capture constructor arguments for serialization; must run while
        # locals() still mirrors the argument list.
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network -- net_mode dispatches between architecture variants
        if mean_network is None:
            if net_mode == 1:
                mean_network = MLPAppend(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    append_dim=mp_dim,
                )
            elif net_mode == 2:
                mean_network = MLP_PROJ(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    mp_dim=mp_dim,
                    mp_hid_dim=16,
                    mp_proj_dim=mp_projection_dim,
                )
            elif net_mode == 3:
                mean_network = MLP_PS(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    mp_dim=mp_dim,
                    mp_sel_hid_dim=mp_sel_hid_dim,
                    mp_sel_num=mp_sel_num,
                )
            elif net_mode == 4:
                # Loads a pre-trained weight-classification net from disk.
                wc_net = joblib.load(wc_net_path)
                mean_network = MLP_PSD(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    mp_dim=mp_dim,
                    mp_sel_hid_dim=mp_sel_hid_dim,
                    mp_sel_num=mp_sel_num,
                    wc_net=wc_net,
                    learn_segment=learn_segment,
                )
            elif net_mode == 5:
                mean_network = MLP_Split(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_layer=split_layer,
                    split_num=split_num,
                )
            elif net_mode == 6:
                mean_network = MLP_SplitAct(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_num=split_num,
                    split_units=split_units,
                    init_net=split_init_net._mean_network,
                )
            elif net_mode == 7:
                mean_network = MLP_SoftSplit(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_num=split_num,
                    init_net=split_init_net._mean_network,
                )
            elif net_mode == 8:
                mean_network = MLP_MaskedSplit(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_num=split_num,
                    split_masks=split_masks,
                    init_net=split_init_net._mean_network,
                )
            elif net_mode == 9:
                mean_network = MLP_MaskedSplitCont(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    task_id=task_id,
                    init_net=split_init_net._mean_network,
                )
            else:
                # Mode 0 (vanilla) and any unrecognized mode fall back to a
                # plain MLP.
                mean_network = MLP(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent log-std MLP sharing the mean network input.
                std_network = MLP(
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                # State-independent log-std parameter; mode 8 with split_std
                # uses a per-split variant seeded from split_init_net.
                if net_mode != 8 or not split_std:
                    l_log_std = ParamLayer(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=lasagne.init.Constant(np.log(init_std)),
                        name="output_log_std",
                        trainable=learn_std,
                    )
                else:
                    l_log_std = ParamLayerSplit(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=lasagne.init.Constant(np.log(init_std)),
                        name="output_log_std",
                        trainable=learn_std,
                        split_num=split_num,
                        init_param=split_init_net.get_params()[-1])
                # Copy the log-std values from the seed policy for the
                # warm-started modes (assumes get_params()[-1] is its
                # log-std parameter -- TODO confirm).
                if net_mode == 6 or net_mode == 7 or (net_mode == 8
                                                      and not split_std):
                    l_log_std.get_params()[0].set_value(
                        split_init_net.get_params()[-1].get_value())
                if net_mode == 9:
                    # Mode 9 widens exploration by +0.5 in log space.
                    l_log_std.get_params()[0].set_value(
                        split_init_net.get_params()[-1].get_value() + 0.5)

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        # Clamp the log-std from below to avoid numerical issues.
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std
        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Compiled function: observation -> (mean, log_std).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        # Diagnostics for the selection/blending modes: expose the blend
        # weights plus two entropy measures (per-sample weight entropy and
        # entropy of the batch-averaged weights).
        if net_mode == 3 or net_mode == 4:
            self._f_blendweight = ext.compile_function(
                inputs=[obs_var], outputs=[self._mean_network._blend_weights])
            entropy = -TT.mean(self._mean_network._blend_weights *
                               TT.log(self._mean_network._blend_weights))
            self._f_weightentropy = ext.compile_function(inputs=[obs_var],
                                                         outputs=[entropy])
            avg_weights = TT.mean(self._mean_network._blend_weights, axis=0)
            entropy2 = -TT.mean(avg_weights * TT.log(avg_weights))
            self._f_choiceentropy = ext.compile_function(inputs=[obs_var],
                                                         outputs=[entropy2])