Ejemplo n.º 1
0
    def __init__(
        self,
        env,
        scale_reward=1.,
        normalize_obs=False,
        normalize_reward=False,
        obs_alpha=0.001,
        reward_alpha=0.001,
        normalization_scale=1.,
        dummy_flag=False,
    ):
        """Wrap ``env`` with optional observation / reward normalization.

        :param env: environment to wrap
        :param scale_reward: multiplicative factor applied to rewards
        :param normalize_obs: if True, normalize observations with running stats
        :param normalize_reward: if True, normalize rewards with running stats
        :param obs_alpha: EMA coefficient for the observation running stats
        :param reward_alpha: EMA coefficient for the reward running stats
        :param normalization_scale: extra scale applied after normalization
        :param dummy_flag: opaque flag stored for downstream use
            (NOTE(review): semantics not visible from this snippet)
        """
        # Capture constructor arguments for serialization support.
        Serializable.quick_init(self, locals())

        # BUG FIX: was hard-coded to 1, silently ignoring the
        # `scale_reward` argument passed by the caller.
        self._scale_reward = scale_reward
        self._wrapped_env = env

        self._normalize_obs = normalize_obs
        self._normalize_reward = normalize_reward
        self._obs_alpha = obs_alpha
        # Running mean/variance of observations (updated elsewhere).
        self._obs_mean = np.zeros(self.observation_space.shape)
        self._obs_var = np.ones(self.observation_space.shape)
        self._reward_alpha = reward_alpha
        self._reward_mean = 0.
        self._reward_var = 1.
        self._normalization_scale = normalization_scale
        self._dummy_flag = dummy_flag
Ejemplo n.º 2
0
    def __init__(
            self,
            name,
            env,
            dynamics_model,
            reward_model=None,
            discount=1,
            use_cem=False,
            n_candidates=1024,
            horizon=10,
            num_rollouts=10,
            context=False,
    ):
        """Model-predictive controller over a learned dynamics model.

        :param name: identifier for this controller
        :param env: (possibly wrapped) environment; must expose a ``reward`` fn
        :param dynamics_model: learned forward model used for planning
        :param reward_model: optional learned reward model (None = use env reward)
        :param discount: reward discount factor used during planning
        :param use_cem: if True plan with CEM, otherwise random shooting
        :param n_candidates: number of candidate action sequences per step
        :param horizon: planning horizon (number of forward steps)
        :param num_rollouts: number of parallel rollouts
        :param context: whether to condition planning on a context vector
        """
        self.dynamics_model = dynamics_model
        self.reward_model = reward_model
        self.discount = discount
        self.n_candidates = n_candidates
        self.horizon = horizon
        self.use_cem = use_cem
        self.env = env
        self.context = context
        # FIX: `num_rollouts` was accepted but never stored on the instance.
        self.num_rollouts = num_rollouts

        # Strip wrapper layers to reach the base environment.
        self.unwrapped_env = env
        while hasattr(self.unwrapped_env, 'wrapped_env'):
            self.unwrapped_env = self.unwrapped_env.wrapped_env

        # make sure that env has a reward function
        assert hasattr(self.unwrapped_env, 'reward'), "env must have a reward function"

        Serializable.quick_init(self, locals())
        super(MPCController, self).__init__(env=env)
Ejemplo n.º 3
0
 def __init__(self, observation_space, action_space):
     """Record the observation and action spaces for later lookup.

     :type observation_space: Space
     :type action_space: Space
     """
     # Capture constructor arguments so the object can be re-created
     # during (de)serialization.
     Serializable.quick_init(self, locals())
     self._action_space = action_space
     self._observation_space = observation_space
Ejemplo n.º 4
0
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(200, 200, 200, 200),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_size=128,
        learning_rate=0.001,
        normalize_input=True,
        optimizer=tf.compat.v1.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        n_forwards=30,
        n_candidates=2500,
        ensemble_size=5,
        n_particles=20,
        use_cem=False,
        deterministic=False,
        weight_decays=(0., 0., 0., 0., 0.),
        weight_decay_coeff=0.0,
        cp_hidden_sizes=(256, 128, 64),
        context_weight_decays=(0., 0., 0., 0.),
        context_out_dim=10,
        context_hidden_nonlinearity=tf.nn.relu,
        history_length=10,
        future_length=10,
        state_diff=False,
        back_coeff=0.0,
    ):
        """Build a context-conditioned probabilistic ensemble dynamics model.

        Constructs the full TF-1 graph: input placeholders, a context
        predictor, a forward dynamics MLP ensemble, an optional backward
        model (only when ``back_coeff > 0``), the training loss/optimizer,
        and compiled functions for CEM / random-shooting action selection
        and context inference.

        NOTE(review): depends on names defined elsewhere in the project
        (``self._activations``, ``PureEnsembleContextPredictor``,
        ``PlusCaDMEnsembleCEMMLP``, ``normalize``, ``tensor_utils``) —
        their exact semantics cannot be confirmed from this snippet.
        """

        # Capture constructor arguments for serialization support.
        Serializable.quick_init(self, locals())

        # Default Attributes
        self.env = env
        self.name = name
        self._dataset = None

        # Dynamics Model Attributes
        self.deterministic = deterministic

        # MPC Attributes
        self.n_forwards = n_forwards
        self.n_candidates = n_candidates
        self.use_cem = use_cem

        # Training Attributes
        self.weight_decays = weight_decays
        self.weight_decay_coeff = weight_decay_coeff
        self.normalization = None
        self.normalize_input = normalize_input
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        # PE-TS Attributes
        self.ensemble_size = ensemble_size
        self.n_particles = n_particles

        # CaDM Attributes
        self.cp_hidden_sizes = cp_hidden_sizes
        self.context_out_dim = context_out_dim
        self.history_length = history_length
        self.future_length = future_length
        self.context_weight_decays = context_weight_decays
        self.state_diff = state_diff
        self.back_coeff = back_coeff

        # Dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.proc_obs_space_dims = proc_obs_space_dims = env.proc_observation_space_dims
        # Discrete action spaces have an empty shape; use `.n` instead.
        if len(env.action_space.shape) == 0:
            self.action_space_dims = action_space_dims = env.action_space.n
            self.discrete = True
        else:
            self.action_space_dims = action_space_dims = env.action_space.shape[
                0]
            self.discrete = False

        # Map the given keys/callables to activation functions.
        # NOTE(review): assumes `self._activations` contains entries for the
        # passed values (e.g. tf.nn.relu, None) — confirm against the class.
        hidden_nonlinearity = self._activations[hidden_nonlinearity]
        output_nonlinearity = self._activations[output_nonlinearity]

        with tf.compat.v1.variable_scope(name):
            # placeholders
            # `bs_*` placeholders carry one batch per ensemble member
            # (leading dimension = ensemble_size).
            self.obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None,
                                                          obs_space_dims))
            self.obs_next_ph = tf.compat.v1.placeholder(tf.float32,
                                                        shape=(None,
                                                               obs_space_dims))
            self.act_ph = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None,
                                                          action_space_dims))
            # Context inputs: flattened history of obs/act pairs.
            self.cp_obs_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(None, obs_space_dims * self.history_length))
            self.cp_act_ph = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, action_space_dims * self.history_length))

            self.bs_obs_ph = tf.compat.v1.placeholder(tf.float32,
                                                      shape=(ensemble_size,
                                                             None,
                                                             obs_space_dims))
            self.bs_obs_next_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(ensemble_size, None, obs_space_dims))
            self.bs_act_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(ensemble_size, None, action_space_dims))
            self.bs_delta_ph = tf.compat.v1.placeholder(tf.float32,
                                                        shape=(ensemble_size,
                                                               None,
                                                               obs_space_dims))
            self.bs_back_delta_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(ensemble_size, None, obs_space_dims))
            self.bs_cp_obs_ph = tf.compat.v1.placeholder(
                tf.float32,
                shape=(ensemble_size, None,
                       obs_space_dims * self.history_length))
            self.bs_cp_act_ph = tf.compat.v1.placeholder(
                tf.float32,
                shape=(ensemble_size, None,
                       action_space_dims * self.history_length))

            # Normalization statistics, fed at train/inference time.
            self.norm_obs_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(proc_obs_space_dims, ))
            self.norm_obs_std_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(proc_obs_space_dims, ))
            self.norm_act_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(action_space_dims, ))
            self.norm_act_std_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(action_space_dims, ))
            self.norm_delta_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(obs_space_dims, ))
            self.norm_delta_std_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(obs_space_dims, ))
            self.norm_cp_obs_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(obs_space_dims * self.history_length, ))
            self.norm_cp_obs_std_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(obs_space_dims * self.history_length, ))
            self.norm_cp_act_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(action_space_dims * self.history_length, ))
            self.norm_cp_act_std_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(action_space_dims * self.history_length, ))
            self.norm_back_delta_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(obs_space_dims, ))
            self.norm_back_delta_std_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(obs_space_dims, ))

            # CEM warm-start distribution over action sequences.
            self.cem_init_mean_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(None, self.n_forwards, action_space_dims))
            self.cem_init_var_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(None, self.n_forwards, action_space_dims))

            # create MLP
            # Context predictor: encodes the (obs, act) history into a
            # latent context vector of size `context_out_dim`.
            with tf.compat.v1.variable_scope('context_model'):
                cp = PureEnsembleContextPredictor(
                    name,
                    output_dim=0,
                    input_dim=0,
                    context_dim=(obs_space_dims + action_space_dims) *
                    self.history_length,
                    context_hidden_sizes=self.cp_hidden_sizes,
                    output_nonlinearity=output_nonlinearity,
                    ensemble_size=self.ensemble_size,
                    context_weight_decays=self.context_weight_decays,
                    bs_input_cp_obs_var=self.bs_cp_obs_ph,
                    bs_input_cp_act_var=self.bs_cp_act_ph,
                    norm_cp_obs_mean_var=self.norm_cp_obs_mean_ph,
                    norm_cp_obs_std_var=self.norm_cp_obs_std_ph,
                    norm_cp_act_mean_var=self.norm_cp_act_mean_ph,
                    norm_cp_act_std_var=self.norm_cp_act_std_ph,
                    context_out_dim=self.context_out_dim,
                )
                self.bs_cp_var = cp.context_output_var

            # Forward dynamics ensemble; also builds the planning
            # (policy) graph since build_policy_graph=True.
            with tf.compat.v1.variable_scope('ff_model'):
                mlp = PlusCaDMEnsembleCEMMLP(
                    name,
                    # Inputs
                    input_dim=0,
                    output_dim=obs_space_dims,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    input_obs_dim=obs_space_dims,
                    input_act_dim=action_space_dims,
                    input_obs_var=self.obs_ph,
                    input_act_var=self.act_ph,
                    bs_input_obs_var=self.bs_obs_ph,
                    bs_input_act_var=self.bs_act_ph,
                    # CaDM
                    context_obs_var=self.cp_obs_ph,
                    context_act_var=self.cp_act_ph,
                    cp_forward=cp.forward,
                    bs_input_cp_var=self.bs_cp_var,
                    context_out_dim=self.context_out_dim,
                    # PE-TS
                    weight_decays=self.weight_decays,
                    deterministic=self.deterministic,
                    ensemble_size=self.ensemble_size,
                    n_particles=self.n_particles,
                    # Environments
                    obs_preproc_fn=env.obs_preproc,
                    obs_postproc_fn=env.obs_postproc,
                    reward_fn=env.tf_reward_fn(),
                    # Policy
                    n_forwards=self.n_forwards,
                    n_candidates=self.n_candidates,
                    use_cem=self.use_cem,
                    # Normalization
                    cem_init_mean_var=self.cem_init_mean_ph,
                    cem_init_var_var=self.cem_init_var_ph,
                    norm_obs_mean_var=self.norm_obs_mean_ph,
                    norm_obs_std_var=self.norm_obs_std_ph,
                    norm_act_mean_var=self.norm_act_mean_ph,
                    norm_act_std_var=self.norm_act_std_ph,
                    norm_delta_mean_var=self.norm_delta_mean_ph,
                    norm_delta_std_var=self.norm_delta_std_ph,
                    norm_cp_obs_mean_var=self.norm_cp_obs_mean_ph,
                    norm_cp_obs_std_var=self.norm_cp_obs_std_ph,
                    norm_cp_act_mean_var=self.norm_cp_act_mean_ph,
                    norm_cp_act_std_var=self.norm_cp_act_std_ph,
                    norm_back_delta_mean_var=None,
                    norm_back_delta_std_var=None,
                    # Others
                    discrete=self.discrete,
                    build_policy_graph=True,
                )

            # Backward dynamics model (predicts deltas from next obs back
            # toward the current one); only built when the backward loss is
            # actually used. The `##` marks lines that differ from the
            # forward model's construction.
            if self.back_coeff > 0.0:
                with tf.compat.v1.variable_scope('backward_model'):
                    back_mlp = PlusCaDMEnsembleCEMMLP(
                        name,
                        # Inputs
                        input_dim=0,
                        output_dim=obs_space_dims,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        input_obs_dim=obs_space_dims,
                        input_act_dim=action_space_dims,
                        input_obs_var=self.obs_next_ph,  ##
                        input_act_var=self.act_ph,
                        bs_input_obs_var=self.bs_obs_next_ph,  ##
                        bs_input_act_var=self.bs_act_ph,
                        # CaDM
                        context_obs_var=self.cp_obs_ph,
                        context_act_var=self.cp_act_ph,
                        cp_forward=None,  ##
                        bs_input_cp_var=self.bs_cp_var,
                        context_out_dim=self.context_out_dim,
                        # PE-TS
                        weight_decays=self.weight_decays,
                        deterministic=True,  ##
                        ensemble_size=self.ensemble_size,
                        n_particles=self.n_particles,
                        # Environments
                        obs_preproc_fn=env.obs_preproc,
                        obs_postproc_fn=env.obs_postproc,
                        reward_fn=env.tf_reward_fn(),
                        # Policy
                        n_forwards=self.n_forwards,
                        n_candidates=self.n_candidates,
                        use_cem=self.use_cem,
                        # Normalization
                        cem_init_mean_var=self.cem_init_mean_ph,
                        cem_init_var_var=self.cem_init_var_ph,
                        norm_obs_mean_var=self.norm_obs_mean_ph,
                        norm_obs_std_var=self.norm_obs_std_ph,
                        norm_act_mean_var=self.norm_act_mean_ph,
                        norm_act_std_var=self.norm_act_std_ph,
                        norm_delta_mean_var=None,  ##
                        norm_delta_std_var=None,  ##
                        norm_cp_obs_mean_var=self.norm_cp_obs_mean_ph,  ##
                        norm_cp_obs_std_var=self.norm_cp_obs_std_ph,  ##
                        norm_cp_act_mean_var=self.norm_cp_act_mean_ph,
                        norm_cp_act_std_var=self.norm_cp_act_std_ph,
                        norm_back_delta_mean_var=self.
                        norm_back_delta_mean_ph,  ##
                        norm_back_delta_std_var=self.
                        norm_back_delta_std_ph,  ##
                        # Others
                        discrete=self.discrete,
                        build_policy_graph=False,  ##
                    )

            self.params = tf.compat.v1.trainable_variables()
            self.delta_pred = mlp.output_var

            # 1. Forward Dynamics Prediction Loss
            # Outputs from Dynamics Model are normalized delta predictions
            mu, logvar = mlp.mu, mlp.logvar
            bs_normalized_delta = normalize(self.bs_delta_ph,
                                            self.norm_delta_mean_ph,
                                            self.norm_delta_std_ph)
            # Mean over batch and obs dims, summed over ensemble members.
            self.mse_loss = tf.reduce_sum(
                tf.reduce_mean(tf.reduce_mean(tf.square(mu -
                                                        bs_normalized_delta),
                                              axis=-1),
                               axis=-1))

            # 2. Backward Dynamics Prediction Loss
            if self.back_coeff > 0.0:
                back_mu = back_mlp.mu
                bs_normalized_back_delta = normalize(
                    self.bs_back_delta_ph, self.norm_back_delta_mean_ph,
                    self.norm_back_delta_std_ph)
                self.back_mse_loss = tf.reduce_sum(
                    tf.reduce_mean(tf.reduce_mean(
                        tf.square(back_mu - bs_normalized_back_delta),
                        axis=-1),
                                   axis=-1))

                self.back_l2_reg_loss = tf.reduce_sum(back_mlp.l2_regs)
            else:
                self.back_mse_loss = tf.constant(0.0)

            # 3. Weight Decay Regularization
            self.l2_reg_loss = tf.reduce_sum(mlp.l2_regs)
            self.context_l2_reg_loss = tf.reduce_sum(cp.l2_regs)

            l2_loss = self.l2_reg_loss + self.context_l2_reg_loss
            if self.back_coeff > 0.0:
                l2_loss += self.back_l2_reg_loss
            self.l2_loss = l2_loss

            if self.deterministic:
                recon_loss = self.mse_loss
                if self.back_coeff > 0.0:
                    recon_loss += self.back_coeff * self.back_mse_loss
                self.recon_loss = recon_loss
                self.loss = self.recon_loss + self.l2_loss * self.weight_decay_coeff
            else:
                # Gaussian NLL: squared error weighted by inverse variance,
                # plus the log-variance term.
                invvar = tf.exp(-logvar)
                self.mu_loss = tf.reduce_sum(
                    tf.reduce_mean(tf.reduce_mean(
                        tf.square(mu - bs_normalized_delta) * invvar, axis=-1),
                                   axis=-1))
                self.var_loss = tf.reduce_sum(
                    tf.reduce_mean(tf.reduce_mean(logvar, axis=-1), axis=-1))
                # Keep the learned log-variance bounds tight (PE-TS trick).
                self.reg_loss = 0.01 * tf.reduce_sum(
                    mlp.max_logvar) - 0.01 * tf.reduce_sum(mlp.min_logvar)

                recon_loss = self.mu_loss + self.var_loss
                if self.back_coeff > 0.0:
                    recon_loss += self.back_coeff * self.back_mse_loss
                self.recon_loss = recon_loss
                self.loss = self.recon_loss + self.reg_loss + self.l2_loss * self.weight_decay_coeff

            self.optimizer = optimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            # Compiled feed-forward functions for action selection.
            self._get_cem_action = tensor_utils.compile_function([
                self.obs_ph, self.cp_obs_ph, self.cp_act_ph,
                self.norm_obs_mean_ph, self.norm_obs_std_ph,
                self.norm_act_mean_ph, self.norm_act_std_ph,
                self.norm_delta_mean_ph, self.norm_delta_std_ph,
                self.norm_cp_obs_mean_ph, self.norm_cp_obs_std_ph,
                self.norm_cp_act_mean_ph, self.norm_cp_act_std_ph,
                self.cem_init_mean_ph, self.cem_init_var_ph
            ], mlp.optimal_action_var)

            self._get_rs_action = tensor_utils.compile_function([
                self.obs_ph,
                self.cp_obs_ph,
                self.cp_act_ph,
                self.norm_obs_mean_ph,
                self.norm_obs_std_ph,
                self.norm_act_mean_ph,
                self.norm_act_std_ph,
                self.norm_delta_mean_ph,
                self.norm_delta_std_ph,
                self.norm_cp_obs_mean_ph,
                self.norm_cp_obs_std_ph,
                self.norm_cp_act_mean_ph,
                self.norm_cp_act_std_ph,
            ], mlp.optimal_action_var)

            self._get_context_pred = tensor_utils.compile_function([
                self.bs_cp_obs_ph, self.bs_cp_act_ph, self.norm_cp_obs_mean_ph,
                self.norm_cp_obs_std_ph, self.norm_cp_act_mean_ph,
                self.norm_cp_act_std_ph
            ], self.bs_cp_var)  ## inference cp var
Ejemplo n.º 5
0
 def __init__(self, *args, **kwargs):
     """Serialize constructor args, initialize the base Layer, then build the graph."""
     # store the init args for serialization and call the super constructors
     # Order matters: quick_init must capture locals() before Layer.__init__
     # mutates state, and build_graph relies on Layer initialization.
     Serializable.quick_init(self, locals())
     Layer.__init__(self, *args, **kwargs)
     self.build_graph()
Ejemplo n.º 6
0
 def __setstate__(self, d):
     """Restore serialized state plus the running observation statistics."""
     Serializable.__setstate__(self, d)
     # Re-attach the extra entries added by the matching __getstate__.
     for attr in ("_obs_mean", "_obs_var"):
         setattr(self, attr, d[attr])
Ejemplo n.º 7
0
 def __getstate__(self):
     """Extend the serialized state with the running observation statistics."""
     state = Serializable.__getstate__(self)
     # These running stats are not captured by quick_init, so persist
     # them explicitly alongside the base state.
     state.update(_obs_mean=self._obs_mean, _obs_var=self._obs_var)
     return state
Ejemplo n.º 8
0
    def __init__(self,
                 name,
                 env,
                 hidden_sizes=(200, 200, 200, 200),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=None,
                 batch_size=128,
                 learning_rate=0.001,
                 normalize_input=True,
                 optimizer=tf.compat.v1.train.AdamOptimizer,
                 valid_split_ratio=0.2,
                 rolling_average_persitency=0.99,
                 n_forwards=30,
                 n_candidates=2500,
                 ensemble_size=5,
                 n_particles=20,
                 use_cem=False,
                 deterministic=False,
                 weight_decays=(0., 0., 0., 0., 0.),
                 weight_decay_coeff=0.0,
                 ):
        """Build a PE-TS style probabilistic ensemble dynamics model.

        Constructs the TF-1 graph: placeholders, an ensemble forward-dynamics
        MLP with a CEM/random-shooting planning head, the training loss and
        optimizer, and compiled functions for action selection and prediction.

        NOTE(review): depends on names defined elsewhere in the project
        (``self._activations``, ``PlusEnsembleCEMMLP``, ``normalize``,
        ``tensor_utils``) whose semantics cannot be confirmed here.
        """

        # Capture constructor arguments for serialization support.
        Serializable.quick_init(self, locals())

        # Default Attributes
        self.env = env
        self.name = name
        self._dataset = None

        # Dynamics Model Attributes
        self.deterministic = deterministic

        # MPC Attributes
        self.n_forwards = n_forwards
        self.n_candidates = n_candidates
        self.use_cem = use_cem

        # Training Attributes
        self.weight_decays = weight_decays
        self.weight_decay_coeff = weight_decay_coeff
        self.normalization = None
        self.normalize_input = normalize_input
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        # PE-TS Attributes
        self.ensemble_size = ensemble_size
        self.n_particles = n_particles

        # Dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.proc_obs_space_dims = proc_obs_space_dims = env.proc_observation_space_dims
        # Discrete action spaces have an empty shape; use `.n` instead.
        if len(env.action_space.shape) == 0:
            self.action_space_dims = action_space_dims = env.action_space.n
            self.discrete = True
        else:
            self.action_space_dims = action_space_dims = env.action_space.shape[0]
            self.discrete = False

        # Map the given keys/callables to activation functions.
        # NOTE(review): assumes `self._activations` contains entries for the
        # passed values (e.g. tf.nn.relu, None) — confirm against the class.
        hidden_nonlinearity = self._activations[hidden_nonlinearity]
        output_nonlinearity = self._activations[output_nonlinearity]

        with tf.compat.v1.variable_scope(name):
            # placeholders
            # `bs_*` placeholders carry one batch per ensemble member
            # (leading dimension = ensemble_size).
            self.obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_space_dims))
            self.act_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, action_space_dims))
            self.delta_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_space_dims))

            self.bs_obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(ensemble_size, None, obs_space_dims))
            self.bs_act_ph = tf.compat.v1.placeholder(tf.float32, shape=(ensemble_size, None, action_space_dims))
            self.bs_delta_ph = tf.compat.v1.placeholder(tf.float32, shape=(ensemble_size, None, obs_space_dims))

            # Normalization statistics, fed at train/inference time.
            self.norm_obs_mean_ph = tf.compat.v1.placeholder(tf.float32, shape=(proc_obs_space_dims,))
            self.norm_obs_std_ph = tf.compat.v1.placeholder(tf.float32, shape=(proc_obs_space_dims,))
            self.norm_act_mean_ph = tf.compat.v1.placeholder(tf.float32, shape=(action_space_dims,))
            self.norm_act_std_ph = tf.compat.v1.placeholder(tf.float32, shape=(action_space_dims,))
            self.norm_delta_mean_ph = tf.compat.v1.placeholder(tf.float32, shape=(obs_space_dims,))
            self.norm_delta_std_ph = tf.compat.v1.placeholder(tf.float32, shape=(obs_space_dims,))

            # CEM warm-start distribution over action sequences.
            self.cem_init_mean_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, self.n_forwards, action_space_dims))
            self.cem_init_var_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, self.n_forwards, action_space_dims))

            # create MLP
            with tf.compat.v1.variable_scope('ff_model'):
                mlp = PlusEnsembleCEMMLP(name,
                                        input_dim=0,
                                        output_dim=obs_space_dims,
                                        hidden_sizes=hidden_sizes,
                                        hidden_nonlinearity=hidden_nonlinearity,
                                        output_nonlinearity=output_nonlinearity,
                                        input_obs_dim=obs_space_dims,
                                        input_act_dim=action_space_dims,
                                        input_obs_var=self.obs_ph,
                                        input_act_var=self.act_ph,
                                        n_forwards=self.n_forwards,
                                        reward_fn=env.tf_reward_fn(),
                                        n_candidates=self.n_candidates,
                                        discrete=self.discrete,
                                        bs_input_obs_var=self.bs_obs_ph,
                                        bs_input_act_var=self.bs_act_ph,
                                        ensemble_size=self.ensemble_size,
                                        n_particles=self.n_particles,
                                        norm_obs_mean_var=self.norm_obs_mean_ph,
                                        norm_obs_std_var=self.norm_obs_std_ph,
                                        norm_act_mean_var=self.norm_act_mean_ph,
                                        norm_act_std_var=self.norm_act_std_ph,
                                        norm_delta_mean_var=self.norm_delta_mean_ph,
                                        norm_delta_std_var=self.norm_delta_std_ph,
                                        obs_preproc_fn=env.obs_preproc,
                                        obs_postproc_fn=env.obs_postproc,
                                        use_cem=self.use_cem,
                                        cem_init_mean_var=self.cem_init_mean_ph,
                                        cem_init_var_var=self.cem_init_var_ph,
                                        deterministic=self.deterministic,
                                        weight_decays=self.weight_decays,
                                        build_policy_graph=True,
                                        )

                self.params = tf.compat.v1.trainable_variables()
            self.delta_pred = mlp.output_var

            # Outputs from Dynamics Model are normalized delta predictions
            mu, logvar = mlp.mu, mlp.logvar
            bs_normalized_delta = normalize(self.bs_delta_ph, self.norm_delta_mean_ph, self.norm_delta_std_ph)

            # Mean over batch and obs dims, summed over ensemble members.
            self.mse_loss = tf.reduce_sum(
                tf.reduce_mean(tf.reduce_mean(tf.square(mu - bs_normalized_delta), axis=-1), axis=-1))
            self.l2_reg_loss = tf.reduce_sum(mlp.l2_regs)

            if self.deterministic:
                self.recon_loss = self.mse_loss
                self.loss = self.mse_loss + self.l2_reg_loss * self.weight_decay_coeff
            else:
                # Gaussian NLL: squared error weighted by inverse variance,
                # plus the log-variance term.
                invvar = tf.exp(-logvar)
                self.mu_loss = tf.reduce_sum(
                    tf.reduce_mean(tf.reduce_mean(tf.square(mu - bs_normalized_delta) * invvar, axis=-1), axis=-1))
                self.var_loss = tf.reduce_sum(
                    tf.reduce_mean(tf.reduce_mean(logvar, axis=-1), axis=-1))
                self.recon_loss = self.mu_loss + self.var_loss
                # Keep the learned log-variance bounds tight (PE-TS trick).
                self.reg_loss = 0.01 * tf.reduce_sum(mlp.max_logvar) - 0.01 * tf.reduce_sum(mlp.min_logvar)
                self.loss = self.recon_loss + self.reg_loss + self.l2_reg_loss * self.weight_decay_coeff

            self.optimizer = optimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            # Compiled feed-forward functions for action selection/prediction.
            self._get_cem_action = tensor_utils.compile_function([self.obs_ph,
                                                                  self.norm_obs_mean_ph, self.norm_obs_std_ph,
                                                                  self.norm_act_mean_ph, self.norm_act_std_ph,
                                                                  self.norm_delta_mean_ph, self.norm_delta_std_ph,
                                                                  self.cem_init_mean_ph, self.cem_init_var_ph],
                                                                  mlp.optimal_action_var)
            self._get_rs_action = tensor_utils.compile_function([self.obs_ph,
                                                                  self.norm_obs_mean_ph, self.norm_obs_std_ph,
                                                                  self.norm_act_mean_ph, self.norm_act_std_ph,
                                                                  self.norm_delta_mean_ph, self.norm_delta_std_ph],
                                                                  mlp.optimal_action_var)

            self._get_pred = tensor_utils.compile_function([self.bs_obs_ph, self.bs_act_ph,
                                                            self.norm_obs_mean_ph, self.norm_obs_std_ph,
                                                            self.norm_act_mean_ph, self.norm_act_std_ph,
                                                            self.norm_delta_mean_ph, self.norm_delta_std_ph],
                                                            [mlp.mu, mlp.logvar])
Ejemplo n.º 9
0
 def __init__(self, env):
     """Store the innermost environment, stripping any wrapper layers."""
     Serializable.quick_init(self, locals())
     # Walk down the wrapper chain until no `wrapped_env` attribute remains.
     innermost = env
     while hasattr(innermost, 'wrapped_env'):
         innermost = innermost.wrapped_env
     self.env = innermost