Example 1
 def distribution_info_sym(self, obs_var, act_var):
     means = []
     log_stds = []
     with tf.variable_scope(self.name, reuse=True):
         obs_var = tf.split(obs_var, self.num_models, axis=0)
         act_var = tf.split(act_var, self.num_models, axis=0)
         for i in range(self.num_models):
             with tf.variable_scope('model_{}'.format(i), reuse=True):
                 in_obs_var = (obs_var[i] - self._mean_obs_var[i]) / (
                     self._std_obs_var[i] + 1e-8)
                 in_act_var = (act_var[i] - self._mean_act_var[i]) / (
                     self._std_act_var[i] + 1e-8)
                 input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                 mlp = MLP(
                     self.name + '/model_{}'.format(i),
                     output_dim=self.obs_space_dims,
                     hidden_sizes=self.hidden_sizes,
                     hidden_nonlinearity=self.hidden_nonlinearity,
                     output_nonlinearity=self.output_nonlinearity,
                     input_var=input_var,
                     input_dim=self.obs_space_dims + self.action_space_dims,
                 )
                 mean = mlp.output_var * self._std_delta_var[
                     i] + self._mean_delta_var[i] + obs_var[i]
                 log_std = tf.tile(
                     tf.expand_dims(tf.log(self._std_delta_var[i]), axis=0),
                     [tf.shape(in_obs_var)[0], 1])
             means.append(mean)
             log_stds.append(log_std)
     mean = tf.concat(means, axis=0)
     log_std = tf.concat(log_stds, axis=0)
     return dict(mean=mean, log_std=log_std)
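The dict returned above parameterizes a diagonal Gaussian over the next observation. A minimal sketch of how the evaluated mean/log_std could be consumed downstream (the array values and shapes here are illustrative assumptions, not part of the snippet):

import numpy as np

# Hypothetical arrays obtained by evaluating dict(mean=..., log_std=...) in a session.
mean = np.zeros((4, 3), dtype=np.float32)          # (batch_size, obs_space_dims)
log_std = np.full((4, 3), -1.0, dtype=np.float32)

# Sample next observations from the diagonal Gaussian described by the dict.
noise = np.random.randn(*mean.shape).astype(np.float32)
next_obs_sample = mean + np.exp(log_std) * noise
print(next_obs_sample.shape)  # (4, 3)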
Example 2
 def distribution_info_sym(self, obs_var):
     with tf.variable_scope(self.name + '/value_function', reuse=True):
         input_var = (obs_var - self._mean_input_var)/(self._std_input_var + 1e-8)
         mlp = MLP(self.name,
                   output_dim=1,
                   hidden_sizes=self.hidden_sizes,
                   hidden_nonlinearity=self.hidden_nonlinearity,
                   output_nonlinearity=self.output_nonlinearity,
                   input_var=input_var,
                   input_dim=self.obs_space_dims)
         output_var = tf.reshape(mlp.output_var, shape=(-1,))
         output_var = output_var * self._std_output_var + self._mean_output_var
     return dict(mean=output_var)
Example 3
    def predict_batches_sym(self, obs_ph, act_ph):
        """
        Same batch fed into all models. Randomly output one of the predictions for each observation.
        :param obs_ph: (batch_size, obs_space_dims)
        :param act_ph: (batch_size, act_space_dims)
        :return: (batch_size, obs_space_dims)
        """
        original_obs = obs_ph
        # split the batch into one chunk per model
        obs_ph, act_ph = tf.split(obs_ph, self.num_models,
                                  axis=0), tf.split(act_ph,
                                                    self.num_models,
                                                    axis=0)

        delta_preds = []
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            for i in range(self.num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    assert self.normalize_input
                    in_obs_var = (obs_ph[i] - self._mean_obs_var[i]) / (
                        self._std_obs_var[i] + 1e-8)
                    in_act_var = (act_ph[i] - self._mean_act_var[i]) / (
                        self._std_act_var[i] + 1e-8)
                    input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                    mlp = MLP(
                        self.name + '/model_{}'.format(i),
                        output_dim=2 * self.obs_space_dims,
                        hidden_sizes=self.hidden_sizes,
                        hidden_nonlinearity=self.hidden_nonlinearity,
                        output_nonlinearity=self.output_nonlinearity,
                        input_var=input_var,
                        input_dim=self.obs_space_dims + self.action_space_dims,
                    )

                    mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                    logvar = self.max_logvar - tf.nn.softplus(self.max_logvar -
                                                              logvar)
                    logvar = self.min_logvar + tf.nn.softplus(logvar -
                                                              self.min_logvar)
                    delta_pred = mean + tf.random.normal(
                        shape=tf.shape(mean)) * tf.exp(logvar)
                    # denormalize
                    delta_pred = delta_pred * self._std_delta_var[
                        i] + self._mean_delta_var[i]
                    delta_preds.append(delta_pred)

        delta_preds = tf.concat(delta_preds, axis=0)
        # pred_obs = tf.clip_by_value(original_obs + delta_preds, -1e2, 1e2)
        pred_obs = original_obs + delta_preds

        return pred_obs
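The two tf.nn.softplus lines above softly clamp the predicted log-variance into (min_logvar, max_logvar) while staying differentiable, before the delta is sampled. A minimal numpy sketch of the same two-sided clamp, with bound values chosen only for illustration:

import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

max_logvar, min_logvar = 0.5, -10.0
raw_logvar = np.array([-20.0, -5.0, 0.0, 3.0])

# Same two-sided soft clamp as in the graph above.
logvar = max_logvar - softplus(max_logvar - raw_logvar)
logvar = min_logvar + softplus(logvar - min_logvar)
print(logvar)  # every value lies strictly inside (min_logvar, max_logvar)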
Example 4
    def predict_sym(self, obs_ph, act_ph):
        """
        Same batch fed into all models. Randomly output one of the predictions for each observation.
        :param obs_ph: (batch_size, obs_space_dims)
        :param act_ph: (batch_size, act_space_dims)
        :return: (batch_size, obs_space_dims)
        """
        original_obs = obs_ph

        # shuffle
        perm = tf.range(0, limit=tf.shape(obs_ph)[0], dtype=tf.int32)
        perm = tf.random.shuffle(perm)
        obs_ph, act_ph = tf.gather(obs_ph, perm), tf.gather(act_ph, perm)
        obs_ph, act_ph = tf.split(obs_ph, self.num_models,
                                  axis=0), tf.split(act_ph,
                                                    self.num_models,
                                                    axis=0)

        delta_preds = []
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            for i in range(self.num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    assert self.normalize_input
                    in_obs_var = (obs_ph[i] - self._mean_obs_var[i]) / (
                        self._std_obs_var[i] + 1e-8)
                    in_act_var = (act_ph[i] - self._mean_act_var[i]) / (
                        self._std_act_var[i] + 1e-8)
                    input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                    mlp = MLP(
                        self.name + '/model_{}'.format(i),
                        output_dim=self.obs_space_dims,
                        hidden_sizes=self.hidden_sizes,
                        hidden_nonlinearity=self.hidden_nonlinearity,
                        output_nonlinearity=self.output_nonlinearity,
                        input_var=input_var,
                        input_dim=self.obs_space_dims + self.action_space_dims,
                    )
                    # denormalize delta_pred
                    delta_pred = mlp.output_var * self._std_delta_var[
                        i] + self._mean_delta_var[i]
                    delta_preds.append(delta_pred)

        delta_preds = tf.concat(delta_preds, axis=0)

        # unshuffle
        perm_inv = tf.invert_permutation(perm)
        # next_obs = clip(obs + delta_pred)
        next_obs = original_obs + tf.gather(delta_preds, perm_inv)
        next_obs = tf.clip_by_value(next_obs, -1e2, 1e2)
        return next_obs
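predict_sym shuffles the batch before splitting it across the models and undoes the shuffle afterwards with tf.invert_permutation. A small numpy sketch of that round trip (the array contents are illustrative assumptions):

import numpy as np

batch = np.arange(12).reshape(6, 2)          # (batch_size, obs_space_dims)
perm = np.random.permutation(len(batch))     # analogous to tf.random.shuffle(perm)

shuffled = batch[perm]                       # analogous to tf.gather(obs_ph, perm)
perm_inv = np.argsort(perm)                  # analogous to tf.invert_permutation(perm)
restored = shuffled[perm_inv]

assert np.array_equal(restored, batch)       # unshuffling restores the original order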
Example 5
 def distribution_info_sym(self, obs_var, act_var):
     with tf.variable_scope(self.name, reuse=True):
         # st()
         in_obs_var = (obs_var - self._mean_obs_var) / (self._std_obs_var +
                                                        1e-8)
         in_act_var = (act_var - self._mean_act_var) / (self._std_act_var +
                                                        1e-8)
         input_var = tf.concat([in_obs_var, in_act_var], axis=1)
         mlp = MLP(
             self.name,
             output_dim=self.obs_space_dims,
             hidden_sizes=self.hidden_sizes,
             hidden_nonlinearity=self.hidden_nonlinearity,
             output_nonlinearity=self.output_nonlinearity,
             input_var=input_var,
             input_dim=self.obs_space_dims + self.action_space_dims,
         )
         mean = mlp.output_var * self._std_delta_var + self._mean_delta_var + obs_var
         log_std = tf.log(self._std_delta_var)
     return dict(mean=mean, log_std=log_std)
Example 6
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(500, 500),
        hidden_nonlinearity="tanh",
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=True,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        buffer_size=100000,
    ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.use_reward_model = False
        self.buffer_size = buffer_size
        self.name = name
        self.hidden_sizes = hidden_sizes

        self._dataset_train = None
        self._dataset_test = None
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency
        self.hidden_nonlinearity = hidden_nonlinearity = self._activations[
            hidden_nonlinearity]
        self.output_nonlinearity = output_nonlinearity = self._activations[
            output_nonlinearity]

        with tf.variable_scope(name):
            self.batch_size = batch_size
            self.learning_rate = learning_rate

            # determine dimensionality of state and action space
            self.obs_space_dims = env.observation_space.shape[0]
            self.action_space_dims = env.action_space.shape[0]

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, self.obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            mlp = MLP(name,
                      output_dim=self.obs_space_dims,
                      hidden_sizes=hidden_sizes,
                      hidden_nonlinearity=hidden_nonlinearity,
                      output_nonlinearity=output_nonlinearity,
                      input_var=self.nn_input,
                      input_dim=self.obs_space_dims + self.action_space_dims,
                      weight_normalization=weight_normalization)

            self.delta_pred = mlp.output_var

            # define loss and train_op
            self.loss = tf.reduce_mean(
                tf.linalg.norm(self.delta_ph - self.delta_pred, axis=-1))
            self.optimizer = optimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)

        self._networks = [mlp]
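The training loss above is the batch mean of the per-sample Euclidean norm of the delta prediction error (not a squared error). A numpy sketch of the same quantity with made-up arrays:

import numpy as np

delta_target = np.random.randn(500, 11).astype(np.float32)  # (batch_size, obs_space_dims)
delta_pred = np.random.randn(500, 11).astype(np.float32)

# Equivalent of tf.reduce_mean(tf.linalg.norm(delta_ph - delta_pred, axis=-1))
loss = np.mean(np.linalg.norm(delta_target - delta_pred, axis=-1))
print(loss)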
Example 7
    def predict_sym_all(self, obs_ph, act_ph, reg_str=None, pred_type='all'):
        """
        Same batch fed into all models. Randomly output one of the predictions for each observation.
        :param obs_ph: (batch_size, obs_space_dims)
        :param act_ph: (batch_size, act_space_dims)
        :return: (batch_size, obs_space_dims)
        """
        original_obs = obs_ph

        if pred_type == 'all':
            obs_ph = tf.split(obs_ph, self.num_models, axis=0)
            act_ph = tf.split(act_ph, self.num_models, axis=0)

        delta_preds = []
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            for i in range(self.num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    assert self.normalize_input
                    obs = obs_ph[i] if pred_type == 'all' else obs_ph
                    act = act_ph[i] if pred_type == 'all' else act_ph
                    in_obs_var = (obs - self._mean_obs_var[i]) / (
                        self._std_obs_var[i] + 1e-8)
                    in_act_var = (act - self._mean_act_var[i]) / (
                        self._std_act_var[i] + 1e-8)
                    input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                    mlp = MLP(
                        self.name + '/model_{}'.format(i),
                        output_dim=2 * self.obs_space_dims,
                        hidden_sizes=self.hidden_sizes,
                        hidden_nonlinearity=self.hidden_nonlinearity,
                        output_nonlinearity=self.output_nonlinearity,
                        input_var=input_var,
                        input_dim=self.obs_space_dims + self.action_space_dims,
                    )

                    mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                    logvar = self.max_logvar - tf.nn.softplus(self.max_logvar -
                                                              logvar)
                    logvar = self.min_logvar + tf.nn.softplus(logvar -
                                                              self.min_logvar)
                    delta_pred = mean + tf.random.normal(
                        shape=tf.shape(mean)) * tf.exp(logvar)
                    # denormalize
                    delta_pred = delta_pred * self._std_delta_var[
                        i] + self._mean_delta_var[i]
                    delta_preds.append(delta_pred)

        # delta_preds = [(batch_size_per_model, obs_dims)] * num_models
        reg = 0
        if pred_type == 'all':
            if reg_str == 'uncertainty':
                reg = tf.math.reduce_variance(tf.stack(delta_preds, axis=-1),
                                              axis=-1)
                reg = tf.reduce_sum(reg, axis=1)  # (batch_size_per_model,)
                assert len(reg.get_shape()) == 1

            delta_preds = tf.concat(
                delta_preds,
                axis=0)  # (batch_size_per_model*num_models, obs_dims)
            pred_obs = original_obs + delta_preds
        else:
            delta_preds = tf.stack(
                delta_preds, axis=-1)  # (batch_size, obs_dims, num_models)
            if reg_str == 'uncertainty':
                reg = tf.math.reduce_variance(delta_preds, axis=-1)
                reg = tf.reduce_sum(reg, axis=1)
                assert len(reg.get_shape()) == 1

            pred_obs = tf.expand_dims(original_obs, axis=-1) + delta_preds
            if pred_type == 'mean':
                pred_obs = tf.reduce_mean(pred_obs, axis=-1)
            elif pred_type == 'rand':
                idx = tf.random.uniform(shape=(tf.shape(pred_obs)[0], ),
                                        minval=0,
                                        maxval=self.num_models,
                                        dtype=tf.int32)
                pred_obs = tf.batch_gather(tf.transpose(pred_obs, (0, 2, 1)),
                                           tf.reshape(idx, [-1, 1]))
                pred_obs = tf.squeeze(pred_obs, axis=1)
            else:
                raise NotImplementedError

        return pred_obs, reg
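With reg_str='uncertainty' the regularizer is the variance of the ensemble's delta predictions across models, summed over observation dimensions. A numpy sketch of that reduction (the shapes are illustrative assumptions):

import numpy as np

num_models, batch_size, obs_dims = 5, 8, 11
# Hypothetical per-model delta predictions stacked along the last axis.
delta_preds = np.random.randn(batch_size, obs_dims, num_models)

# Equivalent of tf.reduce_sum(tf.math.reduce_variance(delta_preds, axis=-1), axis=1)
reg = np.var(delta_preds, axis=-1).sum(axis=1)  # shape: (batch_size,)
print(reg.shape)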
Example 8
    def __init__(
        self,
        name,
        env,
        num_models=5,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        early_stopping=0,
        buffer_size=50000,
    ):

        Serializable.quick_init(self, locals())

        max_logvar = .5
        min_logvar = -10

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
        self.buffer_size_test = int(buffer_size * valid_split_ratio)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_models = num_models
        self.name = name
        self.hidden_sizes = hidden_sizes
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]
        self.timesteps_counter = 0
        self.used_timesteps_counter = 0

        hidden_nonlinearity = self._activations[hidden_nonlinearity]
        output_nonlinearity = self._activations[output_nonlinearity]
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity
        self.early_stopping = early_stopping
        """ computation graph for training and simple inference """
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            self._create_stats_vars()

            self.max_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          max_logvar,
                                          dtype=tf.float32,
                                          trainable=True,
                                          name="max_logvar")
            self.min_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          min_logvar,
                                          dtype=tf.float32,
                                          trainable=True,
                                          name="min_logvar")
            self._create_assign_ph()

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

            # create MLP
            mlps = []
            delta_preds = []
            var_preds = []
            logvar_preds = []
            invar_preds = []
            self.obs_next_pred = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i)):
                    mlp = MLP(
                        name + '/model_{}'.format(i),
                        output_dim=2 * obs_space_dims,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        input_var=obs_ph[i],
                        input_dim=obs_space_dims +
                        action_space_dims,  # FIXME: input weight_normalization?
                    )
                    mlps.append(mlp)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar -
                                                          logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar -
                                                          self.min_logvar)
                var = tf.exp(logvar)
                inv_var = tf.exp(-logvar)

                delta_preds.append(mean)
                logvar_preds.append(logvar)
                var_preds.append(var)
                invar_preds.append(inv_var)

            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)
            self.logvar_pred = tf.stack(
                logvar_preds,
                axis=2)  # shape: (batch_size, ndim_obs, n_models)
            self.var_pred = tf.stack(
                var_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)
            self.invar_pred = tf.stack(
                invar_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            self.loss = tf.reduce_mean(
                tf.square(self.delta_ph[:, :, None] - self.delta_pred) *
                self.invar_pred + self.logvar_pred)
            self.loss += 0.01 * tf.reduce_mean(
                self.max_logvar) - 0.01 * tf.reduce_mean(self.min_logvar)
            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
            self.f_var_pred = compile_function([self.obs_ph, self.act_ph],
                                               self.var_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # placeholders
            self.obs_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))
            self.act_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, action_space_dims))
            self.delta_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))

            # split stack into the batches for each model --> assume each model receives a batch of the same size
            self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.delta_model_batches = tf.split(
                self.delta_model_batches_stack_ph, self.num_models, axis=0)

            # reuse previously created MLP but each model receives its own batch
            delta_preds = []
            var_preds = []
            self.obs_next_pred = []
            self.loss_model_batches = []
            self.train_op_model_batches = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    # concatenate action and observation --> NN input
                    nn_input = tf.concat(
                        [self.obs_model_batches[i], self.act_model_batches[i]],
                        axis=1)
                    mlp = MLP(name + '/model_{}'.format(i),
                              output_dim=2 * obs_space_dims,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              input_var=nn_input,
                              input_dim=obs_space_dims + action_space_dims,
                              weight_normalization=weight_normalization)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar -
                                                          logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar -
                                                          self.min_logvar)
                var = tf.exp(logvar)
                inv_var = tf.exp(-logvar)

                loss = tf.reduce_mean(
                    tf.square(self.delta_model_batches[i] - mean) * inv_var +
                    logvar)
                loss += (0.01 * tf.reduce_mean(self.max_logvar) -
                         0.01 * tf.reduce_mean(self.min_logvar))

                delta_preds.append(mean)
                var_preds.append(var)
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))

            self.delta_pred_model_batches_stack = tf.concat(
                delta_preds,
                axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)
            self.var_pred_model_batches_stack = tf.concat(var_preds, axis=0)

            # tensor_utils
            self.f_delta_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.delta_pred_model_batches_stack)

            self.f_var_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.var_pred_model_batches_stack)

        self._networks = mlps
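Up to a factor of 1/2 and an additive constant, square(error) * inv_var + logvar is the negative log-likelihood of the target delta under the predicted diagonal Gaussian; the 0.01 * reduce_mean(max_logvar) - 0.01 * reduce_mean(min_logvar) terms additionally penalize widening of the learned log-variance bounds. A small numpy check of that correspondence (array contents are illustrative assumptions):

import numpy as np

err = np.random.randn(500, 11)                         # delta target minus predicted mean
logvar = np.random.uniform(-5.0, 0.5, size=err.shape)
var = np.exp(logvar)

loss_term = np.mean(err ** 2 / var + logvar)

# Per-element Gaussian negative log-likelihood of the error under N(0, var).
nll = 0.5 * (np.log(2 * np.pi) + logvar + err ** 2 / var)
assert np.allclose(loss_term, np.mean(2 * nll - np.log(2 * np.pi)))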
Example 9
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        buffer_size=50000,
    ):

        Serializable.quick_init(self, locals())

        max_logvar = .0
        min_logvar = -10

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]

        self.hidden_nonlinearity = self._activations[hidden_nonlinearity]
        self.output_nonlinearity = self._activations[output_nonlinearity]
        self.hidden_sizes = hidden_sizes
        """ computation graph for training and simple inference """
        with tf.variable_scope(name):
            self.max_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          max_logvar,
                                          dtype=tf.float32,
                                          name="max_logvar")
            self.min_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          min_logvar,
                                          dtype=tf.float32,
                                          name="min_logvar")

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            delta_preds = []
            var_preds = []
            self.obs_next_pred = []
            with tf.variable_scope('dynamics_model'):
                mlp = MLP(
                    name,
                    output_dim=2 * obs_space_dims,
                    hidden_sizes=self.hidden_sizes,
                    hidden_nonlinearity=self.hidden_nonlinearity,
                    output_nonlinearity=self.output_nonlinearity,
                    input_var=self.nn_input,
                    input_dim=obs_space_dims + action_space_dims,
                )

            mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
            logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
            logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)
            var = tf.exp(logvar)

            self.delta_pred = mean
            self.var_pred = var

            # define loss and train_op
            self.loss = tf.reduce_mean((self.delta_ph - self.delta_pred)**2 /
                                       self.var_pred + tf.log(self.var_pred))
            self.loss += 0.01 * tf.reduce_mean(
                self.max_logvar) - 0.01 * tf.reduce_mean(self.min_logvar)
            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
            self.f_var_pred = compile_function([self.obs_ph, self.act_ph],
                                               self.var_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        self._networks = [mlp]
Example 10
    def __init__(
        self,
        name,
        env,
        num_models=5,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,  # 0.1
        rolling_average_persitency=0.99,
        buffer_size=50000,
        loss_str='MSE',
    ):

        Serializable.quick_init(self, locals())

        max_logvar = 1
        min_logvar = 0.1

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
        self.buffer_size_test = int(buffer_size * valid_split_ratio)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_models = num_models
        self.hidden_sizes = hidden_sizes
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]
        self.timesteps_counter = 0
        self.used_timesteps_counter = 0

        self.hidden_nonlinearity = hidden_nonlinearity = self._activations[
            hidden_nonlinearity]
        self.output_nonlinearity = output_nonlinearity = self._activations[
            output_nonlinearity]
        """ computation graph for training and simple inference """
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

            # create MLP
            mlps = []
            delta_preds = []
            self.obs_next_pred = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    mlp = MLP(
                        name + '/model_{}'.format(i),
                        output_dim=obs_space_dims,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        input_var=obs_ph[i],
                        input_dim=obs_space_dims + action_space_dims,
                    )
                    mlps.append(mlp)

                delta_preds.append(mlp.output_var)

            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            if loss_str == 'L2':
                self.loss = tf.reduce_mean(
                    tf.linalg.norm(self.delta_ph[:, :, None] - self.delta_pred,
                                   axis=1))
            elif loss_str == 'MSE':
                self.loss = tf.reduce_mean(
                    (self.delta_ph[:, :, None] - self.delta_pred)**2)
            else:
                raise NotImplementedError

            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        with tf.variable_scope(name, reuse=True):
            # placeholders
            self.obs_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))
            self.act_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, action_space_dims))
            self.delta_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))

            # split stack into the batches for each model --> assume each model receives a batch of the same size
            self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.delta_model_batches = tf.split(
                self.delta_model_batches_stack_ph, self.num_models, axis=0)

            # reuse previously created MLP but each model receives its own batch
            delta_preds = []
            self.obs_next_pred = []
            self.loss_model_batches = []
            self.train_op_model_batches = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    # concatenate action and observation --> NN input
                    nn_input = tf.concat(
                        [self.obs_model_batches[i], self.act_model_batches[i]],
                        axis=1)
                    mlp = MLP(name + '/model_{}'.format(i),
                              output_dim=obs_space_dims,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              input_var=nn_input,
                              input_dim=obs_space_dims + action_space_dims,
                              weight_normalization=weight_normalization)

                delta_preds.append(mlp.output_var)
                if loss_str == 'L2':
                    loss = tf.reduce_mean(
                        tf.linalg.norm(self.delta_model_batches[i] -
                                       mlp.output_var,
                                       axis=1))
                elif loss_str == 'MSE':
                    loss = tf.reduce_mean(
                        (self.delta_model_batches[i] - mlp.output_var)**2)
                else:
                    raise NotImplementedError
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))
            self.delta_pred_model_batches_stack = tf.concat(
                delta_preds,
                axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

            # tensor_utils
            self.f_delta_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.delta_pred_model_batches_stack)

        self._networks = mlps
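The per-model graph expects the observations and actions of all models stacked along the batch axis, in equal slices, so that tf.split can hand each model its own chunk. A sketch of how such a stack could be assembled before calling f_delta_pred_model_batches (the names outside the snippet are assumptions):

import numpy as np

num_models, batch_size_per_model, obs_dims, act_dims = 5, 64, 11, 3

# Hypothetical per-model batches, one entry per ensemble member.
obs_batches = [np.random.randn(batch_size_per_model, obs_dims) for _ in range(num_models)]
act_batches = [np.random.randn(batch_size_per_model, act_dims) for _ in range(num_models)]

# Stack along axis 0 so tf.split(..., num_models, axis=0) recovers each model's slice.
obs_stack = np.concatenate(obs_batches, axis=0)  # (batch_size_per_model * num_models, obs_dims)
act_stack = np.concatenate(act_batches, axis=0)

# deltas = model.f_delta_pred_model_batches(obs_stack, act_stack)  # hypothetical call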