def distribution_info_sym(self, obs_var, act_var):
    means = []
    log_stds = []
    with tf.variable_scope(self.name, reuse=True):
        obs_var = tf.split(obs_var, self.num_models, axis=0)
        act_var = tf.split(act_var, self.num_models, axis=0)
        for i in range(self.num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                in_obs_var = (obs_var[i] - self._mean_obs_var[i]) / (self._std_obs_var[i] + 1e-8)
                in_act_var = (act_var[i] - self._mean_act_var[i]) / (self._std_act_var[i] + 1e-8)
                input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                mlp = MLP(self.name + '/model_{}'.format(i),
                          output_dim=self.obs_space_dims,
                          hidden_sizes=self.hidden_sizes,
                          hidden_nonlinearity=self.hidden_nonlinearity,
                          output_nonlinearity=self.output_nonlinearity,
                          input_var=input_var,
                          input_dim=self.obs_space_dims + self.action_space_dims)

                mean = mlp.output_var * self._std_delta_var[i] + self._mean_delta_var[i] + obs_var[i]
                log_std = tf.tile(tf.expand_dims(tf.log(self._std_delta_var[i]), axis=0),
                                  [tf.shape(in_obs_var)[0], 1])
                means.append(mean)
                log_stds.append(log_std)

    mean = tf.concat(means, axis=0)
    log_std = tf.concat(log_stds, axis=0)
    return dict(mean=mean, log_std=log_std)
def distribution_info_sym(self, obs_var):
    with tf.variable_scope(self.name + '/value_function', reuse=True):
        input_var = (obs_var - self._mean_input_var) / (self._std_input_var + 1e-8)
        mlp = MLP(self.name,
                  output_dim=1,
                  hidden_sizes=self.hidden_sizes,
                  hidden_nonlinearity=self.hidden_nonlinearity,
                  output_nonlinearity=self.output_nonlinearity,
                  input_var=input_var,
                  input_dim=self.obs_space_dims)
        output_var = tf.reshape(mlp.output_var, shape=(-1,))
        output_var = output_var * self._std_output_var + self._mean_output_var
    return dict(mean=output_var)
def predict_batches_sym(self, obs_ph, act_ph):
    """
    Splits the batch into num_models chunks along axis 0 and feeds each chunk to the
    corresponding model; a next-state prediction is sampled from each model's output
    distribution and the chunks are concatenated back in order.

    :param obs_ph: (batch_size, obs_space_dims)
    :param act_ph: (batch_size, act_space_dims)
    :return: (batch_size, obs_space_dims)
    """
    original_obs = obs_ph

    # split the batch across the models
    obs_ph = tf.split(obs_ph, self.num_models, axis=0)
    act_ph = tf.split(act_ph, self.num_models, axis=0)

    delta_preds = []
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        for i in range(self.num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                assert self.normalize_input
                in_obs_var = (obs_ph[i] - self._mean_obs_var[i]) / (self._std_obs_var[i] + 1e-8)
                in_act_var = (act_ph[i] - self._mean_act_var[i]) / (self._std_act_var[i] + 1e-8)
                input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                mlp = MLP(self.name + '/model_{}'.format(i),
                          output_dim=2 * self.obs_space_dims,
                          hidden_sizes=self.hidden_sizes,
                          hidden_nonlinearity=self.hidden_nonlinearity,
                          output_nonlinearity=self.output_nonlinearity,
                          input_var=input_var,
                          input_dim=self.obs_space_dims + self.action_space_dims)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)

                # sample from N(mean, exp(logvar)); std = exp(logvar / 2)
                delta_pred = mean + tf.random.normal(shape=tf.shape(mean)) * tf.exp(logvar / 2.)

                # denormalize
                delta_pred = delta_pred * self._std_delta_var[i] + self._mean_delta_var[i]
                delta_preds.append(delta_pred)

    delta_preds = tf.concat(delta_preds, axis=0)
    # pred_obs = tf.clip_by_value(original_obs + delta_preds, -1e2, 1e2)
    pred_obs = original_obs + delta_preds
    return pred_obs
def predict_sym(self, obs_ph, act_ph):
    """
    Randomly shuffles the batch and splits it across the models, so that each
    observation is predicted by one randomly assigned model; the predictions are
    un-shuffled back to the original ordering before being returned.

    :param obs_ph: (batch_size, obs_space_dims)
    :param act_ph: (batch_size, act_space_dims)
    :return: (batch_size, obs_space_dims)
    """
    original_obs = obs_ph

    # shuffle
    perm = tf.range(0, limit=tf.shape(obs_ph)[0], dtype=tf.int32)
    perm = tf.random.shuffle(perm)
    obs_ph, act_ph = tf.gather(obs_ph, perm), tf.gather(act_ph, perm)
    obs_ph = tf.split(obs_ph, self.num_models, axis=0)
    act_ph = tf.split(act_ph, self.num_models, axis=0)

    delta_preds = []
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        for i in range(self.num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                assert self.normalize_input
                in_obs_var = (obs_ph[i] - self._mean_obs_var[i]) / (self._std_obs_var[i] + 1e-8)
                in_act_var = (act_ph[i] - self._mean_act_var[i]) / (self._std_act_var[i] + 1e-8)
                input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                mlp = MLP(self.name + '/model_{}'.format(i),
                          output_dim=self.obs_space_dims,
                          hidden_sizes=self.hidden_sizes,
                          hidden_nonlinearity=self.hidden_nonlinearity,
                          output_nonlinearity=self.output_nonlinearity,
                          input_var=input_var,
                          input_dim=self.obs_space_dims + self.action_space_dims)

                # denormalize delta_pred
                delta_pred = mlp.output_var * self._std_delta_var[i] + self._mean_delta_var[i]
                delta_preds.append(delta_pred)

    delta_preds = tf.concat(delta_preds, axis=0)

    # unshuffle
    perm_inv = tf.invert_permutation(perm)

    # next_obs = clip(obs + delta_pred)
    next_obs = original_obs + tf.gather(delta_preds, perm_inv)
    next_obs = tf.clip_by_value(next_obs, -1e2, 1e2)
    return next_obs
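# --- Illustrative sketch (not part of the original class) ---
# A minimal, self-contained NumPy analogue of the shuffle / split / un-shuffle
# bookkeeping in predict_sym above: a random permutation assigns each observation
# to one ensemble member, and the inverse permutation restores the original
# ordering of the predictions. All names here are illustrative.
import numpy as np

def _shuffle_split_unshuffle_demo(batch_size=6, num_models=3):
    obs = np.arange(batch_size)                 # stand-in for a batch of observations
    perm = np.random.permutation(batch_size)    # tf.random.shuffle(perm)
    shuffled = obs[perm]                        # tf.gather(obs_ph, perm)
    chunks = np.split(shuffled, num_models)     # tf.split(obs_ph, num_models, axis=0)

    # each chunk would be fed to one model; here we simply concatenate them back
    preds = np.concatenate(chunks, axis=0)

    perm_inv = np.argsort(perm)                 # tf.invert_permutation(perm)
    restored = preds[perm_inv]                  # tf.gather(delta_preds, perm_inv)
    assert np.array_equal(restored, obs)        # original ordering is recovered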
def distribution_info_sym(self, obs_var, act_var):
    with tf.variable_scope(self.name, reuse=True):
        in_obs_var = (obs_var - self._mean_obs_var) / (self._std_obs_var + 1e-8)
        in_act_var = (act_var - self._mean_act_var) / (self._std_act_var + 1e-8)
        input_var = tf.concat([in_obs_var, in_act_var], axis=1)
        mlp = MLP(self.name,
                  output_dim=self.obs_space_dims,
                  hidden_sizes=self.hidden_sizes,
                  hidden_nonlinearity=self.hidden_nonlinearity,
                  output_nonlinearity=self.output_nonlinearity,
                  input_var=input_var,
                  input_dim=self.obs_space_dims + self.action_space_dims)

        mean = mlp.output_var * self._std_delta_var + self._mean_delta_var + obs_var
        log_std = tf.log(self._std_delta_var)
    return dict(mean=mean, log_std=log_std)
def __init__(self,
             name,
             env,
             hidden_sizes=(500, 500),
             hidden_nonlinearity="tanh",
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             weight_normalization=True,
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,
             rolling_average_persitency=0.99,
             buffer_size=100000):
    Serializable.quick_init(self, locals())

    self.normalization = None
    self.normalize_input = normalize_input
    self.use_reward_model = False
    self.buffer_size = buffer_size
    self.name = name
    self.hidden_sizes = hidden_sizes
    self._dataset_train = None
    self._dataset_test = None
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.hidden_nonlinearity = hidden_nonlinearity = self._activations[hidden_nonlinearity]
    self.output_nonlinearity = output_nonlinearity = self._activations[output_nonlinearity]

    with tf.variable_scope(name):
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        # determine dimensionality of state and action space
        self.obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = env.action_space.shape[0]

        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, self.obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, self.action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, self.obs_space_dims))

        self._create_stats_vars()

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        mlp = MLP(name,
                  output_dim=self.obs_space_dims,
                  hidden_sizes=hidden_sizes,
                  hidden_nonlinearity=hidden_nonlinearity,
                  output_nonlinearity=output_nonlinearity,
                  input_var=self.nn_input,
                  input_dim=self.obs_space_dims + self.action_space_dims,
                  weight_normalization=weight_normalization)

        self.delta_pred = mlp.output_var

        # define loss and train_op
        self.loss = tf.reduce_mean(tf.linalg.norm(self.delta_ph - self.delta_pred, axis=-1))
        self.optimizer = optimizer(self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = compile_function([self.obs_ph, self.act_ph], self.delta_pred)

    self._networks = [mlp]
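# --- Usage sketch (hedged, not part of the original class) ---
# How a dynamics model built by the constructor above might be queried once
# trained. `model` is assumed to be an instance of this class and `env` the gym
# environment it was constructed with; both names are illustrative.
def _predict_next_obs_demo(model, env):
    obs = env.reset()[None, :]                # (1, obs_space_dims)
    act = env.action_space.sample()[None, :]  # (1, action_space_dims)
    delta = model.f_delta_pred(obs, act)      # predicted change in state
    return obs + delta                        # predicted next observation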
def predict_sym_all(self, obs_ph, act_ph, reg_str=None, pred_type='all'):
    """
    Symbolic next-state prediction with an aggregation mode selected by pred_type:
      'all'  -- the batch is split across the models (one chunk per model)
      'mean' -- every model sees the full batch and the predictions are averaged
      'rand' -- every model sees the full batch and one model is sampled per observation

    :param obs_ph: (batch_size, obs_space_dims)
    :param act_ph: (batch_size, act_space_dims)
    :return: predicted next observations and the regularization term
    """
    original_obs = obs_ph
    if pred_type == 'all':
        obs_ph = tf.split(obs_ph, self.num_models, axis=0)
        act_ph = tf.split(act_ph, self.num_models, axis=0)

    delta_preds = []
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        for i in range(self.num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                assert self.normalize_input
                obs = obs_ph[i] if pred_type == 'all' else obs_ph
                act = act_ph[i] if pred_type == 'all' else act_ph
                in_obs_var = (obs - self._mean_obs_var[i]) / (self._std_obs_var[i] + 1e-8)
                in_act_var = (act - self._mean_act_var[i]) / (self._std_act_var[i] + 1e-8)
                input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                mlp = MLP(self.name + '/model_{}'.format(i),
                          output_dim=2 * self.obs_space_dims,
                          hidden_sizes=self.hidden_sizes,
                          hidden_nonlinearity=self.hidden_nonlinearity,
                          output_nonlinearity=self.output_nonlinearity,
                          input_var=input_var,
                          input_dim=self.obs_space_dims + self.action_space_dims)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)

                # sample from N(mean, exp(logvar)); std = exp(logvar / 2)
                delta_pred = mean + tf.random.normal(shape=tf.shape(mean)) * tf.exp(logvar / 2.)

                # denormalize
                delta_pred = delta_pred * self._std_delta_var[i] + self._mean_delta_var[i]
                delta_preds.append(delta_pred)

    # delta_preds = [(batch_size_per_model, obs_dims)] * num_models
    reg = 0
    if pred_type == 'all':
        if reg_str == 'uncertainty':
            reg = tf.math.reduce_variance(tf.stack(delta_preds, axis=-1), axis=-1)
            reg = tf.reduce_sum(reg, axis=1)  # (batch_size_per_model,)
            assert len(reg.get_shape()) == 1
        delta_preds = tf.concat(delta_preds, axis=0)  # (batch_size_per_model*num_models, obs_dims)
        pred_obs = original_obs + delta_preds
    else:
        delta_preds = tf.stack(delta_preds, axis=-1)  # (batch_size, obs_dims, num_models)
        if reg_str == 'uncertainty':
            reg = tf.math.reduce_variance(delta_preds, axis=-1)
            reg = tf.reduce_sum(reg, axis=1)
            assert len(reg.get_shape()) == 1
        pred_obs = tf.expand_dims(original_obs, axis=-1) + delta_preds
        if pred_type == 'mean':
            pred_obs = tf.reduce_mean(pred_obs, axis=-1)
        elif pred_type == 'rand':
            idx = tf.random.uniform(shape=(tf.shape(pred_obs)[0],),
                                    minval=0, maxval=self.num_models, dtype=tf.int32)
            pred_obs = tf.batch_gather(tf.transpose(pred_obs, (0, 2, 1)),
                                       tf.reshape(idx, [-1, 1]))
            pred_obs = tf.squeeze(pred_obs, axis=1)
        else:
            raise NotImplementedError

    return pred_obs, reg
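# --- Usage sketch (hedged, not part of the original class) ---
# The three pred_type modes of predict_sym_all, assuming `ensemble` is an instance
# of this class; placeholder names below are illustrative only.
def _predict_sym_all_demo(ensemble):
    obs_ph = tf.placeholder(tf.float32, shape=(None, ensemble.obs_space_dims))
    act_ph = tf.placeholder(tf.float32, shape=(None, ensemble.action_space_dims))

    # 'all': the batch is split across the models, output (batch_size, obs_dims)
    next_all, reg = ensemble.predict_sym_all(obs_ph, act_ph,
                                             reg_str='uncertainty', pred_type='all')
    # 'mean': every model sees the full batch; predictions are averaged
    next_mean, _ = ensemble.predict_sym_all(obs_ph, act_ph, pred_type='mean')
    # 'rand': every model sees the full batch; one model is sampled per observation
    next_rand, _ = ensemble.predict_sym_all(obs_ph, act_ph, pred_type='rand')
    return next_all, next_mean, next_rand, reg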
def __init__(self,
             name,
             env,
             num_models=5,
             hidden_sizes=(512, 512),
             hidden_nonlinearity='swish',
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             weight_normalization=False,  # Doesn't work
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,
             rolling_average_persitency=0.99,
             early_stopping=0,
             buffer_size=50000):
    Serializable.quick_init(self, locals())

    max_logvar = .5
    min_logvar = -10

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
    self.buffer_size_test = int(buffer_size * valid_split_ratio)
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.num_models = num_models
    self.name = name
    self.hidden_sizes = hidden_sizes
    self._dataset_train = None
    self._dataset_test = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]
    self.timesteps_counter = 0
    self.used_timesteps_counter = 0

    hidden_nonlinearity = self._activations[hidden_nonlinearity]
    output_nonlinearity = self._activations[output_nonlinearity]
    self.hidden_nonlinearity = hidden_nonlinearity
    self.output_nonlinearity = output_nonlinearity
    self.early_stopping = early_stopping

    """ computation graph for training and simple inference """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        self._create_stats_vars()

        self.max_logvar = tf.Variable(np.ones([1, obs_space_dims]) * max_logvar,
                                      dtype=tf.float32, trainable=True, name="max_logvar")
        self.min_logvar = tf.Variable(np.ones([1, obs_space_dims]) * min_logvar,
                                      dtype=tf.float32, trainable=True, name="min_logvar")
        self._create_assign_ph()

        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)
        obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

        # create MLPs
        mlps = []
        delta_preds = []
        var_preds = []
        logvar_preds = []
        invar_preds = []
        self.obs_next_pred = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i)):
                mlp = MLP(name + '/model_{}'.format(i),
                          output_dim=2 * obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=obs_ph[i],
                          input_dim=obs_space_dims + action_space_dims,
                          # FIXME: input weight_normalization?
                          )
                mlps.append(mlp)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)
                var = tf.exp(logvar)
                inv_var = tf.exp(-logvar)

                delta_preds.append(mean)
                logvar_preds.append(logvar)
                var_preds.append(var)
                invar_preds.append(inv_var)

        self.delta_pred = tf.stack(delta_preds, axis=2)    # shape: (batch_size, ndim_obs, n_models)
        self.logvar_pred = tf.stack(logvar_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)
        self.var_pred = tf.stack(var_preds, axis=2)        # shape: (batch_size, ndim_obs, n_models)
        self.invar_pred = tf.stack(invar_preds, axis=2)    # shape: (batch_size, ndim_obs, n_models)

        # define loss and train_op
        self.loss = tf.reduce_mean(
            tf.square(self.delta_ph[:, :, None] - self.delta_pred) * self.invar_pred
            + self.logvar_pred)
        self.loss += 0.01 * tf.reduce_mean(self.max_logvar) - 0.01 * tf.reduce_mean(self.min_logvar)
        self.optimizer = optimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = compile_function([self.obs_ph, self.act_ph], self.delta_pred)
        self.f_var_pred = compile_function([self.obs_ph, self.act_ph], self.var_pred)

    """ computation graph for inference where each of the models receives a different batch """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        # placeholders
        self.obs_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # split stack into the batches for each model --> assume each model receives a batch of the same size
        self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph, self.num_models, axis=0)
        self.act_model_batches = tf.split(self.act_model_batches_stack_ph, self.num_models, axis=0)
        self.delta_model_batches = tf.split(self.delta_model_batches_stack_ph, self.num_models, axis=0)

        # reuse previously created MLP but each model receives its own batch
        delta_preds = []
        var_preds = []
        self.obs_next_pred = []
        self.loss_model_batches = []
        self.train_op_model_batches = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                # concatenate action and observation --> NN input
                nn_input = tf.concat([self.obs_model_batches[i], self.act_model_batches[i]], axis=1)
                mlp = MLP(name + '/model_{}'.format(i),
                          output_dim=2 * obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=nn_input,
                          input_dim=obs_space_dims + action_space_dims,
                          weight_normalization=weight_normalization)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)
                var = tf.exp(logvar)
                inv_var = tf.exp(-logvar)

                loss = tf.reduce_mean(
                    tf.square(self.delta_model_batches[i] - mean) * inv_var + logvar)
                loss += (0.01 * tf.reduce_mean(self.max_logvar)
                         - 0.01 * tf.reduce_mean(self.min_logvar))

                delta_preds.append(mean)
                var_preds.append(var)
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))

        self.delta_pred_model_batches_stack = tf.concat(
            delta_preds, axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)
        self.var_pred_model_batches_stack = tf.concat(var_preds, axis=0)

        # tensor_utils
        self.f_delta_pred_model_batches = compile_function(
            [self.obs_model_batches_stack_ph, self.act_model_batches_stack_ph],
            self.delta_pred_model_batches_stack)
        self.f_var_pred_model_batches = compile_function(
            [self.obs_model_batches_stack_ph, self.act_model_batches_stack_ph],
            self.var_pred_model_batches_stack)

    self._networks = mlps
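# --- Numerical sketch (hedged, not part of the original class) ---
# Self-contained NumPy illustration of the double-softplus trick used above:
# logvar is squashed into (min_logvar, max_logvar) without a hard clip, so
# gradients keep flowing into max_logvar / min_logvar. Values are illustrative.
import numpy as np

def _bound_logvar_demo(logvar, max_logvar=0.5, min_logvar=-10.0):
    softplus = lambda x: np.log1p(np.exp(x))
    logvar = max_logvar - softplus(max_logvar - logvar)  # soft upper bound
    logvar = min_logvar + softplus(logvar - min_logvar)  # soft lower bound
    return logvar

# _bound_logvar_demo(np.array([-100., -5., 0., 100.]))
# -> approx. [-10., -5., -0.47, 0.5], i.e. strictly inside (min_logvar, max_logvar)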
def __init__(self,
             name,
             env,
             hidden_sizes=(512, 512),
             hidden_nonlinearity='swish',
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             weight_normalization=False,  # Doesn't work
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,
             rolling_average_persitency=0.99,
             buffer_size=50000):
    Serializable.quick_init(self, locals())

    max_logvar = .0
    min_logvar = -10

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.name = name
    self._dataset_train = None
    self._dataset_test = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]

    self.hidden_nonlinearity = self._activations[hidden_nonlinearity]
    self.output_nonlinearity = self._activations[output_nonlinearity]
    self.hidden_sizes = hidden_sizes

    """ computation graph for training and simple inference """
    with tf.variable_scope(name):
        self.max_logvar = tf.Variable(np.ones([1, obs_space_dims]) * max_logvar,
                                      dtype=tf.float32, name="max_logvar")
        self.min_logvar = tf.Variable(np.ones([1, obs_space_dims]) * min_logvar,
                                      dtype=tf.float32, name="min_logvar")

        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        self._create_stats_vars()

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

        # create MLP
        delta_preds = []
        var_preds = []
        self.obs_next_pred = []
        with tf.variable_scope('dynamics_model'):
            mlp = MLP(name,
                      output_dim=2 * obs_space_dims,
                      hidden_sizes=self.hidden_sizes,
                      hidden_nonlinearity=self.hidden_nonlinearity,
                      output_nonlinearity=self.output_nonlinearity,
                      input_var=self.nn_input,
                      input_dim=obs_space_dims + action_space_dims)

            mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
            logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
            logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)
            var = tf.exp(logvar)

            self.delta_pred = mean
            self.var_pred = var

        # define loss and train_op
        self.loss = tf.reduce_mean((self.delta_ph - self.delta_pred) ** 2 / self.var_pred
                                   + tf.log(self.var_pred))
        self.loss += 0.01 * tf.reduce_mean(self.max_logvar) - 0.01 * tf.reduce_mean(self.min_logvar)
        self.optimizer = optimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = compile_function([self.obs_ph, self.act_ph], self.delta_pred)
        self.f_var_pred = compile_function([self.obs_ph, self.act_ph], self.var_pred)

    self._networks = [mlp]
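# --- Numerical sketch (hedged, not part of the original class) ---
# Self-contained check that the objective above,
#     mean((delta - mean)^2 / var + log(var)),
# equals the Gaussian negative log-likelihood up to affine constants.
import numpy as np

def _nll_equivalence_demo(seed=0):
    rng = np.random.RandomState(seed)
    delta, mean = rng.randn(4, 3), rng.randn(4, 3)
    var = np.exp(rng.randn(4, 3))

    loss = np.mean((delta - mean) ** 2 / var + np.log(var))
    nll = np.mean(0.5 * ((delta - mean) ** 2 / var + np.log(var) + np.log(2 * np.pi)))
    assert np.isclose(loss, 2 * nll - np.log(2 * np.pi))
    return loss, nll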
def __init__(self,
             name,
             env,
             num_models=5,
             hidden_sizes=(512, 512),
             hidden_nonlinearity='swish',
             output_nonlinearity=None,
             batch_size=500,
             learning_rate=0.001,
             weight_normalization=False,  # Doesn't work
             normalize_input=True,
             optimizer=tf.train.AdamOptimizer,
             valid_split_ratio=0.2,  # 0.1
             rolling_average_persitency=0.99,
             buffer_size=50000,
             loss_str='MSE'):
    Serializable.quick_init(self, locals())

    max_logvar = 1
    min_logvar = 0.1

    self.normalization = None
    self.normalize_input = normalize_input
    self.next_batch = None

    self.valid_split_ratio = valid_split_ratio
    self.rolling_average_persitency = rolling_average_persitency

    self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
    self.buffer_size_test = int(buffer_size * valid_split_ratio)
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.num_models = num_models
    self.hidden_sizes = hidden_sizes
    self.name = name
    self._dataset_train = None
    self._dataset_test = None

    # determine dimensionality of state and action space
    self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
    self.action_space_dims = action_space_dims = env.action_space.shape[0]
    self.timesteps_counter = 0
    self.used_timesteps_counter = 0

    self.hidden_nonlinearity = hidden_nonlinearity = self._activations[hidden_nonlinearity]
    self.output_nonlinearity = output_nonlinearity = self._activations[output_nonlinearity]

    """ computation graph for training and simple inference """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        # placeholders
        self.obs_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        self._create_stats_vars()

        # concatenate action and observation --> NN input
        self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)
        obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

        # create MLPs
        mlps = []
        delta_preds = []
        self.obs_next_pred = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=tf.AUTO_REUSE):
                mlp = MLP(name + '/model_{}'.format(i),
                          output_dim=obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=obs_ph[i],
                          input_dim=obs_space_dims + action_space_dims)
                mlps.append(mlp)
                delta_preds.append(mlp.output_var)

        self.delta_pred = tf.stack(delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

        # define loss and train_op
        if loss_str == 'L2':
            self.loss = tf.reduce_mean(
                tf.linalg.norm(self.delta_ph[:, :, None] - self.delta_pred, axis=1))
        elif loss_str == 'MSE':
            self.loss = tf.reduce_mean((self.delta_ph[:, :, None] - self.delta_pred) ** 2)
        else:
            raise NotImplementedError

        self.optimizer = optimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        # tensor_utils
        self.f_delta_pred = compile_function([self.obs_ph, self.act_ph], self.delta_pred)

    """ computation graph for inference where each of the models receives a different batch """
    with tf.variable_scope(name, reuse=True):
        # placeholders
        self.obs_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))
        self.act_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, action_space_dims))
        self.delta_model_batches_stack_ph = tf.placeholder(tf.float32, shape=(None, obs_space_dims))

        # split stack into the batches for each model --> assume each model receives a batch of the same size
        self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph, self.num_models, axis=0)
        self.act_model_batches = tf.split(self.act_model_batches_stack_ph, self.num_models, axis=0)
        self.delta_model_batches = tf.split(self.delta_model_batches_stack_ph, self.num_models, axis=0)

        # reuse previously created MLP but each model receives its own batch
        delta_preds = []
        self.obs_next_pred = []
        self.loss_model_batches = []
        self.train_op_model_batches = []
        for i in range(num_models):
            with tf.variable_scope('model_{}'.format(i), reuse=True):
                # concatenate action and observation --> NN input
                nn_input = tf.concat([self.obs_model_batches[i], self.act_model_batches[i]], axis=1)
                mlp = MLP(name + '/model_{}'.format(i),
                          output_dim=obs_space_dims,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          input_var=nn_input,
                          input_dim=obs_space_dims + action_space_dims,
                          weight_normalization=weight_normalization)
                delta_preds.append(mlp.output_var)

                if loss_str == 'L2':
                    loss = tf.reduce_mean(
                        tf.linalg.norm(self.delta_model_batches[i] - mlp.output_var, axis=1))
                elif loss_str == 'MSE':
                    loss = tf.reduce_mean((self.delta_model_batches[i] - mlp.output_var) ** 2)
                else:
                    raise NotImplementedError

                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))

        self.delta_pred_model_batches_stack = tf.concat(
            delta_preds, axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

        # tensor_utils
        self.f_delta_pred_model_batches = compile_function(
            [self.obs_model_batches_stack_ph, self.act_model_batches_stack_ph],
            self.delta_pred_model_batches_stack)

    self._networks = mlps
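# --- Usage sketch (hedged, not part of the original class) ---
# The *_model_batches_stack_ph placeholders above expect the per-model batches
# stacked along axis 0, i.e. shape (num_models * batch_size_per_model, dims).
# Shapes and names below are illustrative only.
import numpy as np

def _stack_model_batches_demo(num_models=5, batch_size_per_model=32,
                              obs_dims=20, act_dims=6):
    obs_batches = [np.random.randn(batch_size_per_model, obs_dims)
                   for _ in range(num_models)]
    act_batches = [np.random.randn(batch_size_per_model, act_dims)
                   for _ in range(num_models)]
    obs_stack = np.concatenate(obs_batches, axis=0)  # --> obs_model_batches_stack_ph
    act_stack = np.concatenate(act_batches, axis=0)  # --> act_model_batches_stack_ph
    # f_delta_pred_model_batches(obs_stack, act_stack) would return an array of
    # shape (num_models * batch_size_per_model, obs_dims)
    return obs_stack, act_stack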