def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, **kwargs):
    """The state-value network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), the next observation (o_2), and the
            next goal (g_2)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.o_2_tf = inputs_tf['o_2']
    self.g_2_tf = inputs_tf['g_2']

    # Prepare inputs for actor and critic.
    o = self.o_stats.normalize(self.o_tf)
    g = self.g_stats.normalize(self.g_tf)
    o_2 = self.o_stats.normalize(self.o_2_tf)
    g_2 = self.g_stats.normalize(self.g_2_tf)

    # Networks: V(o, g) and V(o_2, g_2) for the next state, sharing weights
    # via reuse=True.
    with tf.variable_scope('V'):
        input_V = tf.concat(axis=1, values=[o, g])
        self.V_tf = nn(input_V, [self.hidden] * self.layers + [1])
        input_V_2 = tf.concat(axis=1, values=[o_2, g_2])
        self.V_2_tf = nn(input_V_2, [self.hidden] * self.layers + [1],
                         reuse=True)
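# For reference: a minimal sketch of the `nn` MLP helper that every variant in
# this file calls. It is consistent with how `nn` is invoked here (ReLU hidden
# layers, linear output, weight sharing via `reuse`) and with the helper in
# baselines.her.util; the initializer choice is an assumption rather than a
# guaranteed match of the upstream definition. Note that attributes such as
# self.o_stats, self.hidden, and self.layers are assumed to be bound from the
# constructor arguments by a @store_args decorator, as in baselines HER.
import tensorflow as tf

def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
    """Builds a fully connected network: ReLU hidden layers, linear output."""
    for i, size in enumerate(layers_sizes):
        activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
        input = tf.layers.dense(
            input,
            size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            reuse=reuse,
            name=name + '_' + str(i),
            activation=activation)
        if flatten:
            assert layers_sizes[-1] == 1
            input = tf.reshape(input, [-1])
    return input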
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, sac, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the latent/skill vector (z), the goal (g), and
            the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
        sac (bool): if True, use a squashed-Gaussian SAC policy head;
            otherwise use a deterministic DDPG-style tanh policy
    """
    self.o_tf = inputs_tf['o']
    self.z_tf = inputs_tf['z']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Prepare inputs for actor and critic.
    o = self.o_stats.normalize(self.o_tf)
    g = self.g_stats.normalize(self.g_tf)
    z = self.z_tf
    input_pi = tf.concat(axis=1, values=[o, z, g])  # for actor

    # Policy net.
    if sac:
        with tf.variable_scope('pi'):
            mu, pi, logp_pi = mlp_gaussian_policy(input_pi, self.dimu,
                                                  self.hidden, self.layers)
            mu, pi, self.logp_pi_tf = apply_squashing_func(mu, pi, logp_pi)
            # Make sure actions are in the correct range.
            self.mu_tf = mu * self.max_u
            self.pi_tf = pi * self.max_u
            self.neg_logp_pi_tf = -self.logp_pi_tf
    else:  # DDPG
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(input_pi, [self.hidden] * self.layers + [self.dimu]))

    # Q value net.
    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(axis=1, values=[o, z, g, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(axis=1, values=[o, z, g, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
def __init__(self, inputs_tf, image_input_shapes, dimo, dimg, dimu, max_u,
             o_stats, g_stats, hidden, layers, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        image_input_shapes (dict): target shapes used to reshape the flat
            observation (o) and goal (g) tensors
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Prepare inputs for actor and critic.
    o = self.o_stats.normalize(self.o_tf)
    g = self.g_stats.normalize(self.g_tf)
    o = tf.reshape(o, [-1, *image_input_shapes['o']])
    g = tf.reshape(g, [-1, *image_input_shapes['g']])

    # Networks: encode the image observation with a CNN stream; the goal is
    # used directly (a second, weight-shared CNN stream for the goal is left
    # disabled). Note the goal must be rank-2 after the reshape above for the
    # concat with the flat observation features to be valid.
    x_o = cnn_one_stream(o, scope='phi', reuse=False)
    # x_g = cnn_one_stream(g, scope='phi', reuse=True)
    x_g = g
    x_concat = tf.concat(axis=1, values=[x_o, x_g])

    with tf.variable_scope('pi'):
        self.pi_tf = self.max_u * tf.tanh(
            nn(x_concat, [self.hidden] * self.layers + [self.dimu]))
    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(axis=1, values=[x_concat, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(axis=1, values=[x_concat, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
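# `cnn_one_stream` is not defined in this section. Below is a hypothetical
# minimal sketch of such an image encoder (conv stack + flatten + dense); the
# layer counts, filter sizes, strides, and `feature_dim` are illustrative
# assumptions, not the original implementation.
import tensorflow as tf

def cnn_one_stream(x, scope, reuse=False, feature_dim=64):
    """Encodes an image batch [B, H, W, C] into flat features [B, feature_dim]."""
    with tf.variable_scope(scope, reuse=reuse):
        h = tf.layers.conv2d(x, 32, 8, strides=4, activation=tf.nn.relu)
        h = tf.layers.conv2d(h, 64, 4, strides=2, activation=tf.nn.relu)
        h = tf.layers.conv2d(h, 64, 3, strides=1, activation=tf.nn.relu)
        h = tf.layers.flatten(h)
        return tf.layers.dense(h, feature_dim, activation=tf.nn.relu)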
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, normalize_obs=True, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
        normalize_obs (bool): whether to normalize the observations and goals
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Prepare inputs for actor and critic.
    if normalize_obs:
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)
    else:
        o = self.o_tf
        g = self.g_tf
    input_pi = tf.concat(axis=1, values=[o, g])  # for actor

    # Networks.
    with tf.variable_scope('pi'):
        self.pi_tf = self.max_u * tf.tanh(
            nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
def mlp_gaussian_policy(x, act_dim, hidden, layers):
    net = nn(x, [hidden] * (layers + 1))
    mu = tf.layers.dense(net, act_dim, activation=None)
    # Squash the log-std head into [LOG_STD_MIN, LOG_STD_MAX] via tanh.
    log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
    log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
    std = tf.exp(log_std)
    # Reparameterized sample and its log-probability under the Gaussian.
    pi = mu + tf.random_normal(tf.shape(mu)) * std
    logp_pi = gaussian_likelihood(pi, mu, log_std)
    return mu, pi, logp_pi
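# `gaussian_likelihood` and `apply_squashing_func` are referenced above but
# not defined in this section. These sketches follow the standard
# squashed-Gaussian SAC formulation (as in OpenAI Spinning Up); the EPS
# constant and the usual bounds LOG_STD_MIN = -20, LOG_STD_MAX = 2 are
# assumptions consistent with that reference, not confirmed by this file.
import numpy as np
import tensorflow as tf

EPS = 1e-8
LOG_STD_MIN = -20
LOG_STD_MAX = 2

def gaussian_likelihood(x, mu, log_std):
    """Log-density of x under a diagonal Gaussian N(mu, exp(log_std)^2)."""
    pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)

def clip_but_pass_gradient(x, l=-1., u=1.):
    """Clips x to [l, u] in the forward pass while passing gradients through."""
    clip_up = tf.cast(x > u, tf.float32)
    clip_low = tf.cast(x < l, tf.float32)
    return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

def apply_squashing_func(mu, pi, logp_pi):
    """Applies tanh squashing and corrects logp_pi by the change of variables."""
    mu = tf.tanh(mu)
    pi = tf.tanh(pi)
    logp_pi -= tf.reduce_sum(
        tf.log(clip_but_pass_gradient(1 - pi ** 2, l=0, u=1) + 1e-6), axis=1)
    return mu, pi, logp_pi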
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Prepare inputs for actor and critic.
    o = self.o_stats.normalize(self.o_tf)
    g = self.g_stats.normalize(self.g_tf)

    # Split the observation into environment features and per-block features.
    num_blocks = (o.get_shape().as_list()[1] - ENV_FEATURES) // BLOCK_FEATURES
    obs_env = tf.slice(o, [0, 0], [-1, ENV_FEATURES])
    obs_blocks = tf.slice(o, [0, ENV_FEATURES], [-1, -1])
    input_blocks = tf.reshape(obs_blocks, [-1, num_blocks, BLOCK_FEATURES])

    to_concat = []
    batch_size = tf.shape(obs_blocks)[0]
    with tf.variable_scope('Q'):
        for _ in range(ATTENTION_CNT):
            # Per-block MLP embedding.
            block_mlp = [64]
            obs_blocks = input_blocks
            for num_hidden in block_mlp:
                obs_blocks = tf.layers.dense(obs_blocks, num_hidden,
                                             activation=tf.nn.relu)
            obs_blocks = tf.layers.dense(obs_blocks, FEATURE_SIZE,
                                         activation=None)

            # Manually unrolled LSTM over the block sequence.
            rnn_input = tf.unstack(tf.transpose(obs_blocks, perm=[1, 0, 2]))
            RNN_HIDDEN = FEATURE_SIZE
            lstm = tf.contrib.rnn.LSTMCell(RNN_HIDDEN, state_is_tuple=True)
            hid_state = tf.zeros([batch_size, RNN_HIDDEN])
            cell_state = tf.zeros([batch_size, RNN_HIDDEN])
            state = (hid_state, cell_state)
            blocks = []
            for block in rnn_input:
                output, state = lstm(block, state)
                blocks.append(output)
            blocks = tf.stack(blocks)
            blocks = tf.transpose(blocks, perm=[1, 0, 2])

            # Add all the block embeddings together: (?, n).
            sum_blocks = tf.reduce_sum(blocks, axis=1)
            sum_mlp = [64]
            for num_hidden in sum_mlp:
                sum_blocks = tf.layers.dense(sum_blocks, num_hidden,
                                             activation=tf.nn.tanh)
            sum_blocks = tf.layers.dense(sum_blocks, FEATURE_SIZE,
                                         activation=None)

            # Cosine-similarity attention between the summary and each block
            # embedding: tile the summary to (?, num_blocks, n) and
            # L2-normalize both sides.
            attention = tf.expand_dims(sum_blocks, 1)        # (?, 1, n)
            attention = tf.tile(attention, [1, num_blocks, 1])
            attention = tf.nn.l2_normalize(attention, axis=2)
            norm_block_emb = tf.nn.l2_normalize(blocks, axis=2)
            # (?, num_blocks)
            weights = tf.reduce_sum(attention * norm_block_emb, axis=2)
            weights = tf.nn.softmax(weights, axis=1)

            # Hard selection: gather the raw features of the top-weighted
            # block, pairing each batch row with its argmax block index.
            sindex = tf.argmax(weights, axis=1, output_type=tf.int32)
            findex = tf.range(tf.shape(sindex)[0])
            index = tf.stack([findex, sindex])
            index = tf.transpose(index, perm=[1, 0])
            chosen_block = tf.gather_nd(input_blocks, index)
            self.block_weights = weights

            # Soft readout: attention-weighted sum of block embeddings.
            weights = tf.expand_dims(weights, 2)             # (?, num_blocks, 1)
            weights = tf.tile(weights, [1, 1, FEATURE_SIZE]) # (?, num_blocks, n)
            weighted = weights * blocks
            gated_obs = tf.reduce_sum(weighted, axis=1)      # (?, n)

            to_concat.append(gated_obs)
            to_concat.append(chosen_block)

    gated_obs = tf.concat(axis=1, values=to_concat)
    input_pi = tf.concat(axis=1, values=[obs_env, gated_obs, g])  # for actor

    # Networks.
    with tf.variable_scope('pi'):
        self.pi_tf = self.max_u * tf.tanh(
            nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(
            axis=1, values=[obs_env, gated_obs, g, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(
            axis=1, values=[obs_env, gated_obs, g, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
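# A tiny standalone check of the batched top-1 gather pattern used above:
# pair each batch row index with its argmax block index, then use
# tf.gather_nd to pull out one block per sample. Shapes are illustrative.
import numpy as np
import tensorflow as tf

blocks = tf.constant(np.arange(2 * 3 * 4, dtype=np.float32).reshape(2, 3, 4))
weights = tf.constant([[0.1, 0.7, 0.2],
                       [0.5, 0.2, 0.3]], dtype=tf.float32)
sindex = tf.argmax(weights, axis=1, output_type=tf.int32)      # [1, 0]
findex = tf.range(tf.shape(sindex)[0])                         # [0, 1]
index = tf.transpose(tf.stack([findex, sindex]), perm=[1, 0])  # [[0,1],[1,0]]
chosen = tf.gather_nd(blocks, index)                           # shape (2, 4)

with tf.Session() as sess:
    # Row 0 -> block 1 of sample 0; row 1 -> block 0 of sample 1.
    print(sess.run(chosen))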
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Prepare inputs for actor and critic.
    o = self.o_stats.normalize(self.o_tf)
    g = self.g_stats.normalize(self.g_tf)

    num_blocks = (o.get_shape().as_list()[1] - ENV_FEATURES) // BLOCK_FEATURES
    obs_env = tf.slice(o, [0, 0], [-1, ENV_FEATURES])
    obs_blocks = tf.slice(o, [0, ENV_FEATURES], [-1, -1])
    batch_size = tf.shape(obs_blocks)[0]

    with tf.variable_scope('pi'):
        # Per-block gate weights in (-1, 1): (?, num_blocks).
        # (An alternative two-layer sigmoid head is left disabled.)
        # hidden = tf.layers.dense(obs_blocks, FEATURE_SIZE, activation=tf.nn.relu)
        # attention_weights = tf.layers.dense(hidden, num_blocks, activation=tf.sigmoid)
        attention_weights = tf.layers.dense(obs_blocks, num_blocks,
                                            activation=tf.tanh)
        self.block_weights = attention_weights

        # (?, num_blocks, BLOCK_FEATURES)
        input_blocks = tf.reshape(obs_blocks,
                                  [-1, num_blocks, BLOCK_FEATURES])
        # (?, num_blocks, 1) -> (?, num_blocks, BLOCK_FEATURES)
        weights = tf.expand_dims(attention_weights, 2)
        weights = tf.tile(weights, [1, 1, BLOCK_FEATURES])
        weighted = weights * input_blocks
        # (?, num_blocks * BLOCK_FEATURES)
        gated_obs = tf.reshape(weighted, [-1, num_blocks * BLOCK_FEATURES])

        input_pi = tf.concat(axis=1, values=[obs_env, gated_obs])  # for actor

        # Networks (the actor lives in the same 'pi' scope as the gate).
        self.pi_tf = self.max_u * tf.tanh(
            nn(input_pi, [self.hidden] * self.layers + [self.dimu]))

    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    o = self.o_tf
    env_size = tf.constant(ENV_FEATURES, tf.int32)
    block_size = tf.constant(BLOCK_FEATURES, tf.int32)
    batch_size = tf.shape(o)[0]
    obs_shape = tf.shape(o)[1]
    max_num_blocks = tf.cast((obs_shape - env_size) / block_size, tf.int32)

    # The first observation element stores the per-sample block count.
    num_blocks = tf.reshape(tf.slice(o, [0, 0], [-1, 1]), [-1])
    num_blocks = tf.cast(num_blocks, tf.int32)
    o = tf.slice(o, [0, 1], [-1, -1])
    o = self.o_stats.normalize(o)

    obs_env = tf.slice(o, [0, 0], [-1, ENV_FEATURES])
    obs_blocks = tf.slice(o, [0, ENV_FEATURES], [-1, -1])
    input_blocks = tf.reshape(obs_blocks,
                              [-1, max_num_blocks, BLOCK_FEATURES])

    to_concat = []
    with tf.variable_scope('Q'):
        for _ in range(ATTENTION_CNT):
            # Per-block MLP embedding.
            block_mlp = [64]
            obs_blocks = input_blocks
            for num_hidden in block_mlp:
                obs_blocks = tf.layers.dense(obs_blocks, num_hidden,
                                             activation=tf.nn.relu)
            obs_blocks = tf.layers.dense(obs_blocks, FEATURE_SIZE,
                                         activation=None)

            RNN_HIDDEN = FEATURE_SIZE
            lstm = tf.contrib.rnn.LSTMCell(RNN_HIDDEN, state_is_tuple=True)
            # A Python for loop doesn't work with a dynamic block count; use
            # tf.nn.dynamic_rnn instead.
            # https://stackoverflow.com/questions/43341374/tensorflow-dynamic-rnn-lstm-how-to-format-input
            blocks, _ = tf.nn.dynamic_rnn(lstm, obs_blocks,
                                          sequence_length=num_blocks,
                                          dtype=tf.float32)

            # Add all the block embeddings together: (?, n).
            sum_blocks = tf.reduce_sum(blocks, axis=1)
            sum_mlp = [64]
            for num_hidden in sum_mlp:
                sum_blocks = tf.layers.dense(sum_blocks, num_hidden,
                                             activation=tf.nn.tanh)
            sum_blocks = tf.layers.dense(sum_blocks, FEATURE_SIZE,
                                         activation=None)

            # Cosine-similarity attention between the summary and each block
            # embedding, as in the fixed-count variant above.
            attention = tf.expand_dims(sum_blocks, 1)        # (?, 1, n)
            attention = tf.tile(attention, [1, max_num_blocks, 1])
            attention = tf.nn.l2_normalize(attention, axis=2)
            norm_block_emb = tf.nn.l2_normalize(blocks, axis=2)
            # (?, max_num_blocks)
            weights = tf.reduce_sum(attention * norm_block_emb, axis=2)
            weights = tf.nn.softmax(weights, axis=1)

            # Hard selection of the top-weighted block's raw features.
            sindex = tf.argmax(weights, axis=1, output_type=tf.int32)
            findex = tf.range(tf.shape(sindex)[0])
            index = tf.stack([findex, sindex])
            index = tf.transpose(index, perm=[1, 0])
            chosen_block = tf.gather_nd(input_blocks, index)
            self.block_weights = weights

            # Soft readout: attention-weighted sum of block embeddings.
            weights = tf.expand_dims(weights, 2)
            weights = tf.tile(weights, [1, 1, FEATURE_SIZE])
            weighted = weights * blocks
            gated_obs = tf.reduce_sum(weighted, axis=1)      # (?, n)

            to_concat.append(gated_obs)
            to_concat.append(chosen_block)

    gated_obs = tf.concat(axis=1, values=to_concat)
    input_pi = tf.concat(axis=1, values=[obs_env, gated_obs])  # for actor

    # Networks.
    with tf.variable_scope('pi'):
        self.pi_tf = self.max_u * tf.tanh(
            nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(
            axis=1, values=[obs_env, gated_obs, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(
            axis=1, values=[obs_env, gated_obs, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
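# A small standalone check of the sequence_length masking relied on above:
# tf.nn.dynamic_rnn zeroes outputs past each sample's length, so the
# subsequent reduce_sum over blocks only aggregates valid steps. Shapes are
# illustrative.
import numpy as np
import tensorflow as tf

cell = tf.contrib.rnn.LSTMCell(5, state_is_tuple=True)
x = tf.constant(np.random.randn(2, 4, 3), dtype=tf.float32)  # 2 samples, 4 steps
lengths = tf.constant([2, 4], dtype=tf.int32)
outputs, _ = tf.nn.dynamic_rnn(cell, x, sequence_length=lengths,
                               dtype=tf.float32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(outputs)
    print(out[0, 2:])  # all zeros: steps beyond length 2 are masked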
def __init__(self, inputs_tf, dimo, dimz, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, env_name, **kwargs):
    """The discriminator network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the skill latent (z), and the goal (g)
        dimo (int): the dimension of the observations
        dimz (int): the dimension of the skill latents
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
        env_name (str): name of the environment, used to split observations
    """
    self.o_tf = tf.placeholder(tf.float32, shape=(None, self.dimo))
    self.z_tf = tf.placeholder(tf.float32, shape=(None, self.dimz))
    self.g_tf = tf.placeholder(tf.float32, shape=(None, self.dimg))

    # self.o_tau_tf (batched trajectory observations) is assumed to be set
    # elsewhere, e.g. bound from the constructor via @store_args.
    obs_tau_excludes_goal, obs_tau_achieved_goal = split_observation_tf(
        self.env_name, self.o_tau_tf)
    obs_excludes_goal, obs_achieved_goal = split_observation_tf(
        self.env_name, self.o_tf)

    # Discriminator networks.
    with tf.variable_scope('state_mi'):
        # Mutual Information Neural Estimation (MINE): shuffle the
        # achieved-goal trajectories across the batch to form marginal
        # samples, then concatenate joint and marginal samples.
        x_in = obs_tau_excludes_goal
        y_in = obs_tau_achieved_goal
        y_in_tran = tf.transpose(y_in, perm=[1, 0, 2])
        y_shuffle_tran = tf.random_shuffle(y_in_tran)
        y_shuffle = tf.transpose(y_shuffle_tran, perm=[1, 0, 2])
        x_conc = tf.concat([x_in, x_in], axis=-2)
        y_conc = tf.concat([y_in, y_shuffle], axis=-2)

        # Propagate the forward pass.
        layerx = tf_layers.linear(x_conc, int(self.hidden / 2))
        layery = tf_layers.linear(y_conc, int(self.hidden / 2))
        layer2 = tf.nn.relu(layerx + layery)
        output = tf_layers.linear(layer2, 1)
        output = tf.nn.tanh(output)

        # Split into T(x, y) (joint) and T(x, y') (marginal) predictions.
        N_samples = tf.shape(x_in)[-2]
        T_xy = output[:, :N_samples, :]
        T_x_y = output[:, N_samples:, :]

        # Compute the negative Donsker-Varadhan bound
        # (maximizing the bound == minimizing its negation).
        mean_exp_T_x_y = tf.reduce_mean(tf.math.exp(T_x_y), axis=-2)
        neg_loss = -(tf.reduce_mean(T_xy, axis=-2)
                     - tf.math.log(mean_exp_T_x_y))
        neg_loss = tf.check_numerics(neg_loss,
                                     'check_numerics caught bad neg_loss')
        self.mi_tf = neg_loss

    with tf.variable_scope('skill_ds'):
        # Skill discriminator: predict the skill latent z from the achieved
        # goal; its negative cross-entropy serves as an intrinsic reward.
        self.logits_tf = nn(obs_achieved_goal,
                            [int(self.hidden / 2)] * self.layers + [self.dimz])
        self.sk_tf = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.z_tf, logits=self.logits_tf)
        self.sk_r_tf = -1 * self.sk_tf
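# `split_observation_tf` is not defined in this section. A hypothetical
# minimal sketch follows: it splits the flat observation into a non-goal part
# and an achieved-goal part along the last axis, with per-environment slice
# sizes. The dictionary below and the assumption that the achieved goal sits
# at the end of the observation are illustrative, not the original code.
import tensorflow as tf

# Hypothetical per-environment sizes of the achieved-goal slice.
ACHIEVED_GOAL_DIMS = {'FetchReach-v1': 3}

def split_observation_tf(env_name, o):
    """Returns (obs_excludes_goal, obs_achieved_goal), split on the last axis."""
    goal_dim = ACHIEVED_GOAL_DIMS[env_name]
    obs_excludes_goal = o[..., :-goal_dim]
    obs_achieved_goal = o[..., -goal_dim:]
    return obs_excludes_goal, obs_achieved_goal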
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, env=None, n_arms=None, normalized=False,
             **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
        env (gym.Env): environment instance; its unwrapped reshaper is used to
            un-linearize the flat observation
        n_arms (int): number of arms (unused in this variant)
        normalized (bool): whether to normalize the observations and goals
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Un-linearize the observation.
    self.env = env.unwrapped
    o_normed = self.o_stats.normalize(self.o_tf)
    obs_dict = self.env.reshaper.unlinearize(self.o_tf)
    obs_dict_normed = self.env.reshaper.unlinearize(o_normed)

    # Prepare inputs for actor and critic.
    if not normalized:
        o = tf.layers.Flatten()(obs_dict['observation'])
        g = self.g_tf
    else:
        o = tf.layers.Flatten()(obs_dict_normed['observation'])
        g = self.g_stats.normalize(self.g_tf)
        # g = tf.stop_gradient(g)
    input_pi = tf.concat(axis=1, values=[o, g])  # for actor

    # Networks.
    with tf.variable_scope('pi'):
        self.pi_tf = self.max_u * tf.tanh(
            nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
    with tf.variable_scope('Q'):
        # For policy training.
        input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
        self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
        # For critic training.
        input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
        self._input_Q = input_Q  # exposed for tests
        self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)

    total_params()
def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
             hidden, layers, n_arms, learn_kin=False, conn_type='sums',
             env=None, normalized=False, **kwargs):
    """The actor-critic network and related training code.

    Args:
        inputs_tf (dict of tensors): all necessary inputs for the network: the
            observation (o), the goal (g), and the action (u)
        dimo (int): the dimension of the observations
        dimg (int): the dimension of the goals
        dimu (int): the dimension of the actions
        max_u (float): the maximum magnitude of actions; action outputs will be
            scaled accordingly
        o_stats (baselines.her.Normalizer): normalizer for observations
        g_stats (baselines.her.Normalizer): normalizer for goals
        hidden (int): number of hidden units that should be used in hidden layers
        layers (int): number of hidden layers
        n_arms (int): number of arms; one actor-critic pair is built per arm
        learn_kin (bool): whether to learn the kinematics (unused here)
        conn_type (str): how per-arm critics are combined ('sums')
        env (gym.Env): environment instance; its unwrapped reshaper is used to
            un-linearize the flat observation
        normalized (bool): whether to normalize the observations
    """
    self.o_tf = inputs_tf['o']
    self.g_tf = inputs_tf['g']
    self.u_tf = inputs_tf['u']

    # Calculate the gradients of g prior to normalization.
    # (Currently unused downstream; the per-arm gradient features below come
    # from the observation dict instead.)
    loss = calculate_loss(self.g_tf)
    dl_dg = tf.gradients(loss, self.g_tf)

    # Access to the environment type.
    self.env = env.unwrapped

    # N arms.
    self.n_arms = n_arms

    # Reshape inputs for the number of arms.
    u = narm_reshape(self.u_tf, n_arms)

    # Normalize the observations.
    observations = self.o_tf
    if normalized:
        observations = self.o_stats.normalize(observations)

    # Extract observation specifics.
    obs_dict = self.env.reshaper.unlinearize(observations)
    o = obs_dict['observation']
    gradL = [obs_dict['jacp{}'.format(i)] for i in range(n_arms)]

    ########################################################################
    # Solve a quadratic to get an equal number of parameters.
    # FIXME not working right
    ########################################################################
    hidden = solve_quadratic(self.layers, self.dimo, self.dimg, self.dimu,
                             o2=o[:, 0].shape.as_list()[-1],
                             g2=1,
                             u2=u[:, 0].shape.as_list()[-1],
                             H=self.hidden,
                             n=n_arms) - 1
    ########################################################################

    # Outputs.
    pi_tfs = [None] * n_arms
    Q_pi_tfs = [None] * n_arms
    Q_tfs = [None] * n_arms

    for i in range(n_arms):
        # Observations and actions for arm i.
        o_i = o[:, i]
        u_i = u[:, i]
        # Differentiation chain for the method.
        g_i = gradL[i]

        # Input for pi.
        input_pis_i = tf.concat(axis=1, values=[o_i, g_i])
        with tf.variable_scope('pi{}'.format(i)):
            pi_tfs[i] = self.max_u * tf.tanh(
                nn(input_pis_i, [hidden] * self.layers + [1]))
        with tf.variable_scope('Q{}'.format(i)):
            # For policy training.
            input_Q_1_i = tf.concat(
                axis=1, values=[o_i, g_i, pi_tfs[i] / self.max_u])
            Q_pi_tfs[i] = nn(input_Q_1_i, [hidden] * self.layers + [1])
            # For critic training.
            input_Q_2_i = tf.concat(axis=1,
                                    values=[o_i, g_i, u_i / self.max_u])
            Q_tfs[i] = nn(input_Q_2_i, [hidden] * self.layers + [1],
                          reuse=True)

    with tf.variable_scope('pi'):
        self.pi_tf = tf.concat(axis=1, values=pi_tfs)
    with tf.variable_scope('Q'):
        # Combine the per-arm critics by summation ('sums' connection type).
        self.Q_pi_tf = sum(Q_pi_tfs)
        self.Q_tf = sum(Q_tfs)

    total_params()
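# `narm_reshape` is not defined in this section. A hypothetical minimal
# sketch, assuming it splits the flat action tensor into one slice per arm:
# (batch, n_arms * d) -> (batch, n_arms, d). The even split is an assumption.
import tensorflow as tf

def narm_reshape(x, n_arms):
    """Reshapes (batch, n_arms * d) into (batch, n_arms, d)."""
    per_arm = x.get_shape().as_list()[-1] // n_arms
    return tf.reshape(x, [-1, n_arms, per_arm])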