import os

import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers


# Variant 1: tabular SF network. A single linear layer maps one-hot states to
# successor features; workers apply their gradients to the shared 'global' copy.
def __init__(self, scope, config, action_size, nb_states):
    self._scope = scope
    self._conv_layers = config.conv_layers
    self._fc_layers = config.fc_layers
    self._action_size = action_size
    self._nb_options = config.nb_options
    self._nb_envs = config.num_agents
    self._config = config
    self.option = 0
    self._sf_layers = config.sf_layers
    self._deconv_layers = config.deconv_layers
    self._network_optimizer = config.network_optimizer(
        self._config.lr, name='network_optimizer')

    with tf.variable_scope(scope):
      self.observation = tf.placeholder(shape=[None, nb_states],
                                        dtype=tf.float32, name="Inputs")
      # One linear layer maps the one-hot state to its successor features.
      self.sf = layers.fully_connected(self.observation,
                                       num_outputs=nb_states,
                                       activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="sf")

      if scope != 'global':
        # self.actions_placeholder = tf.placeholder(shape=[None], dtype=tf.int32, name="Actions")
        # self.actions_onehot = tf.one_hot(self.actions_placeholder, self._action_size,
        #                                  dtype=tf.float32, name="Actions_Onehot")
        self.target_sf = tf.placeholder(shape=[None, nb_states],
                                        dtype=tf.float32, name="target_SF")

        with tf.name_scope('sf_loss'):
          sf_td_error = self.target_sf - self.sf
          self.sf_loss = tf.reduce_mean(tf.square(sf_td_error))

        self.loss = self.sf_loss  # + self.instant_r_loss
        loss_summaries = [tf.summary.scalar('avg_sf_loss', self.sf_loss)]

        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(gradients,
                                                        self._config.gradient_clip_value)

        # for grad, weight in zip(grads, local_vars):
        #   if grad is not None:
        #     self.summaries.append(tf.summary.histogram(weight.name + '_grad', grad))
        #     self.summaries.append(tf.summary.histogram(weight.name, weight))

        self.merged_summary = tf.summary.merge(loss_summaries + [
            tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
            tf.summary.scalar('clipped_gradient_norm', tf.global_norm(grads)),
            gradient_summaries(zip(grads, local_vars))])
        global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        # Worker gradients update the shared ('global') weights.
        self.apply_grads = self._network_optimizer.apply_gradients(zip(grads, global_vars))
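# `gradient_summaries` is called by every network in this file but is not
# defined in this section. The following is a minimal sketch of a compatible
# helper, an assumption based on the commented-out histogram code above: it
# merges per-variable gradient histograms into a single summary op, which
# matches how its result is placed inside the tf.summary.merge lists.
def gradient_summaries(grad_vars):
  # grad_vars: iterable of (gradient, variable) pairs, e.g. zip(grads, local_vars).
  summaries = []
  for grad, var in grad_vars:
    if grad is not None:
      summaries.append(tf.summary.histogram(var.op.name + '/gradient', grad))
  return tf.summary.merge(summaries)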
# Variant 2: stacked linear SF layers over the raw state vector, with an SVD
# node over the full [nb_states, sf_dim] successor-feature matrix. Any scope
# other than 'target' builds the loss and applies gradients locally.
def __init__(self, scope, config, action_size, nb_states):
    self._scope = scope
    self.nb_states = nb_states
    # self.conv_layers = config.conv_layers
    self.fc_layers = config.fc_layers
    self.sf_layers = config.sf_layers
    self.aux_fc_layers = config.aux_fc_layers
    # self.aux_deconv_layers = config.aux_deconv_layers
    self.action_size = action_size
    self.nb_options = config.nb_options
    self.nb_envs = config.num_agents
    self.config = config
    self.network_optimizer = config.network_optimizer(
        self.config.lr, name='network_optimizer')

    with tf.variable_scope(scope):
      self.observation = tf.placeholder(shape=[None, self.nb_states],
                                        dtype=tf.float32, name="Inputs")
      self.summaries = []

      out = self.observation
      with tf.variable_scope("sf"):
        for i, nb_filt in enumerate(self.sf_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt,
                                       activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="sf_{}".format(i))
          # if i < len(self.sf_layers) - 1:
          #   # out = layer_norm_fn(out, relu=True)
          #   out = tf.nn.relu(out)
          # self.summaries.append(tf.contrib.layers.summarize_activation(out))
        self.sf = out

      if scope != 'target':
        self.target_sf = tf.placeholder(shape=[None, self.sf_layers[-1]],
                                        dtype=tf.float32, name="target_SF")
        self.matrix_sf = tf.placeholder(shape=[self.nb_states, self.sf_layers[-1]],
                                        dtype=tf.float32, name="matrix_sf")
        # SVD of the successor-feature matrix; the right-singular vectors
        # serve as directions in feature space.
        self.s, self.u, self.v = tf.svd(self.matrix_sf)
        self.matrix_fi = tf.placeholder(shape=[self.nb_states, self.fc_layers[-1]],
                                        dtype=tf.float32, name="matrix_fi")

        self.statistics_summaries = []
        # Note: the observation here is a flat [batch, nb_states] tensor, and
        # tf.summary.image requires a 4-D image batch, so this summary is
        # disabled for this variant.
        # self.statistics_summaries.append(
        #     tf.summary.image('input', self.observation * 255, max_outputs=30))

        with tf.name_scope('sf_loss'):
          sf_td_error = self.target_sf - self.sf
          self.sf_loss = tf.reduce_mean(tf.square(sf_td_error))

        self.loss = self.sf_loss
        loss_summaries = [tf.summary.scalar('avg_sf_loss', self.sf_loss)]

        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(gradients,
                                                        self.config.gradient_clip_value)

        self.merged_summary = tf.summary.merge(self.summaries + loss_summaries + [
            tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
            tf.summary.scalar('clipped_gradient_norm', tf.global_norm(grads)),
            gradient_summaries(zip(grads, local_vars))])
        # This variant applies gradients to its own variables rather than a
        # shared global copy.
        self.apply_grads = self.network_optimizer.apply_gradients(zip(grads, local_vars))
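# The SVD node above is typically evaluated outside the training loop: the
# agent collects the successor features of every state into a
# [nb_states, sf_dim] matrix and feeds it through `matrix_sf`. A hedged usage
# sketch; `sess`, `net`, and `collected_sf` are assumed to exist elsewhere.
def top_sf_directions(sess, net, collected_sf, nb_options):
  # Evaluate the SVD defined in the graph on the collected SF matrix.
  _, _, v = sess.run([net.s, net.u, net.v],
                     feed_dict={net.matrix_sf: collected_sf})
  # tf.svd returns right-singular vectors as the columns of v; transpose so
  # each row is one direction and keep the leading nb_options of them.
  return np.transpose(v)[:nb_options]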
# Variant 3: fully-connected feature network with an SF head built on
# stop-gradient features, plus an action-conditioned auxiliary head that
# reconstructs the next observation. Includes feature and SF-weight L2
# regularizers.
def __init__(self, scope, config, action_size, nb_states):
    self._scope = scope
    self.nb_states = nb_states
    # self.conv_layers = config.conv_layers
    self.fc_layers = config.fc_layers
    self.sf_layers = config.sf_layers
    self.aux_fc_layers = config.aux_fc_layers
    # self.aux_deconv_layers = config.aux_deconv_layers
    self.action_size = action_size
    self.nb_options = config.nb_options
    self.nb_envs = config.num_agents
    self.config = config
    self.network_optimizer = config.network_optimizer(
        self.config.lr, name='network_optimizer')

    with tf.variable_scope(scope):
      self.observation = tf.placeholder(
          shape=[None, config.input_size[0], config.input_size[1], config.history_size],
          dtype=tf.float32, name="Inputs")
      self.image_summaries = []
      self.image_summaries.append(
          tf.summary.image('input', self.observation, max_outputs=30))
      self.summaries = []

      out = self.observation
      out = layers.flatten(out, scope="flatten")

      with tf.variable_scope("fc"):
        for i, nb_filt in enumerate(self.fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt,
                                       activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="fc_{}".format(i))
          if i < len(self.fc_layers) - 1:
            # out = layer_norm_fn(out, relu=True)
            out = tf.nn.relu(out)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))
        self.fi = out

      with tf.variable_scope("sf"):
        # The SF head sees stop-gradient features, so the SF loss does not
        # shape the representation.
        out = tf.stop_gradient(tf.nn.relu(self.fi))
        for i, nb_filt in enumerate(self.sf_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt,
                                       activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="sf_{}".format(i))
          if i < len(self.sf_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))
        self.sf = out

      with tf.variable_scope("action_fc"):
        self.actions_placeholder = tf.placeholder(shape=[None], dtype=tf.float32,
                                                  name="Actions")
        # Embed the scalar action into the feature dimension. (The original
        # scope name reused a stale loop index; a fixed name is used instead.)
        actions = layers.fully_connected(self.actions_placeholder[..., None],
                                         num_outputs=self.fc_layers[-1],
                                         activation_fn=None,
                                         variables_collections=tf.get_collection("variables"),
                                         outputs_collections="activations",
                                         scope="action_fc")

      with tf.variable_scope("aux_fc"):
        out = tf.add(self.fi, actions)
        # out = tf.nn.relu(out)
        for i, nb_filt in enumerate(self.aux_fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt,
                                       activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="aux_fc_{}".format(i))
          if i < len(self.aux_fc_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))
        self.next_obs = tf.reshape(
            out, (-1, config.input_size[0], config.input_size[1], config.history_size))
      self.image_summaries.append(
          tf.summary.image('next_obs', self.next_obs, max_outputs=30))

      if scope != 'target':
        # self.target_q_a = tf.placeholder(shape=[None], dtype=tf.float32, name="target_Q_a")
        # self.actions_onehot = tf.one_hot(tf.cast(self.actions_placeholder, tf.int32),
        #                                  self.action_size + 1, dtype=tf.float32,
        #                                  name="actions_one_hot")
        # self.q_a = tf.reduce_sum(tf.multiply(self.q, self.actions_onehot),
        #                          axis=1, name="Q_a")
        # with tf.name_scope('q_loss'):
        #   td_error = self.q_a - self.target_q_a
        #   self.q_loss = tf.reduce_mean(huber_loss(td_error))
        # self.pix_state /= 255.
        # self.pix_state -= 0.5
        # self.pix_state *= 2.

        self.target_sf = tf.placeholder(shape=[None, self.sf_layers[-1]],
                                        dtype=tf.float32, name="target_SF")
        self.target_next_obs = tf.placeholder(
            shape=[None, config.input_size[0], config.input_size[1], config.history_size],
            dtype=tf.float32, name="target_next_obs")
        self.image_summaries.append(
            tf.summary.image('target_next_obs', self.target_next_obs, max_outputs=30))

        self.matrix_sf = tf.placeholder(shape=[self.nb_states, self.sf_layers[-1]],
                                        dtype=tf.float32, name="matrix_sf")
        self.s, self.u, self.v = tf.svd(self.matrix_sf)

        with tf.name_scope('sf_loss'):
          sf_td_error = self.target_sf - self.sf
          self.sf_loss = tf.reduce_mean(huber_loss(sf_td_error))

        with tf.name_scope('aux_loss'):
          aux_error = self.next_obs - self.target_next_obs
          self.aux_loss = tf.reduce_mean(self.config.aux_coef * huber_loss(aux_error))

        # L2 regularization on the features and on the SF head weights.
        regularizer_features = tf.reduce_mean(self.config.feat_decay * tf.nn.l2_loss(self.fi))
        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        regularizer_sf_weights = tf.add_n(
            [self.config.sf_weight_decay * tf.nn.l2_loss(w)
             for w in local_vars if 'sf' in w.name])
        self.loss = (self.sf_loss + self.aux_loss +
                     regularizer_features + regularizer_sf_weights)

        loss_summaries = [tf.summary.scalar('avg_sf_loss', self.sf_loss),
                          tf.summary.scalar('aux_loss', self.aux_loss),
                          tf.summary.scalar('total_loss', self.loss)]

        gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(gradients,
                                                        self.config.gradient_clip_value)

        self.merged_summary = tf.summary.merge(
            self.image_summaries + self.summaries + loss_summaries + [
                tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
                tf.summary.scalar('clipped_gradient_norm', tf.global_norm(grads)),
                gradient_summaries(zip(grads, local_vars))])
        self.apply_grads = self.network_optimizer.apply_gradients(zip(grads, local_vars))
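# `huber_loss` is used by the SF and auxiliary losses above but is not defined
# in this section. A minimal sketch assuming the standard Huber form with
# delta = 1: quadratic for small errors, linear beyond the threshold, which
# keeps large TD errors from dominating the gradient.
def huber_loss(x, delta=1.0):
  return tf.where(tf.abs(x) < delta,
                  0.5 * tf.square(x),
                  delta * (tf.abs(x) - 0.5 * delta))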
# Variant 4: convolutional version with layer normalization, a separately
# trained Q head (action_size + 1 outputs), and a deconvolutional decoder for
# next-frame prediction. SF/aux and Q updates use disjoint variable sets.
def __init__(self, scope, config, action_size, nb_states):
    self._scope = scope
    self.nb_states = nb_states
    self.conv_layers = config.conv_layers
    self.fc_layers = config.fc_layers
    self.sf_layers = config.sf_layers
    self.aux_fc_layers = config.aux_fc_layers
    self.aux_deconv_layers = config.aux_deconv_layers
    self.action_size = action_size
    self.nb_options = config.nb_options
    self.nb_envs = config.num_agents
    self.config = config
    self.network_optimizer = config.network_optimizer(
        self.config.lr, name='network_optimizer')

    with tf.variable_scope(scope):
      self.observation = tf.placeholder(
          shape=[None, config.input_size[0], config.input_size[1], config.history_size],
          dtype=tf.float32, name="Inputs")

      self.image_summaries = []
      if self.config.history_size == 3:
        # RGB input: summarize all three channels.
        self.image_summaries.append(
            tf.summary.image('input', self.observation * 255, max_outputs=30))
      else:
        # Stacked grayscale frames: summarize only the first channel.
        self.image_summaries.append(
            tf.summary.image('input', self.observation[:, :, :, 0:1] * 255,
                             max_outputs=30))
      self.summaries = []

      out = self.observation
      with tf.variable_scope('conv'):
        for i, (kernel_size, stride, nb_kernels) in enumerate(self.conv_layers):
          out = layers.conv2d(out, num_outputs=nb_kernels, kernel_size=kernel_size,
                              stride=stride, activation_fn=None,
                              variables_collections=tf.get_collection("variables"),
                              outputs_collections="activations",
                              scope="conv_{}".format(i))
          out = layer_norm_fn(out, relu=True)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))
        out = layers.flatten(out, scope="flatten")

      with tf.variable_scope("fc"):
        for i, nb_filt in enumerate(self.fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="fc_{}".format(i))
          if i < len(self.fc_layers) - 1:
            out = layer_norm_fn(out, relu=False)
            # out = layer_norm_fn(out, relu=True)
            out = tf.nn.relu(out)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))
        self.fi = out

      out = tf.stop_gradient(layer_norm_fn(self.fi, relu=True))

      # ------------------- Adding option Q ---------------------
      self.q = layers.fully_connected(out, num_outputs=self.action_size + 1,
                                      activation_fn=None,
                                      variables_collections=tf.get_collection("variables"),
                                      outputs_collections="activations",
                                      scope="Q")

      with tf.variable_scope("sf"):
        for i, nb_filt in enumerate(self.sf_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="sf_{}".format(i))
          if i < len(self.sf_layers) - 1:
            out = layer_norm_fn(out, relu=False)
            out = tf.nn.relu(out)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))
        self.sf = out

      out = self.fi
      with tf.variable_scope("action_fc"):
        self.actions_placeholder = tf.placeholder(shape=[None], dtype=tf.float32,
                                                  name="Actions")
        # Fixed scope name; the original reused a stale loop index here.
        actions = layers.fully_connected(self.actions_placeholder[..., None],
                                         num_outputs=self.fc_layers[-1],
                                         activation_fn=None,
                                         variables_collections=tf.get_collection("variables"),
                                         outputs_collections="activations",
                                         scope="action_fc")

      out = layer_norm_fn(out, relu=False)
      out = tf.add(out, actions)
      out = layer_norm_fn(out, relu=True)

      with tf.variable_scope("aux_fc"):
        for i, nb_filt in enumerate(self.aux_fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="aux_fc_{}".format(i))
          out = layer_norm_fn(out, relu=False)
          if i > 0:
            # out = layer_norm_fn(out, relu=True)
            out = tf.nn.relu(out)
          self.summaries.append(tf.contrib.layers.summarize_activation(out))

      with tf.variable_scope("aux_deconv"):
        # Give the flat vector spatial dims so the transposed convs can upsample.
        decoder_out = tf.expand_dims(tf.expand_dims(out, 1), 1)
        for i, (kernel_size, stride, padding, nb_kernels) in enumerate(self.aux_deconv_layers):
          decoder_out = layers.conv2d_transpose(
              decoder_out, num_outputs=nb_kernels, kernel_size=kernel_size,
              stride=stride, activation_fn=None,
              padding="SAME" if padding > 0 else "VALID",
              variables_collections=tf.get_collection("variables"),
              outputs_collections="activations",
              scope="aux_deconv_{}".format(i))
          if i < len(self.aux_deconv_layers) - 1:
            decoder_out = layer_norm_fn(decoder_out, relu=False)
            decoder_out = tf.nn.relu(decoder_out)
          self.summaries.append(tf.contrib.layers.summarize_activation(decoder_out))
        self.next_obs = decoder_out

      if self.config.history_size == 3:
        self.image_summaries.append(
            tf.summary.image('next_obs', self.next_obs * 255, max_outputs=30))
      else:
        self.image_summaries.append(
            tf.summary.image('next_obs', self.next_obs[:, :, :, 0:1] * 255,
                             max_outputs=30))

      if scope != 'target':
        self.target_q_a = tf.placeholder(shape=[None], dtype=tf.float32,
                                         name="target_Q_a")
        self.actions_onehot = tf.one_hot(tf.cast(self.actions_placeholder, tf.int32),
                                         self.action_size + 1, dtype=tf.float32,
                                         name="actions_one_hot")
        self.q_a = tf.reduce_sum(tf.multiply(self.q, self.actions_onehot),
                                 axis=1, name="Q_a")

        with tf.name_scope('q_loss'):
          td_error = self.q_a - self.target_q_a
          self.q_loss = tf.reduce_mean(huber_loss(td_error))

        self.target_sf = tf.placeholder(shape=[None, self.sf_layers[-1]],
                                        dtype=tf.float32, name="target_SF")
        self.target_next_obs = tf.placeholder(
            shape=[None, config.input_size[0], config.input_size[1], config.history_size],
            dtype=tf.float32, name="target_next_obs")
        if self.config.history_size == 3:
          self.image_summaries.append(
              tf.summary.image('target_next_obs', self.target_next_obs * 255,
                               max_outputs=30))
        else:
          self.image_summaries.append(
              tf.summary.image('target_next_obs',
                               self.target_next_obs[:, :, :, 0:1] * 255,
                               max_outputs=30))

        self.matrix_sf = tf.placeholder(shape=[self.nb_states, self.sf_layers[-1]],
                                        dtype=tf.float32, name="matrix_sf")
        self.s, self.u, self.v = tf.svd(self.matrix_sf)

        with tf.name_scope('sf_loss'):
          sf_td_error = self.target_sf - self.sf
          self.sf_loss = tf.reduce_mean(tf.square(sf_td_error))

        with tf.name_scope('aux_loss'):
          aux_error = self.next_obs - self.target_next_obs
          self.aux_loss = tf.reduce_mean(self.config.aux_coef * tf.square(aux_error))

        self.loss = self.sf_loss + self.aux_loss
        loss_summaries = [tf.summary.scalar('avg_sf_loss', self.sf_loss),
                          tf.summary.scalar('aux_loss', self.aux_loss),
                          tf.summary.scalar('total_loss', self.loss)]

        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        # The Q head is trained separately, so exclude it from the SF/aux update.
        local_vars = [v for v in local_vars if 'Q' not in v.name]
        gradients = tf.gradients(self.loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads, self.grad_norms = tf.clip_by_global_norm(gradients,
                                                        self.config.gradient_clip_value)

        self.merged_summary = tf.summary.merge(
            self.image_summaries + self.summaries + loss_summaries + [
                tf.summary.scalar('gradient_norm', tf.global_norm(gradients)),
                tf.summary.scalar('clipped_gradient_norm', tf.global_norm(grads)),
                gradient_summaries(zip(grads, local_vars))])
        self.apply_grads = self.network_optimizer.apply_gradients(zip(grads, local_vars))

        # --------------- Q_loss ----------------------
        local_q_vars = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                        if "Q" in v.name]
        q_loss_summary = [tf.summary.scalar('q_loss', self.q_loss)]
        q_gradients = tf.gradients(self.q_loss, local_q_vars)
        q_grads, self.q_grad_norms = tf.clip_by_global_norm(
            q_gradients, self.config.gradient_clip_value)
        self.q_merged_summary = tf.summary.merge(q_loss_summary)
        self.q_apply_grads = self.network_optimizer.apply_gradients(
            zip(q_grads, local_q_vars))
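# `layer_norm_fn` is used throughout the convolutional variant above but is
# not defined in this section. A minimal sketch of a compatible helper, under
# the assumption that it wraps tf.contrib.layers.layer_norm with an optional
# fused ReLU:
def layer_norm_fn(x, relu=False):
  # Normalize over the feature dimensions with learned scale and offset.
  x = layers.layer_norm(x, scale=True, center=True)
  if relu:
    x = tf.nn.relu(x)
  return x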
# Variant 5: eigenoption-critic network. Builds option terminations, option
# Q-values, intra-option policies, an SF head, and an auxiliary next-frame
# head; workers train asynchronously against the 'global' copy, which also
# owns the on-disk buffer of successor features used for the SVD.
def __init__(self, scope, config, action_size, nb_states, total_steps_tensor=None):
    self._scope = scope
    self.nb_states = nb_states
    self.fc_layers = config.fc_layers
    self.sf_layers = config.sf_layers
    self.aux_fc_layers = config.aux_fc_layers
    self.action_size = action_size
    self.nb_options = config.nb_options
    self.nb_envs = config.num_agents
    self.config = config
    self.network_optimizer = config.network_optimizer(
        self.config.lr, name='network_optimizer')

    if scope == 'global':
      # The global copy persists the SF buffer between stages.
      self.sf_matrix_path = os.path.join(config.stage_logdir, "sf_matrix.npy")
      self.directions = np.zeros((config.nb_options, config.sf_layers[-1]))
      if os.path.exists(self.sf_matrix_path):
        self.sf_matrix_buffer = np.load(self.sf_matrix_path)
      else:
        self.sf_matrix_buffer = np.zeros(
            shape=(self.config.sf_matrix_size, self.config.sf_layers[-1]),
            dtype=np.float32)

    # self._exploration_options = TFLinearSchedule(self._config.explore_steps,
    #                                              self._config.final_random_action_prob,
    #                                              self._config.initial_random_action_prob)
    # if total_steps_tensor is not None:
    #   self.entropy_coef = tf.train.polynomial_decay(
    #       self.config.initial_random_action_prob, total_steps_tensor,
    #       self.config.entropy_decay_steps,
    #       self.config.final_random_action_prob, power=0.5)
    self.entropy_coef = self.config.final_random_action_prob

    with tf.variable_scope(scope):
      self.observation = tf.placeholder(
          shape=[None, config.input_size[0], config.input_size[1], config.history_size],
          dtype=tf.float32, name="Inputs")
      self.steps = tf.placeholder(shape=[], dtype=tf.int32, name="steps")

      self.image_summaries = []
      # self.image_summaries.append(tf.summary.image('input', self.observation, max_outputs=30))
      self.summaries_sf = []
      self.summaries_aux = []
      self.summaries_option = []

      out = self.observation
      out = layers.flatten(out, scope="flatten")

      with tf.variable_scope("fi"):
        for i, nb_filt in enumerate(self.fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="fc_{}".format(i))
          if i < len(self.fc_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries_sf.append(tf.contrib.layers.summarize_activation(out))
          self.summaries_aux.append(tf.contrib.layers.summarize_activation(out))
          self.summaries_option.append(tf.contrib.layers.summarize_activation(out))
        self.fi = out

      with tf.variable_scope("eigen_option_term"):
        # Termination probabilities per option, on stop-gradient features.
        out = tf.stop_gradient(tf.nn.relu(self.fi))
        self.termination = layers.fully_connected(
            out, num_outputs=self.nb_options, activation_fn=tf.nn.sigmoid,
            variables_collections=tf.get_collection("variables"),
            outputs_collections="activations", scope="fc_option_term")
        self.summaries_option.append(
            tf.contrib.layers.summarize_activation(self.termination))

      with tf.variable_scope("option_q_val"):
        out = tf.stop_gradient(tf.nn.relu(self.fi))
        self.q_val = layers.fully_connected(
            out,
            num_outputs=(self.nb_options + self.action_size)
            if self.config.include_primitive_options else self.nb_options,
            activation_fn=None,
            variables_collections=tf.get_collection("variables"),
            outputs_collections="activations", scope="fc_q_val")
        self.summaries_option.append(tf.contrib.layers.summarize_activation(self.q_val))
        self.max_q_val = tf.reduce_max(self.q_val, 1)
        self.max_options = tf.cast(tf.argmax(self.q_val, 1), dtype=tf.int32)
        # Epsilon-greedy over options: pick the greedy option unless a uniform
        # draw falls below final_random_option_prob.
        self.exp_options = tf.random_uniform(
            shape=[tf.shape(self.q_val)[0]], minval=0,
            maxval=(self.nb_options + self.action_size)
            if self.config.include_primitive_options else self.nb_options,
            dtype=tf.int32)
        self.local_random = tf.random_uniform(shape=[tf.shape(self.q_val)[0]],
                                              minval=0., maxval=1., dtype=tf.float32,
                                              name="rand_options")
        self.condition = self.local_random > self.config.final_random_option_prob
        self.current_option = tf.where(self.condition, self.max_options, self.exp_options)
        # Option indices >= nb_options stand for primitive actions.
        self.primitive_action = tf.where(self.current_option >= self.nb_options,
                                         tf.ones_like(self.current_option),
                                         tf.zeros_like(self.current_option))
        self.summaries_option.append(
            tf.contrib.layers.summarize_activation(self.current_option))
        # State value under the epsilon-greedy option policy.
        self.v = (tf.reduce_max(self.q_val, axis=1) *
                  (1 - self.config.final_random_option_prob) +
                  self.config.final_random_option_prob *
                  tf.reduce_mean(self.q_val, axis=1))
        self.summaries_option.append(tf.contrib.layers.summarize_activation(self.v))

      if self.config.eigen:
        with tf.variable_scope("eigen_option_q_val"):
          out = tf.stop_gradient(tf.nn.relu(self.fi))
          self.eigen_q_val = layers.fully_connected(
              out, num_outputs=self.nb_options, activation_fn=None,
              variables_collections=tf.get_collection("variables"),
              outputs_collections="activations", scope="fc_q_val")
          self.summaries_option.append(
              tf.contrib.layers.summarize_activation(self.eigen_q_val))
        if self.config.include_primitive_options:
          concatenated_eigen_q = tf.concat(
              [self.q_val[:, self.config.nb_options:], self.eigen_q_val], 1)
        else:
          concatenated_eigen_q = self.eigen_q_val
        self.eigenv = (tf.reduce_max(concatenated_eigen_q, axis=1) *
                       (1 - self.config.final_random_option_prob) +
                       self.config.final_random_option_prob *
                       tf.reduce_mean(concatenated_eigen_q, axis=1))
        self.summaries_option.append(tf.contrib.layers.summarize_activation(self.eigenv))

      with tf.variable_scope("eigen_option_i_o_policies"):
        out = tf.stop_gradient(tf.nn.relu(self.fi))
        self.options = []
        for i in range(self.nb_options):
          option = layers.fully_connected(
              out, num_outputs=self.action_size, activation_fn=tf.nn.softmax,
              biases_initializer=None,
              variables_collections=tf.get_collection("variables"),
              outputs_collections="activations", scope="policy_{}".format(i))
          self.summaries_option.append(tf.contrib.layers.summarize_activation(option))
          self.options.append(option)
        self.options = tf.stack(self.options, 1)

      with tf.variable_scope("sf"):
        out = tf.stop_gradient(tf.nn.relu(self.fi))
        for i, nb_filt in enumerate(self.sf_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       biases_initializer=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="sf_{}".format(i))
          if i < len(self.sf_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries_sf.append(tf.contrib.layers.summarize_activation(out))
        self.sf = out

      with tf.variable_scope("aux_action_fc"):
        self.actions_placeholder = tf.placeholder(shape=[None], dtype=tf.float32,
                                                  name="Actions")
        # Fixed scope name; the original reused a stale loop index here.
        actions = layers.fully_connected(self.actions_placeholder[..., None],
                                         num_outputs=self.fc_layers[-1],
                                         activation_fn=None,
                                         variables_collections=tf.get_collection("variables"),
                                         outputs_collections="activations",
                                         scope="fc_action")

      with tf.variable_scope("aux_next_frame"):
        out = tf.add(self.fi, actions)
        # out = tf.nn.relu(out)
        for i, nb_filt in enumerate(self.aux_fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="fc_{}".format(i))
          if i < len(self.aux_fc_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries_aux.append(tf.contrib.layers.summarize_activation(out))
        self.next_obs = tf.reshape(
            out, (-1, config.input_size[0], config.input_size[1], config.history_size))
      # self.image_summaries.append(tf.summary.image('next_obs', self.next_obs, max_outputs=30))

      if scope != 'global':
        self.target_sf = tf.placeholder(shape=[None, self.sf_layers[-1]],
                                        dtype=tf.float32, name="target_SF")
        self.target_next_obs = tf.placeholder(
            shape=[None, config.input_size[0], config.input_size[1], config.history_size],
            dtype=tf.float32, name="target_next_obs")
        self.options_placeholder = tf.placeholder(shape=[None], dtype=tf.int32,
                                                  name="options")
        self.target_eigen_return = tf.placeholder(shape=[None], dtype=tf.float32)
        self.target_return = tf.placeholder(shape=[None], dtype=tf.float32)

        self.policies = self.get_intra_option_policies(self.options_placeholder)
        self.responsible_actions = self.get_responsible_actions(self.policies,
                                                                self.actions_placeholder)
        if self.config.eigen:
          eigen_q_val = self.get_eigen_q(self.options_placeholder)
        q_val = self.get_q(self.options_placeholder)
        o_term = self.get_o_term(self.options_placeholder)

        self.image_summaries.append(
            tf.summary.image('next',
                             tf.concat([self.next_obs, self.target_next_obs], 2),
                             max_outputs=30))

        # self.matrix_sf = tf.placeholder(shape=[self.config.sf_matrix_size, self.sf_layers[-1]],
        #                                 dtype=tf.float32, name="matrix_sf")
        if self.config.sf_matrix_size is None:
          self.config.sf_matrix_size = self.nb_states
        self.matrix_sf = tf.placeholder(
            shape=[self.config.sf_matrix_size, self.sf_layers[-1]],
            dtype=tf.float32, name="matrix_sf")
        self.eigenvalues, _, self.eigenvectors = tf.svd(self.matrix_sf)

        with tf.name_scope('sf_loss'):
          sf_td_error = self.target_sf - self.sf
          self.sf_loss = tf.reduce_mean(huber_loss(sf_td_error))

        with tf.name_scope('aux_loss'):
          aux_error = self.next_obs - self.target_next_obs
          self.aux_loss = tf.reduce_mean(self.config.aux_coef * huber_loss(aux_error))

        if self.config.eigen:
          with tf.name_scope('eigen_critic_loss'):
            eigen_td_error = self.target_eigen_return - eigen_q_val
            self.eigen_critic_loss = tf.reduce_mean(
                0.5 * self.config.eigen_critic_coef * tf.square(eigen_td_error))

        with tf.name_scope('critic_loss'):
          td_error = self.target_return - q_val
          self.critic_loss = tf.reduce_mean(
              0.5 * self.config.critic_coef * tf.square(td_error))

        with tf.name_scope('termination_loss'):
          # Advantage-based termination gradient with a small margin.
          self.term_loss = tf.reduce_mean(
              o_term * (tf.stop_gradient(q_val) - tf.stop_gradient(self.v) + 0.01))

        with tf.name_scope('entropy_loss'):
          self.entropy_loss = -self.entropy_coef * tf.reduce_mean(
              tf.reduce_sum(self.policies * tf.log(self.policies + 1e-7), axis=1))

        with tf.name_scope('policy_loss'):
          self.policy_loss = -tf.reduce_mean(
              tf.log(self.responsible_actions + 1e-7) *
              tf.stop_gradient(eigen_td_error if self.config.eigen else td_error))

        self.option_loss = (self.policy_loss - self.entropy_loss +
                            self.critic_loss + self.term_loss)
        if self.config.eigen:
          self.option_loss += self.eigen_critic_loss

        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        gradients_sf = tf.gradients(self.sf_loss, local_vars)
        gradients_aux = tf.gradients(self.aux_loss, local_vars)
        gradients_option = tf.gradients(self.option_loss, local_vars)
        gradients_primitive_option = tf.gradients(self.critic_loss, local_vars)

        self.var_norms = tf.global_norm(local_vars)
        grads_sf, self.grad_norms_sf = tf.clip_by_global_norm(
            gradients_sf, self.config.gradient_clip_norm_value)
        grads_aux, self.grad_norms_aux = tf.clip_by_global_norm(
            gradients_aux, self.config.gradient_clip_norm_value)
        grads_option, self.grad_norms_option = tf.clip_by_global_norm(
            gradients_option, self.config.gradient_clip_norm_value)
        grads_primitive_option, self.grad_norms_primitive_option = tf.clip_by_global_norm(
            gradients_primitive_option, self.config.gradient_clip_norm_value)

        self.merged_summary_sf = tf.summary.merge(
            self.summaries_sf +
            [tf.summary.scalar('avg_sf_loss', self.sf_loss),
             tf.summary.scalar('gradient_norm_sf', tf.global_norm(gradients_sf)),
             tf.summary.scalar('clipped_gradient_norm_sf', tf.global_norm(grads_sf)),
             gradient_summaries(zip(grads_sf, local_vars))])
        self.merged_summary_aux = tf.summary.merge(
            self.image_summaries + self.summaries_aux +
            [tf.summary.scalar('aux_loss', self.aux_loss),
             tf.summary.scalar('gradient_norm_aux', tf.global_norm(gradients_aux)),
             tf.summary.scalar('clipped_gradient_norm_aux', tf.global_norm(grads_aux)),
             gradient_summaries(zip(grads_aux, local_vars))])

        options_to_merge = self.summaries_option + [
            tf.summary.scalar('avg_critic_loss', self.critic_loss),
            tf.summary.scalar('avg_termination_loss', self.term_loss),
            tf.summary.scalar('avg_entropy_loss', self.entropy_loss),
            tf.summary.scalar('avg_policy_loss', self.policy_loss),
            tf.summary.scalar('gradient_norm_option', tf.global_norm(gradients_option)),
            tf.summary.scalar('clipped_gradient_norm_option', tf.global_norm(grads_option)),
            gradient_summaries(zip(grads_option, local_vars))]
        if self.config.eigen:
          options_to_merge += [
              tf.summary.scalar('avg_eigen_critic_loss', self.eigen_critic_loss)]
        self.merged_summary_option = tf.summary.merge(options_to_merge)

        global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads_sf = self.network_optimizer.apply_gradients(
            zip(grads_sf, global_vars))
        self.apply_grads_aux = self.network_optimizer.apply_gradients(
            zip(grads_aux, global_vars))
        self.apply_grads_option = self.network_optimizer.apply_gradients(
            zip(grads_option, global_vars))
        self.apply_grads_primitive_option = self.network_optimizer.apply_gradients(
            zip(grads_primitive_option, global_vars))
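# How the SVD pieces above are typically tied together (a hedged sketch; the
# actual update logic lives outside this section): the global network's
# sf_matrix_buffer is fed through `matrix_sf`, and the leading right-singular
# vectors of the decomposition become the option directions.
def recompute_directions(sess, global_net, nb_options):
  # Evaluate the SVD node on the buffered successor features.
  v = sess.run(global_net.eigenvectors,
               feed_dict={global_net.matrix_sf: global_net.sf_matrix_buffer})
  # Columns of v are right-singular vectors; keep the first nb_options,
  # one direction per option.
  global_net.directions = v.T[:nb_options]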
# Variant 6: fully-connected SF + auxiliary network with separate SF and aux
# updates, each applied asynchronously to the shared 'global' weights.
def __init__(self, scope, config, action_size, nb_states):
    self._scope = scope
    # self.option = 0
    self.nb_states = nb_states
    # self.conv_layers = config.conv_layers
    self.fc_layers = config.fc_layers
    self.sf_layers = config.sf_layers
    self.aux_fc_layers = config.aux_fc_layers
    # self.aux_deconv_layers = config.aux_deconv_layers
    self.action_size = action_size
    self.nb_options = config.nb_options
    self.nb_envs = config.num_agents
    self.config = config
    self.network_optimizer = config.network_optimizer(
        self.config.lr, name='network_optimizer')

    with tf.variable_scope(scope):
      self.observation = tf.placeholder(
          shape=[None, config.input_size[0], config.input_size[1], config.history_size],
          dtype=tf.float32, name="Inputs")
      self.image_summaries = []
      self.image_summaries.append(
          tf.summary.image('input', self.observation, max_outputs=30))
      self.summaries_sf = []
      self.summaries_aux = []

      out = self.observation
      out = layers.flatten(out, scope="flatten")

      with tf.variable_scope("fc"):
        for i, nb_filt in enumerate(self.fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="fc_{}".format(i))
          if i < len(self.fc_layers) - 1:
            # out = layer_norm_fn(out, relu=True)
            out = tf.nn.relu(out)
          self.summaries_sf.append(tf.contrib.layers.summarize_activation(out))
          self.summaries_aux.append(tf.contrib.layers.summarize_activation(out))
        self.fi = out

      with tf.variable_scope("sf"):
        # Normalized, stop-gradient features feed the SF head.
        out = layer_norm_fn(self.fi, relu=True)
        out = tf.stop_gradient(out)
        for i, nb_filt in enumerate(self.sf_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="sf_{}".format(i))
          if i < len(self.sf_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries_sf.append(tf.contrib.layers.summarize_activation(out))
        self.sf = out

      with tf.variable_scope("action_fc"):
        self.actions_placeholder = tf.placeholder(shape=[None], dtype=tf.float32,
                                                  name="Actions")
        # Fixed scope name; the original reused a stale loop index here.
        actions = layers.fully_connected(self.actions_placeholder[..., None],
                                         num_outputs=self.fc_layers[-1],
                                         activation_fn=None,
                                         variables_collections=tf.get_collection("variables"),
                                         outputs_collections="activations",
                                         scope="action_fc")

      with tf.variable_scope("aux_fc"):
        out = tf.add(self.fi, actions)
        # out = tf.nn.relu(out)
        for i, nb_filt in enumerate(self.aux_fc_layers):
          out = layers.fully_connected(out, num_outputs=nb_filt, activation_fn=None,
                                       variables_collections=tf.get_collection("variables"),
                                       outputs_collections="activations",
                                       scope="aux_fc_{}".format(i))
          if i < len(self.aux_fc_layers) - 1:
            out = tf.nn.relu(out)
          self.summaries_aux.append(tf.contrib.layers.summarize_activation(out))
        self.next_obs = tf.reshape(
            out, (-1, config.input_size[0], config.input_size[1], config.history_size))
      self.image_summaries.append(
          tf.summary.image('next_obs', self.next_obs, max_outputs=30))

      if scope != 'global':
        self.target_sf = tf.placeholder(shape=[None, self.sf_layers[-1]],
                                        dtype=tf.float32, name="target_SF")
        self.target_next_obs = tf.placeholder(
            shape=[None, config.input_size[0], config.input_size[1], config.history_size],
            dtype=tf.float32, name="target_next_obs")
        self.image_summaries.append(
            tf.summary.image('target_next_obs', self.target_next_obs, max_outputs=30))
        self.matrix_sf = tf.placeholder(shape=[self.nb_states, self.sf_layers[-1]],
                                        dtype=tf.float32, name="matrix_sf")
        self.s, self.u, self.v = tf.svd(self.matrix_sf)

        with tf.name_scope('sf_loss'):
          sf_td_error = self.target_sf - self.sf
          self.sf_loss = tf.reduce_mean(huber_loss(sf_td_error))

        with tf.name_scope('aux_loss'):
          aux_error = self.next_obs - self.target_next_obs
          self.aux_loss = tf.reduce_mean(self.config.aux_coef * huber_loss(aux_error))

        # regularizer_features = tf.reduce_mean(self.config.feat_decay * tf.nn.l2_loss(self.fi))

        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        gradients_sf = tf.gradients(self.sf_loss, local_vars)
        gradients_aux = tf.gradients(self.aux_loss, local_vars)
        self.var_norms = tf.global_norm(local_vars)
        grads_sf, self.grad_norms_sf = tf.clip_by_global_norm(
            gradients_sf, self.config.gradient_clip_value)
        grads_aux, self.grad_norms_aux = tf.clip_by_global_norm(
            gradients_aux, self.config.gradient_clip_value)

        self.merged_summary_sf = tf.summary.merge(
            self.summaries_sf +
            [tf.summary.scalar('avg_sf_loss', self.sf_loss),
             tf.summary.scalar('gradient_norm_sf', tf.global_norm(gradients_sf)),
             tf.summary.scalar('clipped_gradient_norm_sf', tf.global_norm(grads_sf)),
             gradient_summaries(zip(grads_sf, local_vars))])
        # Aux summaries get their own tags (the original reused the '_sf' names).
        self.merged_summary_aux = tf.summary.merge(
            self.image_summaries + self.summaries_aux +
            [tf.summary.scalar('aux_loss', self.aux_loss),
             tf.summary.scalar('gradient_norm_aux', tf.global_norm(gradients_aux)),
             tf.summary.scalar('clipped_gradient_norm_aux', tf.global_norm(grads_aux)),
             gradient_summaries(zip(grads_aux, local_vars))])

        global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
        self.apply_grads_sf = self.network_optimizer.apply_gradients(
            zip(grads_sf, global_vars))
        self.apply_grads_aux = self.network_optimizer.apply_gradients(
            zip(grads_aux, global_vars))
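# A hedged end-to-end sketch of one asynchronous SF update with the network
# above. `sess` and `worker_net` are assumed to exist, and `observations` /
# `bootstrapped_sf` are hypothetical arrays from the worker's rollout (the
# bootstrapped SF targets would come from the discounted-feature return).
def run_sf_update(sess, worker_net, observations, bootstrapped_sf):
  feed_dict = {worker_net.observation: observations,   # [batch, h, w, history]
               worker_net.target_sf: bootstrapped_sf}  # [batch, sf_dim]
  # apply_grads_sf pushes the clipped worker gradients onto the global copy.
  _, sf_loss, summary = sess.run(
      [worker_net.apply_grads_sf, worker_net.sf_loss, worker_net.merged_summary_sf],
      feed_dict=feed_dict)
  return sf_loss, summary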