def create_cc_critic(self, hidden_value, scope, create_qs=True):
    """
    Creates just the critic network
    """
    scope = self.join_scopes(scope, "critic")
    self.create_sac_value_head(
        self.stream_names,
        hidden_value,
        self.num_layers,
        self.h_size,
        self.join_scopes(scope, "value"),
    )
    self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
    if create_qs:
        hidden_q = tf.concat([hidden_value, self.external_action_in], axis=-1)
        hidden_qp = tf.concat([hidden_value, self.output], axis=-1)
        self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
            self.stream_names,
            hidden_q,
            self.num_layers,
            self.h_size,
            self.join_scopes(scope, "q"),
        )
        self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
            self.stream_names,
            hidden_qp,
            self.num_layers,
            self.h_size,
            self.join_scopes(scope, "q"),
            reuse=True,
        )
        self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
    self.critic_vars = self.get_vars(scope)
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
    """
    Creates Discrete control actor.
    :param encoded: Output of the observation encoder (input to the policy).
    """
    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            hidden_policy,
            self.memory_in,
            self.sequence_length_ph,
            name="lstm_policy",
        )
        self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
    else:
        hidden_policy = encoded

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )

    with tf.variable_scope("policy"):
        distribution = MultiCategoricalDistribution(
            hidden_policy, self.act_size, self.action_masks
        )
    # It's important that we are able to feed_dict a value into this tensor to get the
    # right one-hot encoding, so we can't do identity on it.
    self.output = distribution.sample
    self.all_log_probs = tf.identity(distribution.log_probs, name="action")
    self.selected_actions = tf.stop_gradient(
        distribution.sample_onehot
    )  # In discrete, these are onehot
    self.entropy = distribution.entropy
    self.total_log_probs = distribution.total_log_probs
def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
    """
    Creates a masking layer for the discrete actions
    :param all_logits: The concatenated unnormalized action probabilities for all branches
    :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_actions]
    :param action_size: A list containing the number of possible actions for each branch
    :return: The action output of dimension [batch_size, num_branches], the concatenated
        normalized probs (after softmax), and the concatenated normalized log probs
    """
    action_idx = [0] + list(np.cumsum(action_size))
    branches_logits = [
        all_logits[:, action_idx[i] : action_idx[i + 1]]
        for i in range(len(action_size))
    ]
    branch_masks = [
        action_masks[:, action_idx[i] : action_idx[i + 1]]
        for i in range(len(action_size))
    ]
    raw_probs = [
        tf.multiply(tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
        for k in range(len(action_size))
    ]
    normalized_probs = [
        tf.divide(raw_probs[k], tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
        for k in range(len(action_size))
    ]
    output = tf.concat(
        [
            tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
            for k in range(len(action_size))
        ],
        axis=1,
    )
    return (
        output,
        tf.concat([normalized_probs[k] for k in range(len(action_size))], axis=1),
        tf.concat(
            [
                tf.log(normalized_probs[k] + EPSILON)
                for k in range(len(action_size))
            ],
            axis=1,
        ),
    )
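# The following is a minimal NumPy sketch (not part of the model code) of what
# the masking layer above computes for a single branch: softmax the logits,
# zero out disallowed actions, and renormalize so the remaining probabilities
# sum to one. The logits, mask, and epsilon values below are illustrative.
def _demo_masked_softmax():
    import numpy as np

    epsilon = 1e-7
    logits = np.array([[2.0, 1.0, 0.5]])  # one branch with three actions
    mask = np.array([[1.0, 0.0, 1.0]])  # action 1 is disallowed

    probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    raw_probs = (probs + epsilon) * mask  # zero out masked actions
    normalized = raw_probs / raw_probs.sum(axis=1, keepdims=True)
    return normalized  # masked action has ~0 probability; the rest re-sum to 1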
def create_forward_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Creates forward model TensorFlow ops for Curiosity module.
    Predicts encoded future state based on encoded current state and given action.
    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    combined_input = tf.concat([encoded_state, self.policy.selected_actions], axis=1)
    hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
    pred_next_state = tf.layers.dense(
        hidden,
        self.encoding_size
        * (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
        activation=None,
    )
    squared_difference = 0.5 * tf.reduce_sum(
        tf.squared_difference(pred_next_state, encoded_next_state), axis=1
    )
    self.intrinsic_reward = squared_difference
    self.forward_loss = tf.reduce_mean(
        tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
    )
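# A minimal NumPy sketch (illustrative, not part of the module) of the masked
# mean computed above: tf.dynamic_partition(x, mask, 2)[1] keeps only the
# entries where mask == 1, so the reduce_mean averages the loss over valid
# (non-padded) steps. The loss values and mask below are made up.
def _demo_masked_mean():
    import numpy as np

    per_step_loss = np.array([0.5, 1.5, 2.0, 4.0])
    mask = np.array([1, 0, 1, 1])  # 0 marks padded steps in a sequence

    active_losses = per_step_loss[mask == 1]  # the "partition 1" entries
    return active_losses.mean()  # mean over valid steps only, here ~2.17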
def create_recurrent_encoder(input_state, memory_in, sequence_length, name="lstm"):
    """
    Builds a recurrent encoder for either state or observations (LSTM).
    :param sequence_length: Length of sequence to unroll.
    :param input_state: The input tensor to the LSTM cell.
    :param memory_in: The input memory to the LSTM cell.
    :param name: The scope of the LSTM cell.
    """
    s_size = input_state.get_shape().as_list()[1]
    m_size = memory_in.get_shape().as_list()[1]
    lstm_input_state = tf.reshape(input_state, shape=[-1, sequence_length, s_size])
    memory_in = tf.reshape(memory_in[:, :], [-1, m_size])
    half_point = int(m_size / 2)
    with tf.variable_scope(name):
        rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point)
        lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple(
            memory_in[:, :half_point], memory_in[:, half_point:]
        )
        recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(
            rnn_cell, lstm_input_state, initial_state=lstm_vector_in
        )
    recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
    return recurrent_output, tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1)
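# A minimal NumPy sketch of the memory layout this encoder assumes: the flat
# memory vector is the concatenation [c, h] of the LSTM cell and hidden
# states, so the first half seeds LSTMStateTuple.c, the second half seeds
# LSTMStateTuple.h, and the output concat round-trips to the same layout.
# The size below is illustrative.
def _demo_memory_layout():
    import numpy as np

    m_size = 8
    memory_in = np.arange(m_size, dtype=np.float32)[None, :]  # batch of 1
    half_point = m_size // 2

    c, h = memory_in[:, :half_point], memory_in[:, half_point:]
    memory_out = np.concatenate([c, h], axis=1)
    assert (memory_out == memory_in).all()  # same [c, h] layout in and out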
def create_encoder(
    self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """
    Creates the encoder for the discriminator
    :param state_in: The encoded observation input
    :param action_in: The action input
    :param done_in: The done flags input
    :param reuse: If true, the weights will be shared with the previous encoder created
    """
    with tf.variable_scope("GAIL_model"):
        if self.use_actions:
            concat_input = tf.concat([state_in, action_in, done_in], axis=1)
        else:
            concat_input = state_in

        hidden_1 = tf.layers.dense(
            concat_input,
            self.h_size,
            activation=ModelUtils.swish,
            name="gail_d_hidden_1",
            reuse=reuse,
        )
        hidden_2 = tf.layers.dense(
            hidden_1,
            self.h_size,
            activation=ModelUtils.swish,
            name="gail_d_hidden_2",
            reuse=reuse,
        )
        z_mean = None
        if self.use_vail:
            # Latent representation
            z_mean = tf.layers.dense(
                hidden_2,
                self.z_size,
                reuse=reuse,
                name="gail_z_mean",
                kernel_initializer=ModelUtils.scaled_init(0.01),
            )
            self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)
            # Sampled latent code
            self.z = z_mean + self.z_sigma * self.noise * self.use_noise
            estimate_input = self.z
        else:
            estimate_input = hidden_2
        estimate = tf.layers.dense(
            estimate_input,
            1,
            activation=tf.nn.sigmoid,
            name="gail_d_estimate",
            reuse=reuse,
        )
        return estimate, z_mean, concat_input
def create_discrete_action_masking_layer(
    branches_logits: List[tf.Tensor],
    action_masks: tf.Tensor,
    action_size: List[int],
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """
    Creates a masking layer for the discrete actions
    :param branches_logits: A List of the unnormalized action probabilities for each branch
    :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_actions]
    :param action_size: A list containing the number of possible actions for each branch
    :return: The action output of dimension [batch_size, num_branches], the concatenated
        normalized probs (after softmax), and the concatenated normalized log probs
    """
    branch_masks = ModelUtils.break_into_branches(action_masks, action_size)
    raw_probs = [
        tf.multiply(tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
        for k in range(len(action_size))
    ]
    normalized_probs = [
        tf.divide(raw_probs[k], tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
        for k in range(len(action_size))
    ]
    output = tf.concat(
        [
            tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
            for k in range(len(action_size))
        ],
        axis=1,
    )
    return (
        output,
        tf.concat([normalized_probs[k] for k in range(len(action_size))], axis=1),
        tf.concat(
            [
                tf.log(normalized_probs[k] + EPSILON)
                for k in range(len(action_size))
            ],
            axis=1,
        ),
    )
def _action_onehot(self, sample: tf.Tensor, act_size: List[int]) -> tf.Tensor:
    action_oh = tf.concat(
        [tf.one_hot(sample[:, i], act_size[i]) for i in range(len(act_size))],
        axis=1,
    )
    return action_oh
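# A minimal NumPy sketch of the branched one-hot encoding above: column i of
# the integer action tensor holds branch i's chosen action, and the per-branch
# one-hot vectors are concatenated along the last axis. The branch sizes and
# sampled actions below are illustrative.
def _demo_branched_onehot():
    import numpy as np

    act_size = [3, 2]  # two branches: 3 and 2 possible actions
    sample = np.array([[2, 0]])  # batch of 1: action 2 in branch 0, action 0 in branch 1

    onehots = [np.eye(size)[sample[:, i]] for i, size in enumerate(act_size)]
    return np.concatenate(onehots, axis=1)  # [[0. 0. 1. 1. 0.]]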
def create_inverse_model(
    self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
) -> None:
    """
    Creates inverse model TensorFlow ops for Curiosity module.
    Predicts action taken given current and future encoded states.
    :param encoded_state: Tensor corresponding to encoded current state.
    :param encoded_next_state: Tensor corresponding to encoded next state.
    """
    combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
    hidden = tf.layers.dense(combined_input, 256, activation=LearningModel.swish)
    if self.policy_model.brain.vector_action_space_type == "continuous":
        pred_action = tf.layers.dense(
            hidden, self.policy_model.act_size[0], activation=None
        )
        squared_difference = tf.reduce_sum(
            tf.squared_difference(pred_action, self.policy_model.selected_actions),
            axis=1,
        )
        self.inverse_loss = tf.reduce_mean(
            tf.dynamic_partition(squared_difference, self.policy_model.mask, 2)[1]
        )
    else:
        pred_action = tf.concat(
            [
                tf.layers.dense(
                    hidden, self.policy_model.act_size[i], activation=tf.nn.softmax
                )
                for i in range(len(self.policy_model.act_size))
            ],
            axis=1,
        )
        cross_entropy = tf.reduce_sum(
            -tf.log(pred_action + 1e-10) * self.policy_model.selected_actions,
            axis=1,
        )
        self.inverse_loss = tf.reduce_mean(
            tf.dynamic_partition(cross_entropy, self.policy_model.mask, 2)[1]
        )
def __init__(
    self,
    brain,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    seed=0,
    vis_encode_type=EncoderType.SIMPLE,
):
    super().__init__(
        brain,
        m_size,
        h_size,
        normalize,
        use_recurrent,
        num_layers,
        stream_names,
        seed,
        vis_encode_type,
    )
    self.share_ac_cnn = False
    if self.use_recurrent:
        self.create_memory_ins(self.m_size)

    hidden_policy, hidden_critic = self.create_observation_ins(
        vis_encode_type, self.share_ac_cnn
    )

    if brain.vector_action_space_type == "continuous":
        self.create_cc_actor(hidden_policy, POLICY_SCOPE)
        self.create_cc_critic(hidden_critic, POLICY_SCOPE)
    else:
        self.create_dc_actor(hidden_policy, POLICY_SCOPE)
        self.create_dc_critic(hidden_critic, POLICY_SCOPE)

    if self.share_ac_cnn:
        # Make sure that the policy also contains the CNN
        self.policy_vars += self.get_vars(
            self.join_scopes(POLICY_SCOPE, "critic/value/main_graph_0_encoder0")
        )
    if self.use_recurrent:
        mem_outs = [
            self.value_memory_out,
            self.q1_memory_out,
            self.q2_memory_out,
            self.policy_memory_out,
        ]
        self.memory_out = tf.concat(mem_outs, axis=1)
def _create_policy_branches(
    self, logits: tf.Tensor, act_size: List[int]
) -> tf.Tensor:
    policy_branches = []
    for size in act_size:
        policy_branches.append(
            tf.layers.dense(
                logits,
                size,
                activation=None,
                use_bias=False,
                kernel_initializer=ModelUtils.scaled_init(0.01),
            )
        )
    unmasked_log_probs = tf.concat(policy_branches, axis=1)
    return unmasked_log_probs
def __init__(
    self,
    brain,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    seed=0,
    vis_encode_type=EncoderType.SIMPLE,
):
    super().__init__(
        brain,
        m_size,
        h_size,
        normalize,
        use_recurrent,
        num_layers,
        stream_names,
        seed,
        vis_encode_type,
    )
    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        self.value_memory_in = self.memory_in
    with tf.variable_scope(TARGET_SCOPE):
        hidden_streams = self.create_observation_streams(
            1,
            self.h_size,
            0,
            vis_encode_type=vis_encode_type,
            stream_scopes=["critic/value/"],
        )
    if brain.vector_action_space_type == "continuous":
        self.create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
    else:
        self.create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
    if self.use_recurrent:
        self.memory_out = tf.concat(
            self.value_memory_out, axis=1
        )  # Needed for Barracuda to work
def __init__(
    self,
    policy,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    vis_encode_type=EncoderType.SIMPLE,
):
    super().__init__(
        policy,
        m_size,
        h_size,
        normalize,
        use_recurrent,
        num_layers,
        stream_names,
        vis_encode_type,
    )
    if self.policy.use_recurrent:
        self._create_memory_ins(m_size)

    hidden_critic = self._create_observation_in(vis_encode_type)
    # Use the sequence length of the policy
    self.sequence_length_ph = self.policy.sequence_length_ph

    if self.policy.use_continuous_act:
        self._create_cc_critic(hidden_critic, POLICY_SCOPE)
    else:
        self._create_dc_critic(hidden_critic, POLICY_SCOPE)

    if self.use_recurrent:
        mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
        self.memory_out = tf.concat(mem_outs, axis=1)
def create_memory_ins(self, m_size):
    """
    Creates the memory input placeholders for LSTM.
    :param m_size: the total size of the memory.
    """
    # Create the Policy input separate from the rest.
    # This is so in inference we only have to run the Policy network.
    # Barracuda will grab the recurrent_in and recurrent_out named tensors.
    self.inference_memory_in = tf.placeholder(
        shape=[None, m_size // 4], dtype=tf.float32, name="recurrent_in"
    )  # We assume m_size is divisible by 4
    # Create the non-Policy inputs.
    # Use a default placeholder here so nothing has to be provided during
    # Barracuda inference. Note that the default value is just the tiled input
    # for the policy, which is thrown away.
    three_fourths_m_size = m_size * 3 // 4
    self.other_memory_in = tf.placeholder_with_default(
        input=tf.tile(self.inference_memory_in, [1, 3]),
        shape=[None, three_fourths_m_size],
        name="other_recurrent_in",
    )
    # Concat and use this as the "placeholder" for training
    self.memory_in = tf.concat(
        [self.other_memory_in, self.inference_memory_in], axis=1
    )
    # Re-break-up for each network
    num_mems = 4
    mem_ins = []
    for i in range(num_mems):
        _start = m_size // num_mems * i
        _end = m_size // num_mems * (i + 1)
        mem_ins.append(self.memory_in[:, _start:_end])
    self.value_memory_in = mem_ins[0]
    self.q1_memory_in = mem_ins[1]
    self.q2_memory_in = mem_ins[2]
    self.policy_memory_in = mem_ins[3]
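# A minimal NumPy sketch of the partitioning above: concatenating the 3/4-size
# "other" memory with the 1/4-size policy memory and re-slicing into quarters
# yields the value/q1/q2/policy segments, with the policy quarter last so that
# it lines up with inference_memory_in. The m_size below is illustrative.
def _demo_memory_partition():
    import numpy as np

    m_size = 16
    inference_memory_in = np.full((1, m_size // 4), 3.0)  # policy quarter
    other_memory_in = np.tile(inference_memory_in, (1, 3))  # default: tiled policy input

    memory_in = np.concatenate([other_memory_in, inference_memory_in], axis=1)
    value_m, q1_m, q2_m, policy_m = np.split(memory_in, 4, axis=1)
    assert (policy_m == inference_memory_in).all()  # policy slice is the last quarter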
def make_inputs(self) -> None:
    """
    Creates the input layers for the discriminator
    """
    self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)

    if self.policy.behavior_spec.action_spec.is_continuous():
        action_length = self.policy.act_size[0]
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.float32
        )
        self.expert_action = tf.identity(self.action_in_expert)
    else:
        action_length = len(self.policy.act_size)
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.int32
        )
        self.expert_action = tf.concat(
            [
                tf.one_hot(self.action_in_expert[:, i], act_size)
                for i, act_size in enumerate(self.policy.act_size)
            ],
            axis=1,
        )
def create_dc_actor(self, hidden_policy, scope):
    """
    Creates Discrete control actor for SAC.
    :param hidden_policy: Output of feature extractor (i.e. the input for
        vector obs, output of CNN for visual obs).
    :param scope: TF scope to assign whatever is created in this block.
    """
    scope = self.join_scopes(scope, "policy")

    # Create inputs outside of the scope
    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )

    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )

    with tf.variable_scope(scope):
        hidden_policy = self.create_vector_observation_encoder(
            hidden_policy,
            self.h_size,
            self.activ_fn,
            self.num_layers,
            "encoder",
            False,
        )
    if self.use_recurrent:
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden_policy = tf.concat([hidden_policy, prev_action_oh], axis=1)

        hidden_policy, memory_out = self.create_recurrent_encoder(
            hidden_policy,
            self.policy_memory_in,
            self.sequence_length,
            name="lstm_policy",
        )
        self.policy_memory_out = memory_out
    with tf.variable_scope(scope):
        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden_policy,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=tf.initializers.variance_scaling(0.01),
                )
            )
        all_logits = tf.concat(policy_branches, axis=1, name="action_probs")

        output, normalized_probs, normalized_logprobs = self.create_discrete_action_masking_layer(
            all_logits, self.action_masks, self.act_size
        )

        self.action_probs = normalized_probs

        # Really, this is entropy, but it has an analogous purpose to the
        # log probs in the continuous case.
        self.all_log_probs = self.action_probs * normalized_logprobs
        self.output = output

        # Create action input (discrete)
        self.action_holder = tf.placeholder(
            shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
        )
        self.output_oh = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )

        # For Curiosity and GAIL to retrieve selected actions. We don't
        # need the mask at this point because it's already stored in the buffer.
        self.selected_actions = tf.stop_gradient(self.output_oh)

        self.external_action_in = tf.concat(
            [
                tf.one_hot(self.action_holder[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )

        # This is total entropy over all branches
        self.entropy = -1 * tf.reduce_sum(self.all_log_probs, axis=1)

    # Extract the normalized logprobs for Barracuda
    self.normalized_logprobs = tf.identity(normalized_logprobs, name="action")

    # We kept the LSTMs at a different scope than the rest, so add them if they exist.
    self.policy_vars = self.get_vars(scope)
    if self.use_recurrent:
        self.policy_vars += self.get_vars("lstm")
def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Creates state encoders for current and future observations.
    Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
    See https://arxiv.org/abs/1705.05363 for more details.
    :return: current and future state encoder tensors.
    """
    encoded_state_list = []
    encoded_next_state_list = []

    # Create input ops for next (t+1) observations.
    self.next_vector_in, self.next_visual_in = ModelUtils.create_input_placeholders(
        self.policy.behavior_spec.observation_shapes, name_prefix="curiosity_next_"
    )

    if self.next_visual_in:
        visual_encoders = []
        next_visual_encoders = []
        for i, (vis_in, next_vis_in) in enumerate(
            zip(self.policy.visual_in, self.next_visual_in)
        ):
            # Create the encoder ops for current and next visual input.
            # Note that these encoders are siamese.
            encoded_visual = ModelUtils.create_visual_observation_encoder(
                vis_in,
                self.encoding_size,
                ModelUtils.swish,
                1,
                "curiosity_stream_{}_visual_obs_encoder".format(i),
                False,
            )
            encoded_next_visual = ModelUtils.create_visual_observation_encoder(
                next_vis_in,
                self.encoding_size,
                ModelUtils.swish,
                1,
                "curiosity_stream_{}_visual_obs_encoder".format(i),
                True,
            )
            visual_encoders.append(encoded_visual)
            next_visual_encoders.append(encoded_next_visual)

        hidden_visual = tf.concat(visual_encoders, axis=1)
        hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
        encoded_state_list.append(hidden_visual)
        encoded_next_state_list.append(hidden_next_visual)

    if self.policy.vec_obs_size > 0:
        encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
            self.policy.vector_in,
            self.encoding_size,
            ModelUtils.swish,
            2,
            "curiosity_vector_obs_encoder",
            False,
        )
        encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
            self.next_vector_in,
            self.encoding_size,
            ModelUtils.swish,
            2,
            "curiosity_vector_obs_encoder",
            True,
        )
        encoded_state_list.append(encoded_vector_obs)
        encoded_next_state_list.append(encoded_next_vector_obs)

    encoded_state = tf.concat(encoded_state_list, axis=1)
    encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
    return encoded_state, encoded_next_state
def make_inputs(self) -> None:
    """
    Creates the input layers for the discriminator
    """
    self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
    self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

    if self.policy.behavior_spec.is_action_continuous():
        action_length = self.policy.act_size[0]
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.float32
        )
        self.expert_action = tf.identity(self.action_in_expert)
    else:
        action_length = len(self.policy.act_size)
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.int32
        )
        self.expert_action = tf.concat(
            [
                tf.one_hot(self.action_in_expert[:, i], act_size)
                for i, act_size in enumerate(self.policy.act_size)
            ],
            axis=1,
        )

    encoded_policy_list = []
    encoded_expert_list = []

    (
        self.obs_in_expert,
        self.expert_visual_in,
    ) = ModelUtils.create_input_placeholders(
        self.policy.behavior_spec.observation_shapes, "gail_"
    )

    if self.policy.vec_obs_size > 0:
        if self.policy.normalize:
            encoded_expert_list.append(
                ModelUtils.normalize_vector_obs(
                    self.obs_in_expert,
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
            )
            encoded_policy_list.append(self.policy.processed_vector_in)
        else:
            encoded_expert_list.append(self.obs_in_expert)
            encoded_policy_list.append(self.policy.vector_in)

    if self.expert_visual_in:
        visual_policy_encoders = []
        visual_expert_encoders = []
        for i, (vis_in, exp_vis_in) in enumerate(
            zip(self.policy.visual_in, self.expert_visual_in)
        ):
            encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                vis_in,
                self.encoding_size,
                ModelUtils.swish,
                1,
                f"gail_stream_{i}_visual_obs_encoder",
                False,
            )
            encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                exp_vis_in,
                self.encoding_size,
                ModelUtils.swish,
                1,
                f"gail_stream_{i}_visual_obs_encoder",
                True,
            )
            visual_policy_encoders.append(encoded_policy_visual)
            visual_expert_encoders.append(encoded_expert_visual)
        hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
        hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
        encoded_policy_list.append(hidden_policy_visual)
        encoded_expert_list.append(hidden_expert_visual)

    self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
    self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
def __init__(
    self,
    policy,
    m_size=None,
    h_size=128,
    normalize=False,
    use_recurrent=False,
    num_layers=2,
    stream_names=None,
    vis_encode_type=EncoderType.SIMPLE,
):
    super().__init__(
        policy,
        m_size,
        h_size,
        normalize,
        use_recurrent,
        num_layers,
        stream_names,
        vis_encode_type,
    )
    with tf.variable_scope(TARGET_SCOPE):
        self.visual_in = ModelUtils.create_visual_input_placeholders(
            policy.brain.camera_resolutions
        )
        self.vector_in = ModelUtils.create_vector_input(policy.vec_obs_size)
        if self.policy.normalize:
            normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
            self.update_normalization_op = normalization_tensors.update_op
            self.normalization_steps = normalization_tensors.steps
            self.running_mean = normalization_tensors.running_mean
            self.running_variance = normalization_tensors.running_variance
            self.processed_vector_in = ModelUtils.normalize_vector_obs(
                self.vector_in,
                self.running_mean,
                self.running_variance,
                self.normalization_steps,
            )
        else:
            self.processed_vector_in = self.vector_in
            self.update_normalization_op = None

        if self.policy.use_recurrent:
            self.memory_in = tf.placeholder(
                shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
            )
            self.value_memory_in = self.memory_in
        hidden_streams = ModelUtils.create_observation_streams(
            self.visual_in,
            self.processed_vector_in,
            1,
            self.h_size,
            0,
            vis_encode_type=vis_encode_type,
            stream_scopes=["critic/value/"],
        )
    if self.policy.use_continuous_act:
        self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
    else:
        self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
    if self.use_recurrent:
        self.memory_out = tf.concat(
            self.value_memory_out, axis=1
        )  # Needed for Barracuda to work
def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Creates state encoders for current and future observations.
    Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
    See https://arxiv.org/abs/1705.05363 for more details.
    :return: current and future state encoder tensors.
    """
    encoded_state_list = []
    encoded_next_state_list = []

    if self.policy.vis_obs_size > 0:
        self.next_visual_in = []
        visual_encoders = []
        next_visual_encoders = []
        for i in range(self.policy.vis_obs_size):
            # Create input ops for next (t+1) visual observations.
            next_visual_input = ModelUtils.create_visual_input(
                self.policy.brain.camera_resolutions[i],
                name="curiosity_next_visual_observation_" + str(i),
            )
            self.next_visual_in.append(next_visual_input)

            # Create the encoder ops for current and next visual input.
            # Note that these encoders are siamese.
            encoded_visual = ModelUtils.create_visual_observation_encoder(
                self.policy.visual_in[i],
                self.encoding_size,
                ModelUtils.swish,
                1,
                "curiosity_stream_{}_visual_obs_encoder".format(i),
                False,
            )
            encoded_next_visual = ModelUtils.create_visual_observation_encoder(
                self.next_visual_in[i],
                self.encoding_size,
                ModelUtils.swish,
                1,
                "curiosity_stream_{}_visual_obs_encoder".format(i),
                True,
            )
            visual_encoders.append(encoded_visual)
            next_visual_encoders.append(encoded_next_visual)

        hidden_visual = tf.concat(visual_encoders, axis=1)
        hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
        encoded_state_list.append(hidden_visual)
        encoded_next_state_list.append(hidden_next_visual)

    if self.policy.vec_obs_size > 0:
        # Create the encoder ops for current and next vector input.
        # Note that these encoders are siamese.
        # Create input op for next (t+1) vector observation.
        self.next_vector_in = tf.placeholder(
            shape=[None, self.policy.vec_obs_size],
            dtype=tf.float32,
            name="curiosity_next_vector_observation",
        )
        encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
            self.policy.vector_in,
            self.encoding_size,
            ModelUtils.swish,
            2,
            "curiosity_vector_obs_encoder",
            False,
        )
        encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
            self.next_vector_in,
            self.encoding_size,
            ModelUtils.swish,
            2,
            "curiosity_vector_obs_encoder",
            True,
        )
        encoded_state_list.append(encoded_vector_obs)
        encoded_next_state_list.append(encoded_next_vector_obs)

    encoded_state = tf.concat(encoded_state_list, axis=1)
    encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
    return encoded_state, encoded_next_state
def create_observation_streams(
    visual_in: List[tf.Tensor],
    vector_in: tf.Tensor,
    num_streams: int,
    h_size: int,
    num_layers: int,
    vis_encode_type: EncoderType = EncoderType.SIMPLE,
    stream_scopes: List[str] = None,
) -> List[tf.Tensor]:
    """
    Creates encoding stream for observations.
    :param num_streams: Number of streams to create.
    :param h_size: Size of hidden linear layers in stream.
    :param num_layers: Number of hidden linear layers in stream.
    :param stream_scopes: List of strings (length == num_streams), which contains
        the scopes for each of the streams. None if all under the same TF scope.
    :return: List of encoded streams.
    """
    activation_fn = ModelUtils.swish
    vector_observation_input = vector_in

    final_hiddens = []
    for i in range(num_streams):
        # Pick the encoder function based on the EncoderType
        create_encoder_func = ModelUtils.get_encoder_for_type(vis_encode_type)

        visual_encoders = []
        hidden_state, hidden_visual = None, None
        _scope_add = stream_scopes[i] if stream_scopes else ""
        if len(visual_in) > 0:
            for j, vis_in in enumerate(visual_in):
                ModelUtils._check_resolution_for_encoder(vis_in, vis_encode_type)
                encoded_visual = create_encoder_func(
                    vis_in,
                    h_size,
                    activation_fn,
                    num_layers,
                    f"{_scope_add}main_graph_{i}_encoder{j}",  # scope
                    False,  # reuse
                )
                visual_encoders.append(encoded_visual)
            hidden_visual = tf.concat(visual_encoders, axis=1)
        if vector_in.get_shape()[-1] > 0:
            # Don't encode non-existent or 0-shape inputs
            hidden_state = ModelUtils.create_vector_observation_encoder(
                vector_observation_input,
                h_size,
                activation_fn,
                num_layers,
                scope=f"{_scope_add}main_graph_{i}",
                reuse=False,
            )
        if hidden_state is not None and hidden_visual is not None:
            final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
        elif hidden_state is None and hidden_visual is not None:
            final_hidden = hidden_visual
        elif hidden_state is not None and hidden_visual is None:
            final_hidden = hidden_state
        else:
            raise Exception(
                "No valid network configuration possible. "
                "There are no states or observations in this brain"
            )
        final_hiddens.append(final_hidden)
    return final_hiddens
def create_observation_streams(
    self,
    num_streams: int,
    h_size: int,
    num_layers: int,
    vis_encode_type: EncoderType = EncoderType.SIMPLE,
    stream_scopes: List[str] = None,
) -> List[tf.Tensor]:
    """
    Creates encoding stream for observations.
    :param num_streams: Number of streams to create.
    :param h_size: Size of hidden linear layers in stream.
    :param num_layers: Number of hidden linear layers in stream.
    :param stream_scopes: List of strings (length == num_streams), which contains
        the scopes for each of the streams. None if all under the same TF scope.
    :return: List of encoded streams.
    """
    brain = self.brain
    activation_fn = self.swish

    self.visual_in = []
    for i in range(brain.number_visual_observations):
        visual_input = self.create_visual_input(
            brain.camera_resolutions[i], name="visual_observation_" + str(i)
        )
        self.visual_in.append(visual_input)
    vector_observation_input = self.create_vector_input()

    # Pick the encoder function based on the EncoderType
    create_encoder_func = LearningModel.create_visual_observation_encoder
    if vis_encode_type == EncoderType.RESNET:
        create_encoder_func = LearningModel.create_resnet_visual_observation_encoder
    elif vis_encode_type == EncoderType.NATURE_CNN:
        create_encoder_func = (
            LearningModel.create_nature_cnn_visual_observation_encoder
        )

    final_hiddens = []
    for i in range(num_streams):
        visual_encoders = []
        hidden_state, hidden_visual = None, None
        _scope_add = stream_scopes[i] if stream_scopes else ""
        if self.vis_obs_size > 0:
            for j in range(brain.number_visual_observations):
                encoded_visual = create_encoder_func(
                    self.visual_in[j],
                    h_size,
                    activation_fn,
                    num_layers,
                    scope=f"{_scope_add}main_graph_{i}_encoder{j}",
                    reuse=False,
                )
                visual_encoders.append(encoded_visual)
            hidden_visual = tf.concat(visual_encoders, axis=1)
        if brain.vector_observation_space_size > 0:
            hidden_state = self.create_vector_observation_encoder(
                vector_observation_input,
                h_size,
                activation_fn,
                num_layers,
                scope=f"{_scope_add}main_graph_{i}",
                reuse=False,
            )
        if hidden_state is not None and hidden_visual is not None:
            final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
        elif hidden_state is None and hidden_visual is not None:
            final_hidden = hidden_visual
        elif hidden_state is not None and hidden_visual is None:
            final_hidden = hidden_state
        else:
            raise Exception(
                "No valid network configuration possible. "
                "There are no states or observations in this brain"
            )
        final_hiddens.append(final_hidden)
    return final_hiddens
def create_cc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Continuous control actor-critic model.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: Type of visual encoder to use if visual input.
    """
    hidden_streams = self.create_observation_streams(
        2, h_size, num_layers, vis_encode_type
    )

    if self.use_recurrent:
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        _half_point = int(self.m_size / 2)
        hidden_policy, memory_policy_out = self.create_recurrent_encoder(
            hidden_streams[0],
            self.memory_in[:, :_half_point],
            self.sequence_length,
            name="lstm_policy",
        )
        hidden_value, memory_value_out = self.create_recurrent_encoder(
            hidden_streams[1],
            self.memory_in[:, _half_point:],
            self.sequence_length,
            name="lstm_value",
        )
        self.memory_out = tf.concat(
            [memory_policy_out, memory_value_out], axis=1, name="recurrent_out"
        )
    else:
        hidden_policy = hidden_streams[0]
        hidden_value = hidden_streams[1]

    mu = tf.layers.dense(
        hidden_policy,
        self.act_size[0],
        activation=None,
        kernel_initializer=LearningModel.scaled_init(0.01),
        reuse=tf.AUTO_REUSE,
    )

    self.log_sigma_sq = tf.get_variable(
        "log_sigma_squared",
        [self.act_size[0]],
        dtype=tf.float32,
        initializer=tf.zeros_initializer(),
    )
    sigma_sq = tf.exp(self.log_sigma_sq)

    self.epsilon = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
    )

    # Clip and scale output to ensure actions are always within [-1, 1] range.
    self.output_pre = mu + tf.sqrt(sigma_sq) * self.epsilon
    output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
    self.output = tf.identity(output_post, name="action")
    self.selected_actions = tf.stop_gradient(output_post)

    # Compute probability of model output.
    all_probs = (
        -0.5 * tf.square(tf.stop_gradient(self.output_pre) - mu) / sigma_sq
        - 0.5 * tf.log(2.0 * np.pi)
        - 0.5 * self.log_sigma_sq
    )

    self.all_log_probs = tf.identity(all_probs, name="action_probs")

    self.entropy = 0.5 * tf.reduce_mean(tf.log(2 * np.pi * np.e) + self.log_sigma_sq)

    self.create_value_heads(self.stream_names, hidden_value)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, self.act_size[0]], dtype=tf.float32, name="old_probabilities"
    )

    # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
    self.log_probs = tf.reduce_sum(
        (tf.identity(self.all_log_probs)), axis=1, keepdims=True
    )
    self.old_log_probs = tf.reduce_sum(
        (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
    )
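# A minimal NumPy/SciPy sketch (not part of the model) checking the Gaussian
# log-density used above against a reference implementation:
# log N(x | mu, sigma^2) = -(x - mu)^2 / (2 sigma^2) - 0.5 log(2 pi) - 0.5 log sigma^2.
# The sample, mean, and log-variance values below are illustrative.
def _demo_gaussian_log_prob():
    import numpy as np
    from scipy.stats import norm

    x, mu, log_sigma_sq = 0.3, 0.1, -1.0
    sigma_sq = np.exp(log_sigma_sq)

    manual = (
        -0.5 * (x - mu) ** 2 / sigma_sq
        - 0.5 * np.log(2.0 * np.pi)
        - 0.5 * log_sigma_sq
    )
    reference = norm.logpdf(x, loc=mu, scale=np.sqrt(sigma_sq))
    assert np.isclose(manual, reference)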
def make_inputs(self) -> None:
    """
    Creates the input layers for the discriminator
    """
    self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
    self.done_policy = tf.expand_dims(self.done_policy_holder, -1)

    if self.policy.brain.vector_action_space_type == "continuous":
        action_length = self.policy.act_size[0]
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.float32
        )
        self.expert_action = tf.identity(self.action_in_expert)
    else:
        action_length = len(self.policy.act_size)
        self.action_in_expert = tf.placeholder(
            shape=[None, action_length], dtype=tf.int32
        )
        self.expert_action = tf.concat(
            [
                tf.one_hot(self.action_in_expert[:, i], act_size)
                for i, act_size in enumerate(self.policy.act_size)
            ],
            axis=1,
        )

    encoded_policy_list = []
    encoded_expert_list = []

    if self.policy.vec_obs_size > 0:
        self.obs_in_expert = tf.placeholder(
            shape=[None, self.policy.vec_obs_size], dtype=tf.float32
        )
        if self.policy.normalize:
            encoded_expert_list.append(
                ModelUtils.normalize_vector_obs(
                    self.obs_in_expert,
                    self.policy.running_mean,
                    self.policy.running_variance,
                    self.policy.normalization_steps,
                )
            )
            encoded_policy_list.append(self.policy.processed_vector_in)
        else:
            encoded_expert_list.append(self.obs_in_expert)
            encoded_policy_list.append(self.policy.vector_in)

    if self.policy.vis_obs_size > 0:
        self.expert_visual_in: List[tf.Tensor] = []
        visual_policy_encoders = []
        visual_expert_encoders = []
        for i in range(self.policy.vis_obs_size):
            # Create input ops for the expert visual observations.
            visual_input = ModelUtils.create_visual_input(
                self.policy.brain.camera_resolutions[i],
                name="gail_visual_observation_" + str(i),
            )
            self.expert_visual_in.append(visual_input)

            encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
                self.policy.visual_in[i],
                self.encoding_size,
                ModelUtils.swish,
                1,
                "gail_stream_{}_visual_obs_encoder".format(i),
                False,
            )
            encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
                self.expert_visual_in[i],
                self.encoding_size,
                ModelUtils.swish,
                1,
                "gail_stream_{}_visual_obs_encoder".format(i),
                True,
            )
            visual_policy_encoders.append(encoded_policy_visual)
            visual_expert_encoders.append(encoded_expert_visual)
        hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
        hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
        encoded_policy_list.append(hidden_policy_visual)
        encoded_expert_list.append(hidden_expert_visual)

    self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
    self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
def _create_dc_actor(self, encoded: tf.Tensor) -> None:
    """
    Creates Discrete control actor.
    :param encoded: Output of the observation encoder (input to the policy).
    """
    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
            hidden_policy,
            self.memory_in,
            self.sequence_length_ph,
            name="lstm_policy",
        )
        self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
    else:
        hidden_policy = encoded

    policy_branches = []
    with tf.variable_scope("policy"):
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden_policy,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=ModelUtils.scaled_init(0.01),
                )
            )
    raw_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )
    output, self.action_probs, normalized_logits = ModelUtils.create_discrete_action_masking_layer(
        raw_log_probs, self.action_masks, self.act_size
    )

    self.output = tf.identity(output)
    self.all_log_probs = tf.identity(normalized_logits, name="action")

    self.action_holder = tf.placeholder(
        shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
    )
    self.action_oh = tf.concat(
        [
            tf.one_hot(self.action_holder[:, i], self.act_size[i])
            for i in range(len(self.act_size))
        ],
        axis=1,
    )
    self.selected_actions = tf.stop_gradient(self.action_oh)

    action_idx = [0] + list(np.cumsum(self.act_size))

    self.entropy = tf.reduce_sum(
        (
            tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                        ),
                        logits=self.all_log_probs[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
    )

    self.log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                        logits=normalized_logits[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )
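# A minimal NumPy sketch of the entropy trick used above: feeding
# softmax(logits) back in as the labels of softmax_cross_entropy_with_logits
# yields -sum(p * log p), the entropy of that branch's distribution. The
# logits below are illustrative.
def _demo_entropy_via_cross_entropy():
    import numpy as np

    logits = np.array([2.0, 0.5, -1.0])
    shifted = logits - logits.max()  # numerically stable softmax
    probs = np.exp(shifted) / np.exp(shifted).sum()
    log_probs = shifted - np.log(np.exp(shifted).sum())

    cross_entropy = -(probs * log_probs).sum()  # labels = softmax(logits)
    entropy = -(probs * np.log(probs)).sum()
    assert np.isclose(cross_entropy, entropy)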
def __init__(
    self,
    brain,
    h_size=128,
    lr=1e-4,
    n_layers=2,
    m_size=128,
    normalize=False,
    use_recurrent=False,
    seed=0,
):
    LearningModel.__init__(self, m_size, normalize, use_recurrent, brain, seed)
    num_streams = 1
    hidden_streams = self.create_observation_streams(num_streams, h_size, n_layers)
    hidden = hidden_streams[0]
    self.dropout_rate = tf.placeholder(
        dtype=tf.float32, shape=[], name="dropout_rate"
    )
    hidden_reg = tf.layers.dropout(hidden, self.dropout_rate)
    if self.use_recurrent:
        tf.Variable(
            self.m_size, name="memory_size", trainable=False, dtype=tf.int32
        )
        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden_reg, self.memory_out = self.create_recurrent_encoder(
            hidden_reg, self.memory_in, self.sequence_length
        )
        self.memory_out = tf.identity(self.memory_out, name="recurrent_out")

    if brain.vector_action_space_type == "discrete":
        policy_branches = []
        for size in self.act_size:
            policy_branches.append(
                tf.layers.dense(
                    hidden_reg,
                    size,
                    activation=None,
                    use_bias=False,
                    kernel_initializer=tf.initializers.variance_scaling(0.01),
                )
            )
        self.action_probs = tf.concat(
            [tf.nn.softmax(branch) for branch in policy_branches],
            axis=1,
            name="action_probs",
        )
        self.action_masks = tf.placeholder(
            shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
        )
        self.sample_action_float, _, normalized_logits = self.create_discrete_action_masking_layer(
            tf.concat(policy_branches, axis=1), self.action_masks, self.act_size
        )
        tf.identity(normalized_logits, name="action")
        self.sample_action = tf.cast(self.sample_action_float, tf.int32)
        self.true_action = tf.placeholder(
            shape=[None, len(policy_branches)],
            dtype=tf.int32,
            name="teacher_action",
        )
        self.action_oh = tf.concat(
            [
                tf.one_hot(self.true_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        self.loss = tf.reduce_sum(
            -tf.log(self.action_probs + 1e-10) * self.action_oh
        )
        self.action_percent = tf.reduce_mean(
            tf.cast(
                tf.equal(
                    tf.cast(tf.argmax(self.action_probs, axis=1), tf.int32),
                    self.sample_action,
                ),
                tf.float32,
            )
        )
    else:
        self.policy = tf.layers.dense(
            hidden_reg,
            self.act_size[0],
            activation=None,
            use_bias=False,
            name="pre_action",
            kernel_initializer=tf.initializers.variance_scaling(0.01),
        )
        self.clipped_sample_action = tf.clip_by_value(self.policy, -1, 1)
        self.sample_action = tf.identity(self.clipped_sample_action, name="action")
        self.true_action = tf.placeholder(
            shape=[None, self.act_size[0]], dtype=tf.float32, name="teacher_action"
        )
        self.clipped_true_action = tf.clip_by_value(self.true_action, -1, 1)
        self.loss = tf.reduce_sum(
            tf.squared_difference(self.clipped_true_action, self.sample_action)
        )

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    self.update = optimizer.minimize(self.loss)
def create_dc_actor_critic(
    self, h_size: int, num_layers: int, vis_encode_type: EncoderType
) -> None:
    """
    Creates Discrete control actor-critic model.
    :param h_size: Size of hidden linear layers.
    :param num_layers: Number of hidden linear layers.
    :param vis_encode_type: Type of visual encoder to use if visual input.
    """
    hidden_streams = self.create_observation_streams(
        1, h_size, num_layers, vis_encode_type
    )
    hidden = hidden_streams[0]

    if self.use_recurrent:
        self.prev_action = tf.placeholder(
            shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
        )
        prev_action_oh = tf.concat(
            [
                tf.one_hot(self.prev_action[:, i], self.act_size[i])
                for i in range(len(self.act_size))
            ],
            axis=1,
        )
        hidden = tf.concat([hidden, prev_action_oh], axis=1)

        self.memory_in = tf.placeholder(
            shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
        )
        hidden, memory_out = self.create_recurrent_encoder(
            hidden, self.memory_in, self.sequence_length
        )
        self.memory_out = tf.identity(memory_out, name="recurrent_out")

    policy_branches = []
    for size in self.act_size:
        policy_branches.append(
            tf.layers.dense(
                hidden,
                size,
                activation=None,
                use_bias=False,
                kernel_initializer=LearningModel.scaled_init(0.01),
            )
        )

    self.all_log_probs = tf.concat(policy_branches, axis=1, name="action_probs")

    self.action_masks = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
    )
    output, _, normalized_logits = self.create_discrete_action_masking_layer(
        self.all_log_probs, self.action_masks, self.act_size
    )

    self.output = tf.identity(output)
    self.normalized_logits = tf.identity(normalized_logits, name="action")

    self.create_value_heads(self.stream_names, hidden)

    self.action_holder = tf.placeholder(
        shape=[None, len(policy_branches)], dtype=tf.int32, name="action_holder"
    )
    self.action_oh = tf.concat(
        [
            tf.one_hot(self.action_holder[:, i], self.act_size[i])
            for i in range(len(self.act_size))
        ],
        axis=1,
    )
    self.selected_actions = tf.stop_gradient(self.action_oh)

    self.all_old_log_probs = tf.placeholder(
        shape=[None, sum(self.act_size)], dtype=tf.float32, name="old_probabilities"
    )
    _, _, old_normalized_logits = self.create_discrete_action_masking_layer(
        self.all_old_log_probs, self.action_masks, self.act_size
    )

    action_idx = [0] + list(np.cumsum(self.act_size))

    self.entropy = tf.reduce_sum(
        (
            tf.stack(
                [
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=tf.nn.softmax(
                            self.all_log_probs[:, action_idx[i] : action_idx[i + 1]]
                        ),
                        logits=self.all_log_probs[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
    )

    self.log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                        logits=normalized_logits[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )
    self.old_log_probs = tf.reduce_sum(
        (
            tf.stack(
                [
                    -tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=self.action_oh[:, action_idx[i] : action_idx[i + 1]],
                        logits=old_normalized_logits[
                            :, action_idx[i] : action_idx[i + 1]
                        ],
                    )
                    for i in range(len(self.act_size))
                ],
                axis=1,
            )
        ),
        axis=1,
        keepdims=True,
    )