def test_ravel_index_pairs(self):
    pairs = np.array([
        [1, 6],
        [3, 3],
        [5, 2]
    ])

    for x in [
        np.random.randint(1000, size=(3, 10, 8)),
        np.random.randint(1000, size=(3, 12, 8)),
        np.random.randint(1000, size=(3, 8, 8))
    ]:
        expected_slice = x[np.arange(3), pairs[:, 0], pairs[:, 1]]
        with self.test_session():
            flat_idx = ravel_index_pairs(pairs, n_col=8).eval()
            wrong_idx = ravel_index_pairs(pairs, n_col=10).eval()

        self.assertAllEqual(x.reshape(3, -1)[np.arange(3), flat_idx], expected_slice)
        self.assertAllEqual(flat_idx, [14, 27, 42])
        assert (wrong_idx != flat_idx).sum() > 0
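
# For reference, a minimal sketch of the helper exercised by the test above; the
# repo's actual implementation may differ, but the test pins down the contract:
# a (row, col) pair is flattened to row * n_col + col, the same ordering that
# x.reshape(batch, -1) uses, so [1, 6], [3, 3], [5, 2] with n_col=8 give 14, 27, 42.
import tensorflow as tf

def ravel_index_pairs_sketch(pairs, n_col):
    # pairs: integer tensor (or array) of shape (batch, 2) holding (row, col) indices
    pairs = tf.convert_to_tensor(pairs)
    return pairs[:, 0] * n_col + pairs[:, 1]
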
def build_model(self):
    self.placeholders = _get_placeholders(self.spatial_dim)

    with tf.variable_scope("theta"):
        theta = self.policy(self, trainable=True).build()

    selected_spatial_action_flat = ravel_index_pairs(
        self.placeholders.selected_spatial_action, self.spatial_dim
    )

    selected_log_probs = self._get_select_action_probs(theta, selected_spatial_action_flat)

    # maximum is to avoid 0 / 0 because this is used to calculate some means
    sum_spatial_action_available = tf.maximum(
        1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
    )

    neg_entropy_spatial = tf.reduce_sum(
        theta.spatial_action_probs * theta.spatial_action_log_probs
    ) / sum_spatial_action_available
    neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
        theta.action_id_probs * theta.action_id_log_probs, axis=1
    ))

    if self.mode == ACMode.PPO:
        # could also use stop_gradient on theta_old and forget about the trainable flag
        with tf.variable_scope("theta_old"):
            theta_old = self.policy(self, trainable=False).build()

        new_theta_var = tf.global_variables("theta/")
        old_theta_var = tf.global_variables("theta_old/")

        assert len(tf.trainable_variables("theta/")) == len(new_theta_var)
        assert not tf.trainable_variables("theta_old/")
        assert len(old_theta_var) == len(new_theta_var)

        self.update_theta_op = [
            tf.assign(t_old, t_new) for t_new, t_old in zip(new_theta_var, old_theta_var)
        ]

        selected_log_probs_old = self._get_select_action_probs(
            theta_old, selected_spatial_action_flat
        )

        ratio = tf.exp(selected_log_probs.total - selected_log_probs_old.total)
        clipped_ratio = tf.clip_by_value(
            ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon
        )
        l_clip = tf.minimum(
            ratio * self.placeholders.advantage,
            clipped_ratio * self.placeholders.advantage
        )

        self.sampled_action_id = weighted_random_sample(theta_old.action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(theta_old.spatial_action_probs)
        self.value_estimate = theta_old.value_estimate

        self._scalar_summary("action/ratio", tf.reduce_mean(clipped_ratio))
        # fraction of samples for which the ratio was actually clipped
        self._scalar_summary("action/ratio_is_clipped",
            tf.reduce_mean(tf.to_float(tf.not_equal(ratio, clipped_ratio))))

        policy_loss = -tf.reduce_mean(l_clip)
    else:
        self.sampled_action_id = weighted_random_sample(theta.action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(theta.spatial_action_probs)
        self.value_estimate = theta.value_estimate
        policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)

    value_loss = tf.losses.mean_squared_error(
        self.placeholders.value_target, theta.value_estimate)

    loss = (
        policy_loss
        + value_loss * self.loss_value_weight
        + neg_entropy_spatial * self.entropy_weight_spatial
        + neg_entropy_action_id * self.entropy_weight_action_id
    )

    self.train_op = layers.optimize_loss(
        loss=loss,
        global_step=tf.train.get_global_step(),
        optimizer=self.optimiser,
        clip_gradients=self.max_gradient_norm,
        summaries=OPTIMIZER_SUMMARIES,
        learning_rate=None,
        name="train_op"
    )

    self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
    self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
    self._scalar_summary("action/is_spatial_action_available",
        tf.reduce_mean(self.placeholders.is_spatial_action_available))
    self._scalar_summary("action/selected_id_log_prob",
        tf.reduce_mean(selected_log_probs.action_id))
    self._scalar_summary("loss/policy", policy_loss)
    self._scalar_summary("loss/value", value_loss)
    self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
    self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
    self._scalar_summary("loss/total", loss)
    self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
    self._scalar_summary("action/selected_total_log_prob",
        tf.reduce_mean(selected_log_probs.total))
    self._scalar_summary("action/selected_spatial_log_prob",
        tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(max_to_keep=2)
    self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
    self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
def build_model(self):
    self.placeholders = _get_placeholders(self.spatial_dim)

    with tf.variable_scope("theta"):
        units_embedded = layers.embed_sequence(
            self.placeholders.screen_unit_type,
            vocab_size=SCREEN_FEATURES.unit_type.scale,
            embed_dim=self.unit_type_emb_dim,
            scope="unit_type_emb",
            trainable=self.trainable
        )

        # Don't one-hot class zero, which is the background
        player_relative_screen_one_hot = layers.one_hot_encoding(
            self.placeholders.player_relative_screen,
            num_classes=SCREEN_FEATURES.player_relative.scale
        )[:, :, :, 1:]
        player_relative_minimap_one_hot = layers.one_hot_encoding(
            self.placeholders.player_relative_minimap,
            num_classes=MINIMAP_FEATURES.player_relative.scale
        )[:, :, :, 1:]

        channel_axis = 3
        screen_numeric_all = tf.concat(
            [self.placeholders.screen_numeric, units_embedded, player_relative_screen_one_hot],
            axis=channel_axis
        )
        minimap_numeric_all = tf.concat(
            [self.placeholders.minimap_numeric, player_relative_minimap_one_hot],
            axis=channel_axis
        )

        # Build the conv networks
        screen_output = self._build_convs(screen_numeric_all, "screen_network")
        minimap_output = self._build_convs(minimap_numeric_all, "minimap_network")

        # State representation (last layer before separation, as described in the paper)
        self.map_output = tf.concat([screen_output, minimap_output], axis=channel_axis)

        # Build the ConvLSTM
        self.rnn_in = tf.reshape(self.map_output, [1, -1, 32, 32, 64])
        self.cell = tf.contrib.rnn.Conv2DLSTMCell(
            input_shape=[32, 32, 1],   # input dims
            kernel_shape=[3, 3],       # 3x3 convolution
            output_channels=64         # number of feature maps
        )
        c_init = np.zeros((1, 32, 32, 64), np.float32)
        h_init = np.zeros((1, 32, 32, 64), np.float32)
        self.state_init = [c_init, h_init]
        step_size = tf.shape(self.map_output)[:1]  # get step_size from the input dimensions
        c_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
        h_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
        self.state_in = (c_in, h_in)
        state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
        self.step_size = tf.placeholder(tf.float32, [1])
        (self.outputs, self.state) = tf.nn.dynamic_rnn(
            self.cell, self.rnn_in, initial_state=state_in, sequence_length=step_size,
            time_major=False, dtype=tf.float32
        )
        lstm_c, lstm_h = self.state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        rnn_out = tf.reshape(self.outputs, [-1, 32, 32, 64])

        # 1x1 conv layer to generate the spatial policy
        self.spatial_action_logits = layers.conv2d(
            rnn_out,
            data_format="NHWC",
            num_outputs=1,
            kernel_size=1,
            stride=1,
            activation_fn=None,
            scope='spatial_action',
            trainable=self.trainable
        )
        spatial_action_probs = tf.nn.softmax(layers.flatten(self.spatial_action_logits))

        map_output_flat = tf.reshape(self.outputs, [-1, 65536])  # 32 * 32 * 64

        # fully connected layer shared by the value prediction and action_id heads
        self.fc1 = layers.fully_connected(
            map_output_flat,
            num_outputs=256,
            activation_fn=tf.nn.relu,
            scope="fc1",
            trainable=self.trainable
        )
        # fc/action_id
        action_id_probs = layers.fully_connected(
            self.fc1,
            num_outputs=len(actions.FUNCTIONS),
            activation_fn=tf.nn.softmax,
            scope="action_id",
            trainable=self.trainable
        )
        # fc/value
        self.value_estimate = tf.squeeze(layers.fully_connected(
            self.fc1,
            num_outputs=1,
            activation_fn=None,
            scope='value',
            trainable=self.trainable
        ), axis=1)

        # disregard non-allowed actions by setting zero probability and re-normalising to 1 (the action mask)
        action_id_probs *= self.placeholders.available_action_ids
        action_id_probs /= tf.reduce_sum(action_id_probs, axis=1, keepdims=True)

        def logclip(x):
            return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

        spatial_action_log_probs = (
            logclip(spatial_action_probs)
            * tf.expand_dims(self.placeholders.is_spatial_action_available, axis=1)
        )
        # non-available actions keep the clipped log value, but that is fine because it is never used
        action_id_log_probs = logclip(action_id_probs)

        self.action_id_probs = action_id_probs
        self.spatial_action_probs = spatial_action_probs
        self.action_id_log_probs = action_id_log_probs
        self.spatial_action_log_probs = spatial_action_log_probs

    selected_spatial_action_flat = ravel_index_pairs(
        self.placeholders.selected_spatial_action, self.spatial_dim
    )
    selected_log_probs = self._get_select_action_probs(selected_spatial_action_flat)

    # maximum is to avoid 0 / 0 because this is used to calculate some means
    sum_spatial_action_available = tf.maximum(
        1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
    )

    neg_entropy_spatial = tf.reduce_sum(
        self.spatial_action_probs * self.spatial_action_log_probs
    ) / sum_spatial_action_available
    neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
        self.action_id_probs * self.action_id_log_probs, axis=1
    ))

    # Sample actions from the distributions defined by the policy network theta
    self.sampled_action_id = weighted_random_sample(self.action_id_probs)
    self.sampled_spatial_action = weighted_random_sample(self.spatial_action_probs)

    policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)
    value_loss = tf.losses.mean_squared_error(
        self.placeholders.value_target, self.value_estimate)

    loss = (
        policy_loss
        + value_loss * self.loss_value_weight
        + neg_entropy_spatial * self.entropy_weight_spatial
        + neg_entropy_action_id * self.entropy_weight_action_id
    )

    self.train_op = layers.optimize_loss(
        loss=loss,
        global_step=tf.train.get_global_step(),
        optimizer=self.optimiser,
        clip_gradients=self.max_gradient_norm,
        summaries=OPTIMIZER_SUMMARIES,
        learning_rate=None,
        name="train_op"
    )

    self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
    self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
    self._scalar_summary("action/is_spatial_action_available",
        tf.reduce_mean(self.placeholders.is_spatial_action_available))
    self._scalar_summary("action/selected_id_log_prob",
        tf.reduce_mean(selected_log_probs.action_id))
    self._scalar_summary("loss/policy", policy_loss)
    self._scalar_summary("loss/value", value_loss)
    self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
    self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
    self._scalar_summary("loss/total", loss)
    self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
    self._scalar_summary("action/selected_total_log_prob",
        tf.reduce_mean(selected_log_probs.total))
    self._scalar_summary("action/selected_spatial_log_prob",
        tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(max_to_keep=2)
    self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
    self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
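
# The ConvLSTM variant above exposes state_init, state_in and state_out so the
# recurrent state can be carried across session calls. A self-contained sketch of
# that state-threading pattern (TF 1.x), with shapes shrunk to 8x8 maps and 4
# channels purely for illustration:
import numpy as np
import tensorflow as tf

cell = tf.contrib.rnn.Conv2DLSTMCell(input_shape=[8, 8, 4],
                                     kernel_shape=[3, 3],
                                     output_channels=4)
rnn_in = tf.placeholder(tf.float32, [1, None, 8, 8, 4])   # [batch, time, H, W, C]
c_in = tf.placeholder(tf.float32, [1, 8, 8, 4])
h_in = tf.placeholder(tf.float32, [1, 8, 8, 4])
outputs, state = tf.nn.dynamic_rnn(
    cell, rnn_in, initial_state=tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    c = np.zeros((1, 8, 8, 4), np.float32)   # analogous to state_init
    h = np.zeros((1, 8, 8, 4), np.float32)
    for _ in range(3):  # three consecutive rollout chunks
        chunk = np.random.rand(1, 5, 8, 8, 4).astype(np.float32)
        out, (c, h) = sess.run([outputs, state],
                               feed_dict={rnn_in: chunk, c_in: c, h_in: h})
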
def build_model(self):
    self._define_input_placeholders()

    spatial_action_probs, action_id_probs, value_estimate = \
        self._build_fullyconv_network()

    selected_spatial_action_flat = ravel_index_pairs(
        self.ph_selected_spatial_action, self.spatial_dim
    )

    def logclip(x):
        return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

    spatial_action_log_probs = (
        logclip(spatial_action_probs)
        * tf.expand_dims(self.ph_is_spatial_action_available, axis=1)
    )
    # non-available actions keep the clipped log value, but that is fine because it is never used
    action_id_log_probs = logclip(action_id_probs)

    selected_spatial_action_log_prob = select_from_each_row(
        spatial_action_log_probs, selected_spatial_action_flat
    )
    selected_action_id_log_prob = select_from_each_row(
        action_id_log_probs, self.ph_selected_action_id
    )
    selected_action_total_log_prob = (
        selected_spatial_action_log_prob
        + selected_action_id_log_prob
    )

    # maximum is to avoid 0 / 0 because this is used to calculate some means
    sum_spatial_action_available = tf.maximum(
        1e-10, tf.reduce_sum(self.ph_is_spatial_action_available)
    )

    neg_entropy_spatial = tf.reduce_sum(
        spatial_action_probs * spatial_action_log_probs
    ) / sum_spatial_action_available
    neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
        action_id_probs * action_id_log_probs, axis=1
    ))

    advantage = tf.stop_gradient(self.ph_value_target - value_estimate)
    policy_loss = -tf.reduce_mean(selected_action_total_log_prob * advantage)
    value_loss = tf.losses.mean_squared_error(self.ph_value_target, value_estimate)

    loss = (
        policy_loss
        + value_loss * self.loss_value_weight
        + neg_entropy_spatial * self.entropy_weight_spatial
        + neg_entropy_action_id * self.entropy_weight_action_id
    )

    scalar_summary_collection_name = "scalar_summaries"
    s_collections = [scalar_summary_collection_name, tf.GraphKeys.SUMMARIES]
    tf.summary.scalar("loss/policy", policy_loss, collections=s_collections)
    tf.summary.scalar("loss/value", value_loss, collections=s_collections)
    tf.summary.scalar("loss/neg_entropy_spatial", neg_entropy_spatial, collections=s_collections)
    tf.summary.scalar("loss/neg_entropy_action_id", neg_entropy_action_id, collections=s_collections)
    tf.summary.scalar("loss/total", loss, collections=s_collections)
    tf.summary.scalar("value/advantage", tf.reduce_mean(advantage), collections=s_collections)
    tf.summary.scalar("value/estimate", tf.reduce_mean(value_estimate), collections=s_collections)
    tf.summary.scalar("value/target", tf.reduce_mean(self.ph_value_target), collections=s_collections)
    tf.summary.scalar("action/is_spatial_action_available",
        tf.reduce_mean(self.ph_is_spatial_action_available), collections=s_collections)
    tf.summary.scalar("action/selected_id_log_prob",
        tf.reduce_mean(selected_action_id_log_prob), collections=s_collections)
    tf.summary.scalar("action/selected_total_log_prob",
        tf.reduce_mean(selected_action_total_log_prob), collections=s_collections)
    tf.summary.scalar("action/selected_spatial_log_prob",
        tf.reduce_sum(selected_spatial_action_log_prob) / sum_spatial_action_available,
        collections=s_collections)

    self.sampled_action_id = weighted_random_sample(action_id_probs)
    self.sampled_spatial_action = weighted_random_sample(spatial_action_probs)
    self.value_estimate = value_estimate

    self.train_op = layers.optimize_loss(
        loss=loss,
        global_step=framework.get_global_step(),
        optimizer=self.optimiser,
        clip_gradients=self.max_gradient_norm,
        summaries=OPTIMIZER_SUMMARIES,
        learning_rate=None,
        name="train_op"
    )

    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(max_to_keep=2)
    self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
    self.scalar_summary_op = tf.summary.merge(tf.get_collection(scalar_summary_collection_name))
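
# select_from_each_row is referenced above but not shown in this listing. A
# minimal sketch, assuming it picks one column per row, i.e.
# result[i] = matrix[i, indices[i]], which is what the selected-log-prob
# computation needs:
import tensorflow as tf

def select_from_each_row_sketch(matrix, col_indices):
    # matrix: (batch, n) tensor; col_indices: (batch,) integer tensor
    col_indices = tf.cast(col_indices, tf.int32)
    rows = tf.range(tf.shape(matrix)[0])
    return tf.gather_nd(matrix, tf.stack([rows, col_indices], axis=1))
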
def build_model(self): """build_model Function that actually builds the model, initialising variables and setting up the policy. After this, it sets up the loss value, defines a training step and sets up logging for all needed values. """ # Initialise the placeholders property with some default values. self.placeholders = get_default_values(self.spatial_dim) # Provides checks to ensure that variable isn't shared by accident, # and starts up the fully convolutional policy. with tf.variable_scope("theta"): theta = self.policy(self, trainable=True, spatial_dim=self.spatial_dim).build() # Get the actions and the probabilities of those actions. selected_spatial_action = ravel_index_pairs( self.placeholders.selected_spatial_action, self.spatial_dim) selected_log_probabilities = self.get_selected_action_probability( theta, selected_spatial_action) # Take the maximum here to avoid a divide by 0 error next. sum_of_available_spatial = tf.maximum( 1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)) # Generate the negative entropy, used later as part of the loss # function. This in-turn is used to optimise to get the lowest # loss possible. negative_spatial_entropy = tf.reduce_sum( theta.spatial_action_probs * theta.spatial_action_log_probs) negative_spatial_entropy /= sum_of_available_spatial negative_entropy_for_action_id = tf.reduce_mean( tf.reduce_sum(theta.action_id_probs * theta.action_id_log_probs, axis=1)) # Get the values for the possible actions. self.sampled_action_id = weighted_random_sample(theta.action_id_probs) self.sampled_spatial_action = weighted_random_sample( theta.spatial_action_probs) self.value_estimate = theta.value_estimate # Calculate the policy and value loss, such that the final loss # can be calculated and optimised against. policy_loss = -tf.reduce_mean( selected_log_probabilities.total * self.placeholders.advantage) value_loss = tf.losses.mean_squared_error( self.placeholders.value_target, theta.value_estimate) total_loss = ( policy_loss + value_loss * self.loss_value_weight + negative_spatial_entropy * self.entropy_weight_spatial + negative_entropy_for_action_id * self.entropy_weight_action_id) # Define a training step to be optimising the loss to be the lowest. self.train_operation = layers.optimize_loss( loss=total_loss, global_step=tf.train.get_global_step(), optimizer=self.optimiser, clip_gradients=self.max_gradient_norm, summaries=OPTIMIZER_SUMMARIES, learning_rate=None, name="train_operation") # Finally, log some information about the model in its current state. 
self.get_scalar_summary("Value - Estimate:", tf.reduce_mean(self.value_estimate)) self.get_scalar_summary("Value - Target:", tf.reduce_mean(self.placeholders.value_target)) self.get_scalar_summary( "Action - Is Spatial Action Available:", tf.reduce_mean(self.placeholders.is_spatial_action_available)) self.get_scalar_summary( "Action - Selected Action ID Log Probability", tf.reduce_mean(selected_log_probabilities.action_id)) self.get_scalar_summary("Loss - Policy Loss", policy_loss) self.get_scalar_summary("Loss - Value Loss", value_loss) self.get_scalar_summary("Loss - Negative Spatial Entropy", negative_spatial_entropy) self.get_scalar_summary("Loss - Negative Entropy for Action ID", negative_entropy_for_action_id) self.get_scalar_summary("Loss - Total", total_loss) self.get_scalar_summary("Value - Advantage", tf.reduce_mean(self.placeholders.advantage)) self.get_scalar_summary( "Action - Selected Total Log Probability", tf.reduce_mean(selected_log_probabilities.total)) self.get_scalar_summary( "Action - Selected Spatial Action Log Probability", tf.reduce_sum(selected_log_probabilities.spatial) / sum_of_available_spatial) # Clean up and save. self.init_op = tf.global_variables_initializer() self.saver = tf.train.Saver(max_to_keep=2) self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES) self.scalar_summary_op = tf.summary.merge( tf.get_collection(self._scalar_summary_key))