Example #1
    def test_ravel_index_pairs(self):
        pairs = np.array([
            [1, 6],
            [3, 3],
            [5, 2]
        ])

        for x in [
            np.random.randint(1000, size=(3, 10, 8)),
            np.random.randint(1000, size=(3, 12, 8)),
            np.random.randint(1000, size=(3, 8, 8))
        ]:
            expected_slice = x[np.arange(3), pairs[:, 0], pairs[:, 1]]

            with self.test_session():
                flat_idx = ravel_index_pairs(pairs, n_col=8).eval()
                wrong_idx = ravel_index_pairs(pairs, n_col=10).eval()

            # Flat index = row * n_col + col: [1*8 + 6, 3*8 + 3, 5*8 + 2] == [14, 27, 42]
            self.assertAllEqual(x.reshape(3, -1)[np.arange(3), flat_idx], expected_slice)
            self.assertAllEqual(flat_idx, [14, 27, 42])
            self.assertGreater((wrong_idx != flat_idx).sum(), 0)
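The ravel_index_pairs helper itself is not listed on this page. Below is a minimal sketch that is consistent with the assertions above (flat index = row * n_col + col), under the assumption that the index pairs come in as a (batch, 2) tensor of (row, col):

import tensorflow as tf

def ravel_index_pairs(idx_pairs, n_col):
    # Sketch only: convert (row, col) index pairs of shape (batch, 2) into
    # flat indices row * n_col + col, matching the [14, 27, 42] expected above.
    idx_pairs = tf.convert_to_tensor(idx_pairs)
    return idx_pairs[:, 0] * n_col + idx_pairs[:, 1]
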
Example #2
    def build_model(self):
        self.placeholders = _get_placeholders(self.spatial_dim)

        with tf.variable_scope("theta"):
            theta = self.policy(self, trainable=True).build()

        selected_spatial_action_flat = ravel_index_pairs(
            self.placeholders.selected_spatial_action, self.spatial_dim
        )

        selected_log_probs = self._get_select_action_probs(theta, selected_spatial_action_flat)

        # The maximum avoids 0 / 0, since this sum is used as a denominator when computing means
        sum_spatial_action_available = tf.maximum(
            1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
        )

        neg_entropy_spatial = tf.reduce_sum(
            theta.spatial_action_probs * theta.spatial_action_log_probs
        ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
            theta.action_id_probs * theta.action_id_log_probs, axis=1
        ))

        if self.mode == ACMode.PPO:
            # Alternatively, tf.stop_gradient could be used here instead of trainable=False
            with tf.variable_scope("theta_old"):
                theta_old = self.policy(self, trainable=False).build()

            new_theta_var = tf.global_variables("theta/")
            old_theta_var = tf.global_variables("theta_old/")

            assert len(tf.trainable_variables("theta/")) == len(new_theta_var)
            assert not tf.trainable_variables("theta_old/")
            assert len(old_theta_var) == len(new_theta_var)

            self.update_theta_op = [
                tf.assign(t_old, t_new) for t_new, t_old in zip(new_theta_var, old_theta_var)
            ]

            selected_log_probs_old = self._get_select_action_probs(
                theta_old, selected_spatial_action_flat
            )
            ratio = tf.exp(selected_log_probs.total - selected_log_probs_old.total)
            clipped_ratio = tf.clip_by_value(
                ratio, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon
            )
            l_clip = tf.minimum(
                ratio * self.placeholders.advantage,
                clipped_ratio * self.placeholders.advantage
            )
            self.sampled_action_id = weighted_random_sample(theta_old.action_id_probs)
            self.sampled_spatial_action = weighted_random_sample(theta_old.spatial_action_probs)
            self.value_estimate = theta_old.value_estimate
            self._scalar_summary("action/ratio", tf.reduce_mean(clipped_ratio))
            self._scalar_summary("action/ratio_is_clipped",
                tf.reduce_mean(tf.to_float(tf.equal(ratio, clipped_ratio))))
            policy_loss = -tf.reduce_mean(l_clip)
        else:
            self.sampled_action_id = weighted_random_sample(theta.action_id_probs)
            self.sampled_spatial_action = weighted_random_sample(theta.spatial_action_probs)
            self.value_estimate = theta.value_estimate
            policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(
            self.placeholders.value_target, theta.value_estimate)

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_spatial * self.entropy_weight_spatial
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
        self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
        self._scalar_summary("action/is_spatial_action_available",
            tf.reduce_mean(self.placeholders.is_spatial_action_available))
        self._scalar_summary("action/selected_id_log_prob",
            tf.reduce_mean(selected_log_probs.action_id))
        self._scalar_summary("loss/policy", policy_loss)
        self._scalar_summary("loss/value", value_loss)
        self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
        self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
        self._scalar_summary("loss/total", loss)
        self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
        self._scalar_summary("action/selected_total_log_prob",
            tf.reduce_mean(selected_log_probs.total))
        self._scalar_summary("action/selected_spatial_log_prob",
            tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
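For reference, the PPO-specific piece of Example #2 is the clipped surrogate l_clip. A small NumPy sketch reproduces the same arithmetic outside the graph (the clip_epsilon default and the sample numbers here are illustrative, not taken from the example):

import numpy as np

def l_clip(log_prob_new, log_prob_old, advantage, clip_epsilon=0.2):
    # Same per-sample arithmetic as the l_clip term above.
    ratio = np.exp(log_prob_new - log_prob_old)
    clipped_ratio = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    return np.minimum(ratio * advantage, clipped_ratio * advantage)

# With a positive advantage the objective stops growing once the ratio
# exceeds 1 + epsilon, so there is no incentive to move further from theta_old.
print(l_clip(np.log(2.0), 0.0, advantage=1.0))  # 1.2 (clipped at 1 + epsilon)
print(l_clip(np.log(0.5), 0.0, advantage=1.0))  # 0.5 (the unclipped term is the minimum here)
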
Example #3
    def build_model(self):
        self.placeholders = _get_placeholders(self.spatial_dim)
        with tf.variable_scope("theta"):
            units_embedded = layers.embed_sequence(
                self.placeholders.screen_unit_type,
                vocab_size=SCREEN_FEATURES.unit_type.scale,
                embed_dim=self.unit_type_emb_dim,
                scope="unit_type_emb",
                trainable=self.trainable
            )

            # Skip class 0 (the background) when one-hot encoding
            player_relative_screen_one_hot = layers.one_hot_encoding(
                self.placeholders.player_relative_screen,
                num_classes=SCREEN_FEATURES.player_relative.scale
            )[:, :, :, 1:]
            player_relative_minimap_one_hot = layers.one_hot_encoding(
                self.placeholders.player_relative_minimap,
                num_classes=MINIMAP_FEATURES.player_relative.scale
            )[:, :, :, 1:]

            channel_axis = 3
            screen_numeric_all = tf.concat(
                [self.placeholders.screen_numeric, units_embedded, player_relative_screen_one_hot],
                axis=channel_axis
            )
            minimap_numeric_all = tf.concat(
                [self.placeholders.minimap_numeric, player_relative_minimap_one_hot],
                axis=channel_axis
            )

            # Build the screen and minimap conv networks
            screen_output = self._build_convs(screen_numeric_all, "screen_network")
            minimap_output = self._build_convs(minimap_numeric_all, "minimap_network")

            # State representation (last layer before separation as described in the paper)
            self.map_output = tf.concat([screen_output, minimap_output], axis=channel_axis)

            # Build the ConvLSTM
            self.rnn_in = tf.reshape(self.map_output, [1, -1, 32, 32, 64])
            self.cell = tf.contrib.rnn.Conv2DLSTMCell(
                input_shape=[32, 32, 1],  # input dims
                kernel_shape=[3, 3],      # for a 3 by 3 conv
                output_channels=64        # number of feature maps
            )
            c_init = np.zeros((1, 32, 32, 64), np.float32)
            h_init = np.zeros((1, 32, 32, 64), np.float32)
            self.state_init = [c_init, h_init]
            step_size = tf.shape(self.map_output)[:1] # Get step_size from input dimensions
            c_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
            h_in = tf.placeholder(tf.float32, [None, 32, 32, 64])
            self.state_in = (c_in, h_in)
            state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
            self.step_size = tf.placeholder(tf.float32, [1])
            (self.outputs, self.state) = tf.nn.dynamic_rnn(
                self.cell, self.rnn_in, initial_state=state_in,
                sequence_length=step_size, time_major=False, dtype=tf.float32
            )
            lstm_c, lstm_h = self.state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(self.outputs, [-1, 32, 32, 64])
            
            # 1x1 conv layer to generate our spatial policy
            self.spatial_action_logits = layers.conv2d(
                rnn_out,
                data_format="NHWC",
                num_outputs=1,
                kernel_size=1,
                stride=1,
                activation_fn=None,
                scope='spatial_action',
                trainable=self.trainable
            )

            spatial_action_probs = tf.nn.softmax(layers.flatten(self.spatial_action_logits))


            map_output_flat = tf.reshape(self.outputs, [-1, 65536])  # (32*32*64)
            # fully connected layer for Value predictions and action_id
            self.fc1 = layers.fully_connected(
                map_output_flat,
                num_outputs=256,
                activation_fn=tf.nn.relu,
                scope="fc1",
                trainable=self.trainable
            )
            # fc/action_id
            action_id_probs = layers.fully_connected(
                self.fc1,
                num_outputs=len(actions.FUNCTIONS),
                activation_fn=tf.nn.softmax,
                scope="action_id",
                trainable=self.trainable
            )
            # fc/value
            self.value_estimate = tf.squeeze(layers.fully_connected(
                self.fc1,
                num_outputs=1,
                activation_fn=None,
                scope='value',
                trainable=self.trainable
            ), axis=1)

            # Mask out unavailable actions: zero their probabilities and re-normalize each row to sum to 1
            action_id_probs *= self.placeholders.available_action_ids
            action_id_probs /= tf.reduce_sum(action_id_probs, axis=1, keepdims=True)

            def logclip(x):
                return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

            spatial_action_log_probs = (
                    logclip(spatial_action_probs)
                    * tf.expand_dims(self.placeholders.is_spatial_action_available, axis=1)
            )
            # Unavailable action ids end up with log(1e-12) from the clip, but that's fine because they are never selected
            action_id_log_probs = logclip(action_id_probs)

            self.action_id_probs = action_id_probs
            self.spatial_action_probs = spatial_action_probs
            self.action_id_log_probs = action_id_log_probs
            self.spatial_action_log_probs = spatial_action_log_probs

        selected_spatial_action_flat = ravel_index_pairs(
            self.placeholders.selected_spatial_action, self.spatial_dim
        )

        selected_log_probs = self._get_select_action_probs(selected_spatial_action_flat)

        # The maximum avoids 0 / 0, since this sum is used as a denominator when computing means
        sum_spatial_action_available = tf.maximum(
            1e-10, tf.reduce_sum(self.placeholders.is_spatial_action_available)
        )

        neg_entropy_spatial = tf.reduce_sum(
            self.spatial_action_probs * self.spatial_action_log_probs
        ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
            self.action_id_probs * self.action_id_log_probs, axis=1
        ))
        
        # Now sample actions from the corresponding distributions defined by the policy network theta
        self.sampled_action_id = weighted_random_sample(self.action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(self.spatial_action_probs)
        
        policy_loss = -tf.reduce_mean(selected_log_probs.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(
            self.placeholders.value_target, self.value_estimate)

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_spatial * self.entropy_weight_spatial
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self._scalar_summary("value/estimate", tf.reduce_mean(self.value_estimate))
        self._scalar_summary("value/target", tf.reduce_mean(self.placeholders.value_target))
        self._scalar_summary("action/is_spatial_action_available",
                             tf.reduce_mean(self.placeholders.is_spatial_action_available))
        self._scalar_summary("action/selected_id_log_prob",
                             tf.reduce_mean(selected_log_probs.action_id))
        self._scalar_summary("loss/policy", policy_loss)
        self._scalar_summary("loss/value", value_loss)
        self._scalar_summary("loss/neg_entropy_spatial", neg_entropy_spatial)
        self._scalar_summary("loss/neg_entropy_action_id", neg_entropy_action_id)
        self._scalar_summary("loss/total", loss)
        self._scalar_summary("value/advantage", tf.reduce_mean(self.placeholders.advantage))
        self._scalar_summary("action/selected_total_log_prob",
                             tf.reduce_mean(selected_log_probs.total))
        self._scalar_summary("action/selected_spatial_log_prob",
                             tf.reduce_sum(selected_log_probs.spatial) / sum_spatial_action_available)

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(self._scalar_summary_key))
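The action-masking step near the end of Example #3 (zeroing unavailable action ids and re-normalizing) can be checked in isolation. A minimal NumPy sketch with made-up probabilities and a made-up availability mask:

import numpy as np

def mask_and_renormalize(action_id_probs, available_action_ids):
    # Zero out unavailable action ids and re-normalize each row to sum to 1,
    # mirroring the masking step inside build_model above.
    masked = action_id_probs * available_action_ids
    return masked / masked.sum(axis=1, keepdims=True)

probs = np.array([[0.5, 0.3, 0.2]])
mask = np.array([[1.0, 0.0, 1.0]])
print(mask_and_renormalize(probs, mask))  # [[0.7143 0.     0.2857]]
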
Example #4
    def build_model(self):
        self._define_input_placeholders()

        spatial_action_probs, action_id_probs, value_estimate = \
            self._build_fullyconv_network()

        selected_spatial_action_flat = ravel_index_pairs(
            self.ph_selected_spatial_action, self.spatial_dim
        )

        def logclip(x):
            return tf.log(tf.clip_by_value(x, 1e-12, 1.0))

        spatial_action_log_probs = (
            logclip(spatial_action_probs)
            * tf.expand_dims(self.ph_is_spatial_action_available, axis=1)
        )

        # Unavailable action ids end up with log(1e-12) from the clip, but that's fine because they are never selected
        action_id_log_probs = logclip(action_id_probs)

        selected_spatial_action_log_prob = select_from_each_row(
            spatial_action_log_probs, selected_spatial_action_flat
        )
        selected_action_id_log_prob = select_from_each_row(
            action_id_log_probs, self.ph_selected_action_id
        )
        selected_action_total_log_prob = (
            selected_spatial_action_log_prob
            + selected_action_id_log_prob
        )

        # The maximum avoids 0 / 0, since this sum is used as a denominator when computing means
        sum_spatial_action_available = tf.maximum(
            1e-10, tf.reduce_sum(self.ph_is_spatial_action_available)
        )
        neg_entropy_spatial = tf.reduce_sum(
            spatial_action_probs * spatial_action_log_probs
        ) / sum_spatial_action_available
        neg_entropy_action_id = tf.reduce_mean(tf.reduce_sum(
            action_id_probs * action_id_log_probs, axis=1
        ))

        advantage = tf.stop_gradient(self.ph_value_target - value_estimate)
        policy_loss = -tf.reduce_mean(selected_action_total_log_prob * advantage)
        value_loss = tf.losses.mean_squared_error(self.ph_value_target, value_estimate)

        loss = (
            policy_loss
            + value_loss * self.loss_value_weight
            + neg_entropy_spatial * self.entropy_weight_spatial
            + neg_entropy_action_id * self.entropy_weight_action_id
        )

        scalar_summary_collection_name = "scalar_summaries"
        s_collections = [scalar_summary_collection_name, tf.GraphKeys.SUMMARIES]
        tf.summary.scalar("loss/policy", policy_loss, collections=s_collections)
        tf.summary.scalar("loss/value", value_loss, s_collections)
        tf.summary.scalar("loss/neg_entropy_spatial", neg_entropy_spatial, s_collections)
        tf.summary.scalar("loss/neg_entropy_action_id", neg_entropy_action_id, s_collections)
        tf.summary.scalar("loss/total", loss, s_collections)
        tf.summary.scalar("value/advantage", tf.reduce_mean(advantage), s_collections)
        tf.summary.scalar("value/estimate", tf.reduce_mean(value_estimate), s_collections)
        tf.summary.scalar("value/target", tf.reduce_mean(self.ph_value_target), s_collections)
        tf.summary.scalar("action/is_spatial_action_available",
            tf.reduce_mean(self.ph_is_spatial_action_available), s_collections)
        tf.summary.scalar("action/is_spatial_action_available",
            tf.reduce_mean(self.ph_is_spatial_action_available), s_collections)
        tf.summary.scalar("action/selected_id_log_prob",
            tf.reduce_mean(selected_action_id_log_prob))
        tf.summary.scalar("action/selected_total_log_prob",
            tf.reduce_mean(selected_action_total_log_prob))
        tf.summary.scalar("action/selected_spatial_log_prob",
            tf.reduce_sum(selected_spatial_action_log_prob) / sum_spatial_action_available
        )

        self.sampled_action_id = weighted_random_sample(action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(spatial_action_probs)
        self.value_estimate = value_estimate

        self.train_op = layers.optimize_loss(
            loss=loss,
            global_step=framework.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_op"
        )

        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(tf.get_collection(scalar_summary_collection_name))
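select_from_each_row is an external helper that is not shown here. Assuming it picks values[i, indices[i]] for each row i, which is what the surrounding log-probability code needs, one possible sketch uses tf.gather_nd:

import tensorflow as tf

def select_from_each_row(values, col_indices):
    # Sketch only: gather one entry per row, values[i, col_indices[i]].
    col_indices = tf.cast(col_indices, tf.int32)
    row_indices = tf.range(tf.shape(values)[0])
    return tf.gather_nd(values, tf.stack([row_indices, col_indices], axis=1))
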
Example #5
    def build_model(self):
        """build_model

        Function that actually builds the model, initialising
        variables and setting up the policy.
        After this, it sets up the loss value, defines a training
        step and sets up logging for all needed values.
        """

        # Initialise the placeholders property with some default values.
        self.placeholders = get_default_values(self.spatial_dim)

        # The variable scope guards against variables being shared by accident
        # and builds the fully convolutional policy.
        with tf.variable_scope("theta"):
            theta = self.policy(self,
                                trainable=True,
                                spatial_dim=self.spatial_dim).build()

        # Get the actions and the probabilities of those actions.
        selected_spatial_action = ravel_index_pairs(
            self.placeholders.selected_spatial_action, self.spatial_dim)

        selected_log_probabilities = self.get_selected_action_probability(
            theta, selected_spatial_action)

        # Take the maximum here to avoid a divide by 0 error next.
        sum_of_available_spatial = tf.maximum(
            1e-10,
            tf.reduce_sum(self.placeholders.is_spatial_action_available))

        # Compute the negative entropy terms, which are added to the loss
        # below as entropy regularisation and minimised by the training step.
        negative_spatial_entropy = tf.reduce_sum(
            theta.spatial_action_probs * theta.spatial_action_log_probs)

        negative_spatial_entropy /= sum_of_available_spatial

        negative_entropy_for_action_id = tf.reduce_mean(
            tf.reduce_sum(theta.action_id_probs * theta.action_id_log_probs,
                          axis=1))

        # Get the values for the possible actions.
        self.sampled_action_id = weighted_random_sample(theta.action_id_probs)
        self.sampled_spatial_action = weighted_random_sample(
            theta.spatial_action_probs)

        self.value_estimate = theta.value_estimate

        # Calculate the policy and value loss, such that the final loss
        # can be calculated and optimised against.
        policy_loss = -tf.reduce_mean(
            selected_log_probabilities.total * self.placeholders.advantage)

        value_loss = tf.losses.mean_squared_error(
            self.placeholders.value_target, theta.value_estimate)

        total_loss = (
            policy_loss + value_loss * self.loss_value_weight +
            negative_spatial_entropy * self.entropy_weight_spatial +
            negative_entropy_for_action_id * self.entropy_weight_action_id)

        # Define a training step to be optimising the loss to be the lowest.
        self.train_operation = layers.optimize_loss(
            loss=total_loss,
            global_step=tf.train.get_global_step(),
            optimizer=self.optimiser,
            clip_gradients=self.max_gradient_norm,
            summaries=OPTIMIZER_SUMMARIES,
            learning_rate=None,
            name="train_operation")

        # Finally, log some information about the model in its current state.
        self.get_scalar_summary("Value - Estimate:",
                                tf.reduce_mean(self.value_estimate))

        self.get_scalar_summary("Value - Target:",
                                tf.reduce_mean(self.placeholders.value_target))

        self.get_scalar_summary(
            "Action - Is Spatial Action Available:",
            tf.reduce_mean(self.placeholders.is_spatial_action_available))

        self.get_scalar_summary(
            "Action - Selected Action ID Log Probability",
            tf.reduce_mean(selected_log_probabilities.action_id))

        self.get_scalar_summary("Loss - Policy Loss", policy_loss)
        self.get_scalar_summary("Loss - Value Loss", value_loss)
        self.get_scalar_summary("Loss - Negative Spatial Entropy",
                                negative_spatial_entropy)
        self.get_scalar_summary("Loss - Negative Entropy for Action ID",
                                negative_entropy_for_action_id)

        self.get_scalar_summary("Loss - Total", total_loss)
        self.get_scalar_summary("Value - Advantage",
                                tf.reduce_mean(self.placeholders.advantage))

        self.get_scalar_summary(
            "Action - Selected Total Log Probability",
            tf.reduce_mean(selected_log_probabilities.total))

        self.get_scalar_summary(
            "Action - Selected Spatial Action Log Probability",
            tf.reduce_sum(selected_log_probabilities.spatial) /
            sum_of_available_spatial)

        # Set up initialisation, checkpointing and summary ops.
        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(max_to_keep=2)
        self.all_summary_op = tf.summary.merge_all(tf.GraphKeys.SUMMARIES)
        self.scalar_summary_op = tf.summary.merge(
            tf.get_collection(self._scalar_summary_key))
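
weighted_random_sample is used by every variant above but defined elsewhere. A minimal TF1-style sketch, assuming it draws one column index per row in proportion to the given (already normalized) probabilities:

import tensorflow as tf

def weighted_random_sample(probs):
    # Sketch only: sample one index per row, weighted by that row's probabilities.
    # tf.multinomial expects logits, so take the (clipped) log of the probabilities.
    logits = tf.log(tf.clip_by_value(probs, 1e-12, 1.0))
    return tf.squeeze(tf.multinomial(logits, num_samples=1), axis=1)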