Esempio n. 1
0
    def _encode_board(self, board_state, name, reuse=None):
        """ Passes a board state (or a prev orders state) through the stack of graph convolutions
            :param board_state: The board state / prev orders state to encode - (batch, NB_NODES, initial_features)
            :param name: The name of the encoding (the variables are created under the 'policy/<name>' scope)
            :param reuse: Whether to reuse or not the weights from another encoding operation
            :return: The encoded board state / prev_orders state
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.graph_convolution import GraphConvolution, preprocess_adjacency

        # Normalized adjacency matrix, repeated once for every item of the batch
        adjacency = tf.expand_dims(preprocess_adjacency(get_adjacency_matrix()), axis=0)
        adjacency = tf.tile(adjacency, [tf.shape(board_state)[0], 1, 1])

        def _graph_conv(inputs, output_dim, residual):
            """ Applies a relu graph convolution (with bias) that reads 'gcn_size' features per node """
            layer = GraphConvolution(input_dim=self.hparams['gcn_size'],
                                     output_dim=output_dim,
                                     norm_adjacency=adjacency,
                                     activation_fn=tf.nn.relu,
                                     residual=residual,
                                     bias=True)
            return layer(inputs)

        # Everything below is created under the 'policy/<name>' scope
        with tf.variable_scope(tf.VariableScope(name='policy/%s' % name, reuse=reuse)):
            nb_items = tf.shape(board_state)[0]

            # A small gaussian noise is added to the input to break symmetry
            noise = tf.random_normal(tf.shape(board_state), stddev=0.01)
            board_state = board_state + noise

            # Raw features (NB_FEATURES) are projected to 'gcn_size', any other input is used as-is
            encoded = board_state
            if board_state.shape[-1].value == NB_FEATURES:
                with tf.variable_scope('proj', reuse=tf.AUTO_REUSE):
                    proj_w = tf.get_variable('W',
                                             shape=[1, NB_FEATURES, self.hparams['gcn_size']],
                                             dtype=tf.float32)
                tiled_proj_w = tf.tile(proj_w, [nb_items, 1, 1])
                encoded = tf.nn.relu(tf.matmul(board_state, tiled_proj_w))

            # 'nb_graph_conv' - 1 residual layers of size 'gcn_size'                          (b, NB_NODES, gcn_size)
            for _ in range(self.hparams['nb_graph_conv'] - 1):
                encoded = _graph_conv(encoded, self.hparams['gcn_size'], True)

            # Followed by a last layer, without residual connection                         (b, NB_NODES, final_size)
            encoded = _graph_conv(encoded, self.hparams['attn_size'] // 2, False)

        return encoded
Esempio n. 2
0
    def _build_value_initial(self):
        """ Builds the value model (initial step)
            Computes the value of the board for the current power and the mean squared error against
            the value target, then registers both (with the model tag) through `add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.utils.tensorflow import to_float

        # Merging this model's placeholders with the ones already set (if any)
        if self.placeholders:
            self.placeholders.update(self.get_placeholders())
        else:
            self.placeholders = self.get_placeholders()

        # Ops are placed on the worker device only when a cluster config is available
        worker_device = self.cluster_config.worker_device if self.cluster_config else None

        with tf.variable_scope('value', reuse=tf.AUTO_REUSE), tf.device(worker_device):

            # Features
            board_state = to_float(self.features['board_state'])    # tf.float32 - (b, NB_NODES, NB_FEATURES)
            current_power = self.features['current_power']          # tf.int32   - (b,)
            value_target = self.features['value_target']            # tf.float32 - (b,)

            # Placeholders
            stop_gradient_all = self.placeholders['stop_gradient_all']

            # Value of the board for the current power
            state_value = self.get_board_value(board_state, current_power)

            # Mean squared error - Wrapped in a stop_gradient when 'stop_gradient_all' is set
            with tf.variable_scope('value_loss'):
                squared_error = tf.square(value_target - state_value)
                mean_squared_error = tf.reduce_mean(squared_error)
                value_loss = tf.cond(stop_gradient_all,
                                     true_fn=lambda: tf.stop_gradient(mean_squared_error),
                                     false_fn=lambda: mean_squared_error)

        # Adding features, placeholders and outputs (incl. the model tag) to graph
        self.add_meta_information({'tag/value/v001_val_relu_7': True,
                                   'state_value': state_value,
                                   'value_loss': value_loss})
Esempio n. 3
0
    def __init__(self,
                 input_dim,
                 output_dim,
                 norm_adjacency,
                 activation_fn=tf.nn.relu,
                 residual=False,
                 bias=False,
                 scope=None,
                 reuse=None):
        """ Creates the variables of a graph convolution layer
            :param input_dim: The number of features per node in the input
            :param output_dim: The number of features per node desired in the output
            :param norm_adjacency: [PLACEHOLDER] The sparse normalized adjacency matrix (NxN matrix)
            :param activation_fn: The activation function to use after the graph convolution (None for no activation)
            :param residual: Use residual connection or not.
            :param bias: Boolean flag that indicates we also want to include a bias term
            :param scope: Optional. The scope to use for this layer (defaults to a 'GraphConv' scope)
            :param reuse: Optional. Boolean. Whether or not the layer and its variables should be reused.
        """
        def _identity(tensor):
            """ Pass-through used when no activation function is provided """
            return tensor

        # Layer settings
        self.activation_fn = _identity if activation_fn is None else activation_fn
        self.norm_adjacency = norm_adjacency
        self.bias = bias
        self.var_w = None
        self.var_b = None
        self.residual = residual

        # Weights are of shape [NB_NODES, input_dim, output_dim] - The bias (if requested) is of shape [output_dim]
        with tf.variable_scope(scope, default_name='GraphConv', reuse=reuse):
            self.var_w = he('W', [NB_NODES, input_dim, output_dim])
            if bias:
                self.var_b = zeros('b', [output_dim])
Esempio n. 4
0
    def _encode_board(self, board_state, name, reuse=None):
        """ Passes a board state (or a prev orders state) through the stack of FiLM graph convolution blocks
            :param board_state: The board state / prev orders state to encode - (batch, NB_NODES, initial_features)
            :param name: The name of the encoding - Used for the 'policy/<name>' scope and to retrieve the
                         '_<name>_film_gammas' and '_<name>_film_betas' outputs
            :param reuse: Whether to reuse or not the weights from another encoding operation
            :return: The encoded board state / prev_orders state
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.graph_convolution import film_gcn_res_block, preprocess_adjacency

        # FiLM gammas and betas previously stored as outputs (indexed by layer)
        gammas = self.outputs['_%s_film_gammas' % name]
        betas = self.outputs['_%s_film_betas' % name]

        # Normalized adjacency matrix, repeated once for every item of the batch
        adjacency = tf.expand_dims(preprocess_adjacency(get_adjacency_matrix()), axis=0)
        adjacency = tf.tile(adjacency, [tf.shape(board_state)[0], 1, 1])

        def _film_block(inputs, layer_idx, output_dim, residual):
            """ Applies the FiLM graph convolution block using the gamma / beta at index `layer_idx` """
            return film_gcn_res_block(inputs=inputs,
                                      gamma=gammas[layer_idx],
                                      beta=betas[layer_idx],
                                      gcn_out_dim=output_dim,
                                      norm_adjacency=adjacency,
                                      is_training=self.placeholders['is_training'],
                                      residual=residual)

        # Everything below is created under the 'policy/<name>' scope
        with tf.variable_scope(tf.VariableScope(name='policy/%s' % name, reuse=reuse)):

            # A small gaussian noise is added to the input to break symmetry
            noisy_state = board_state + tf.random_normal(tf.shape(board_state), stddev=0.01)

            # Dense projection (with relu) to 'gcn_size'
            encoded = tf.layers.Dense(units=self.hparams['gcn_size'], activation=tf.nn.relu)(noisy_state)

            # 'nb_graph_conv' - 1 residual blocks of size 'gcn_size'                          (b, NB_NODES, gcn_size)
            for layer_idx in range(self.hparams['nb_graph_conv'] - 1):
                encoded = _film_block(encoded, layer_idx, self.hparams['gcn_size'], True)

            # Followed by a last block, without residual connection                         (b, NB_NODES, final_size)
            encoded = _film_block(encoded, -1, self.hparams['attn_size'] // 2, False)

        return encoded
Esempio n. 5
0
def convert_to_noisy_variables(variables, activation=None):
    """ Converts a list of variables to noisy variables (mu + sigma * noise)
        :param variables: A list (or any other iterable) of variables to make noisy
        :param activation: Optional. The activation function to use on the linear noisy transformation
        :return: Nothing, but modifies the graph in-place
        :raises RuntimeError: If the TRAIN_OP collection is not empty

        Reference: 1706.10295 - Noisy Networks for exploration
    """
    # The graph can no longer be rewired once a train op has been registered
    if tf.get_collection(tf.GraphKeys.TRAIN_OP):
        raise RuntimeError(
            'You must call convert_to_noisy_variables before applying an optimizer on the graph.'
        )

    default_graph = tf.get_default_graph()
    variables = variables if isinstance(variables, list) else list(variables)

    for variable in variables:
        read_op = _get_variable_read_op(variable, default_graph)
        consumers = _get_variable_outputs(read_op, default_graph)
        base_name = variable.name.split(':')[0]
        shape = variable.shape.as_list()
        fan_in = shape[0]

        # Building mu + sigma * noise on the same device as the original variable
        with tf.variable_scope(base_name + '_noisy'), tf.device(variable.device):
            sigma_init = tf.constant_initializer(0.5 / sqrt(fan_in))

            mu = tf.identity(variable, name='mu')
            sigma = tf.get_variable(name='sigma',
                                    shape=variable.shape,
                                    dtype=tf.float32,
                                    initializer=sigma_init,
                                    caching_device=variable._caching_device)  # pylint: disable=protected-access
            noisy_value = mu + sigma * tf.random.normal(shape=shape)
            if activation:
                noisy_value = activation(noisy_value)

        # Finding which inputs of the consumers are fed by the read op of the variable
        read_op_name = read_op.name.split(':')[0]
        matching_inputs = []
        for input_ix, input_tensor in enumerate(graph_editor.sgv(*consumers).inputs):
            if input_tensor.name.split(':')[0] == read_op_name:
                matching_inputs.append(input_ix)

        # Re-connecting those inputs to the noisy value (in-place)
        graph_editor.connect(graph_editor.sgv(noisy_value.op),
                             graph_editor.sgv(*consumers).remap_inputs(matching_inputs),
                             disconnect_first=True)
Esempio n. 6
0
    def _encode_board(self, board_state, name, reuse=None):
        """ Passes a board state (or a prev orders state) through the stack of graph convolutions
            :param board_state: The board state / prev orders state to encode - (batch, NB_NODES, initial_features)
            :param name: The name of the encoding (the variables are created under the 'policy/<name>' scope)
            :param reuse: Whether to reuse or not the weights from another encoding operation
            :return: The encoded board state / prev_orders state
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.graph_convolution import GraphConvolution, preprocess_adjacency
        from diplomacy_research.utils.tensorflow import batch_norm

        # Normalized adjacency matrix, repeated once for every item of the batch
        adjacency = tf.expand_dims(preprocess_adjacency(get_adjacency_matrix()), axis=0)
        adjacency = tf.tile(adjacency, [tf.shape(board_state)[0], 1, 1])

        def _graph_conv(inputs, input_dim, output_dim, activation_fn):
            """ Applies a graph convolution (with bias) on the inputs """
            layer = GraphConvolution(input_dim=input_dim,
                                     output_dim=output_dim,
                                     norm_adjacency=adjacency,
                                     activation_fn=activation_fn,
                                     bias=True)
            return layer(inputs)

        # Everything below is created under the 'policy/<name>' scope
        with tf.variable_scope(tf.VariableScope(name='policy/%s' % name, reuse=reuse)):

            # A small gaussian noise is added to the input to break symmetry
            encoded = board_state + tf.random_normal(tf.shape(board_state), stddev=0.01)

            # First layer - From the input features to 'gcn_size'                             (b, NB_NODES, gcn_size)
            encoded = _graph_conv(encoded, encoded.shape[-1].value, self.hparams['gcn_size'], tf.nn.relu)

            # Intermediate layers - Each one is followed by a batch norm                      (b, NB_NODES, gcn_size)
            for _ in range(1, self.hparams['nb_graph_conv'] - 1):
                encoded = _graph_conv(encoded, self.hparams['gcn_size'], self.hparams['gcn_size'], tf.nn.relu)
                encoded = batch_norm(encoded, is_training=self.placeholders['is_training'], fused=True)

            # Final layer - No activation function                                            (b, NB_NODES, attn_size)
            encoded = _graph_conv(encoded, self.hparams['gcn_size'], self.hparams['attn_size'], None)

        return encoded
Esempio n. 7
0
    def _build_policy_final(self):
        """ Builds the policy model (final step)

            Uses the tensors stored in `self.outputs` by the initial step to build two decoders that share the
            same LSTM cell and the same output layer: a regular decoder (driven by `CustomHelper`) and a
            diverse beam search decoder. It then computes the policy loss from the regular decoder logits and
            registers the outputs (with the model tag) through `add_meta_information`.
            :return: Nothing
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.attention import StaticAttentionWrapper
        from diplomacy_research.models.layers.beam_decoder import DiverseBeamSearchDecoder
        from diplomacy_research.models.layers.decoder import MaskedBasicDecoder
        from diplomacy_research.models.layers.dropout import SeededDropoutWrapper
        from diplomacy_research.models.layers.dynamic_decode import dynamic_decode
        from diplomacy_research.models.policy.token_based.helper import CustomHelper, CustomBeamHelper
        from diplomacy_research.utils.tensorflow import cross_entropy, sequence_loss, to_int32, to_float, get_tile_beam

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.hparams[hparam_name]
        pholder = lambda placeholder_name: self.placeholders[placeholder_name]

        # Building the graph under the 'policy' scope (on the worker device when a cluster config is set)
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Features
                player_seeds = self.features['player_seed']                 # tf.int32 - (b,)
                temperature = self.features['temperature']                  # tf.flt32 - (b,)
                dropout_rates = self.features['dropout_rate']               # tf.flt32 - (b,)

                # Placeholders
                stop_gradient_all = pholder('stop_gradient_all')

                # Outputs (from initial steps)
                batch_size = self.outputs['batch_size']
                board_alignments = self.outputs['board_alignments']
                decoder_inputs = self.outputs['decoder_inputs']
                decoder_mask = self.outputs['decoder_mask']
                decoder_type = self.outputs['decoder_type']
                raw_decoder_lengths = self.outputs['raw_decoder_lengths']
                decoder_lengths = self.outputs['decoder_lengths']
                board_state_conv = self.outputs['board_state_conv']
                word_embedding = self.outputs['word_embedding']

                # --- Decoding ---
                with tf.variable_scope('decoder_scope', reuse=tf.AUTO_REUSE):
                    # The same LSTM cell is wrapped by both the regular decoder and the beam search decoder
                    lstm_cell = tf.contrib.rnn.LSTMBlockCell(hps('lstm_size'))

                    # decoder output to token
                    decoder_output_layer = tf.layers.Dense(units=VOCABULARY_SIZE,
                                                           activation=None,
                                                           kernel_initializer=tf.random_normal_initializer,
                                                           use_bias=True)

                    # ======== Regular Decoding ========
                    # Applying dropout to input + attention and to output layer
                    decoder_cell = SeededDropoutWrapper(cell=lstm_cell,
                                                        seeds=player_seeds,
                                                        input_keep_probs=1. - dropout_rates,
                                                        output_keep_probs=1. - dropout_rates,
                                                        variational_recurrent=hps('use_v_dropout'),
                                                        input_size=hps('word_emb_size') + hps('attn_size'),
                                                        dtype=tf.float32)

                    # Apply attention over orderable location at each position
                    decoder_cell = StaticAttentionWrapper(cell=decoder_cell,
                                                          memory=board_state_conv,
                                                          alignments=board_alignments,
                                                          sequence_length=raw_decoder_lengths,
                                                          output_attention=False)

                    # Setting initial state
                    decoder_init_state = decoder_cell.zero_state(batch_size, tf.float32)

                    # ---- Helper ----
                    # The last position of decoder_inputs is not fed (the targets below are decoder_inputs[:, 1:])
                    helper = CustomHelper(decoder_type=decoder_type,
                                          inputs=decoder_inputs[:, :-1],
                                          embedding=word_embedding,
                                          sequence_length=decoder_lengths,
                                          mask=decoder_mask,
                                          time_major=False,
                                          softmax_temperature=temperature)

                    # ---- Decoder ----
                    # The mask is built from the raw lengths, but padded to the longest value in decoder_lengths
                    sequence_mask = tf.sequence_mask(raw_decoder_lengths,
                                                     maxlen=tf.reduce_max(decoder_lengths),
                                                     dtype=tf.float32)
                    maximum_iterations = TOKENS_PER_ORDER * NB_SUPPLY_CENTERS
                    model_decoder = MaskedBasicDecoder(cell=decoder_cell,
                                                       helper=helper,
                                                       initial_state=decoder_init_state,
                                                       output_layer=decoder_output_layer,
                                                       extract_state=True)
                    training_results, _, _ = dynamic_decode(decoder=model_decoder,
                                                            output_time_major=False,
                                                            maximum_iterations=maximum_iterations,
                                                            swap_memory=hps('swap_memory'))

                    # Snapshot of the global variables, compared after the beam search decoder has been built
                    global_vars_after_decoder = set(tf.global_variables())

                    # ======== Beam Search Decoding ========
                    # tile_beam is applied to every per-batch tensor given to the beam search decoder cell
                    tile_beam = get_tile_beam(hps('beam_width'))

                    # Applying dropout to input + attention and to output layer
                    decoder_cell = SeededDropoutWrapper(cell=lstm_cell,
                                                        seeds=tile_beam(player_seeds),
                                                        input_keep_probs=tile_beam(1. - dropout_rates),
                                                        output_keep_probs=tile_beam(1. - dropout_rates),
                                                        variational_recurrent=hps('use_v_dropout'),
                                                        input_size=hps('word_emb_size') + hps('attn_size'),
                                                        dtype=tf.float32)

                    # Apply attention over orderable location at each position
                    decoder_cell = StaticAttentionWrapper(cell=decoder_cell,
                                                          memory=tile_beam(board_state_conv),
                                                          alignments=tile_beam(board_alignments),
                                                          sequence_length=tile_beam(raw_decoder_lengths),
                                                          output_attention=False)

                    # Setting initial state
                    decoder_init_state = decoder_cell.zero_state(batch_size * hps('beam_width'), tf.float32)

                    # ---- Beam Helper and Decoder ----
                    beam_helper = CustomBeamHelper(cell=decoder_cell,
                                                   embedding=word_embedding,
                                                   mask=decoder_mask,
                                                   sequence_length=decoder_lengths,
                                                   output_layer=decoder_output_layer,
                                                   initial_state=decoder_init_state,
                                                   beam_width=hps('beam_width'))
                    beam_decoder = DiverseBeamSearchDecoder(beam_helper=beam_helper,
                                                            sequence_length=decoder_lengths,
                                                            nb_groups=hps('beam_groups'))
                    beam_results, beam_state, _ = dynamic_decode(decoder=beam_decoder,
                                                                 output_time_major=False,
                                                                 maximum_iterations=maximum_iterations,
                                                                 swap_memory=hps('swap_memory'))

                    # Making sure we haven't created new global variables
                    # (i.e. the beam search decoder only used variables that already existed at the snapshot)
                    assert not set(tf.global_variables()) - global_vars_after_decoder, 'New global vars were created'

                    # Processing results
                    logits = training_results.rnn_output                            # (b, dec_len, VOCAB_SIZE)
                    logits_length = tf.shape(logits)[1]                             # dec_len
                    decoder_target = decoder_inputs[:, 1:1 + logits_length]

                    # Selected tokens are the token that was actually fed at the next position
                    # A sample_id of -1 means that the token is taken from decoder_target instead of sample_id
                    sample_mask = to_float(tf.math.equal(training_results.sample_id, -1))
                    selected_tokens = to_int32(
                        sequence_mask * (sample_mask * to_float(decoder_target)
                                         + (1. - sample_mask) * to_float(training_results.sample_id)))

                    # Argmax tokens are the most likely token outputted at each position
                    argmax_tokens = to_int32(to_float(tf.argmax(logits, axis=-1)) * sequence_mask)
                    log_probs = -1. * cross_entropy(logits=logits, labels=selected_tokens) * sequence_mask

                # Computing policy loss
                # The loss is wrapped in a stop_gradient when 'stop_gradient_all' is set
                with tf.variable_scope('policy_loss'):
                    policy_loss = sequence_loss(logits=logits,
                                                targets=decoder_target,
                                                weights=sequence_mask,
                                                average_across_batch=True,
                                                average_across_timesteps=True)
                    policy_loss = tf.cond(stop_gradient_all,
                                          lambda: tf.stop_gradient(policy_loss),                                        # pylint: disable=cell-var-from-loop
                                          lambda: policy_loss)                                                          # pylint: disable=cell-var-from-loop

        # Building output tags
        # 'draw_prob' falls back to zeros (shaped like the 'draw_target' feature) when it has not been set
        outputs = {'tag/policy/token_based/v005_markovian_film_board_align': True,
                   'targets': decoder_inputs[:, 1:],
                   'selected_tokens': selected_tokens,
                   'argmax_tokens': argmax_tokens,
                   'logits': logits,
                   'log_probs': log_probs,
                   'beam_tokens': tf.transpose(beam_results.predicted_ids, perm=[0, 2, 1]),     # [batch, beam, steps]
                   'beam_log_probs': beam_state.log_probs,
                   'rnn_states': training_results.rnn_state,
                   'policy_loss': policy_loss,
                   'draw_prob': self.outputs.get('draw_prob', tf.zeros_like(self.features['draw_target'])),
                   'learning_rate': self.learning_rate}

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
Esempio n. 8
0
    def _build_policy_initial(self):
        """ Builds the policy model (initial step) """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.initializers import uniform
        from diplomacy_research.utils.tensorflow import build_sparse_batched_tensor, pad_axis, to_float, to_bool

        if not self.placeholders:
            self.placeholders = self.get_placeholders()

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.hparams[hparam_name]
        pholder = lambda placeholder_name: self.placeholders[placeholder_name]

        # Training loop
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Features
                board_state = to_float(self.features['board_state'])        # tf.flt32 - (b, NB_NODES, NB_FEATURES)
                board_alignments = to_float(self.features['board_alignments'])         # (b, NB_NODES * len)
                decoder_inputs = self.features['decoder_inputs']            # tf.int32 - (b, <= 1 + TOK/ORD * NB_SCS)
                decoder_lengths = self.features['decoder_lengths']          # tf.int32 - (b,)
                current_power = self.features['current_power']              # tf.int32 - (b,)
                current_season = self.features['current_season']            # tf.int32 - (b,)
                dropout_rates = self.features['dropout_rate']               # tf.flt32 - (b,)

                # Batch size
                batch_size = tf.shape(board_state)[0]

                # Reshaping board alignments
                board_alignments = tf.reshape(board_alignments, [batch_size, -1, NB_NODES])
                board_alignments /= tf.math.maximum(1., tf.reduce_sum(board_alignments, axis=-1, keepdims=True))

                # Building decoder mask
                decoder_mask_indices = self.features['decoder_mask_indices']    # tf.int64 - (b, 3 * len)
                decoder_mask_shape = self.proto_fields['decoder_mask'].shape

                # Overriding dropout_rates if pholder('dropout_rate') > 0
                dropout_rates = tf.cond(tf.greater(pholder('dropout_rate'), 0.),
                                        true_fn=lambda: tf.zeros_like(dropout_rates) + pholder('dropout_rate'),
                                        false_fn=lambda: dropout_rates)

                # Padding inputs
                board_alignments = pad_axis(board_alignments, axis=1, min_size=tf.reduce_max(decoder_lengths))
                decoder_inputs = pad_axis(decoder_inputs, axis=-1, min_size=2)
                decoder_mask_indices = pad_axis(decoder_mask_indices, axis=-1, min_size=len(decoder_mask_shape))

                # Reshaping to (b, len, 3)
                # decoder_mask is -- tf.bool (batch, TOK/ORD * NB_SC, VOCAB_SIZE, VOCAB_SIZE)
                decoder_mask_indices = tf.reshape(decoder_mask_indices, [batch_size, -1, len(decoder_mask_shape)])
                decoder_mask = build_sparse_batched_tensor(decoder_mask_indices,
                                                           value=True,
                                                           dtype=tf.bool,
                                                           dense_shape=decoder_mask_shape)

                # Making sure all RNN lengths are at least 1
                # No need to trim, because the fields are variable length
                raw_decoder_lengths = decoder_lengths
                decoder_lengths = tf.math.maximum(1, decoder_lengths)

                # Placeholders
                decoder_type = tf.reduce_max(pholder('decoder_type'))
                is_training = pholder('is_training')

                # Computing FiLM Gammas and Betas
                with tf.variable_scope('film_scope'):
                    power_embedding = uniform(name='power_embedding',
                                              shape=[NB_POWERS, hps('power_emb_size')],
                                              scale=1.)
                    current_power_mask = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)
                    current_power_embedding = tf.reduce_sum(power_embedding[None]
                                                            * current_power_mask[:, :, None], axis=1)  # (b, power_emb)
                    film_embedding_input = current_power_embedding

                    # Also conditioning on current_season
                    season_embedding = uniform(name='season_embedding',
                                               shape=[NB_SEASONS, hps('season_emb_size')],
                                               scale=1.)
                    current_season_mask = tf.one_hot(current_season, NB_SEASONS, dtype=tf.float32)
                    current_season_embedding = tf.reduce_sum(season_embedding[None]                # (b,season_emb)
                                                             * current_season_mask[:, :, None], axis=1)
                    film_embedding_input = tf.concat([film_embedding_input, current_season_embedding], axis=1)

                    film_output_dims = [hps('gcn_size')] * (hps('nb_graph_conv') - 1) + [hps('attn_size')]
                    film_weights = tf.layers.Dense(units=2 * sum(film_output_dims),         # (b, 1, 750)
                                                   use_bias=True,
                                                   activation=None)(film_embedding_input)[:, None, :]
                    film_gammas, film_betas = tf.split(film_weights, 2, axis=2)             # (b, 1, 750)
                    film_gammas = tf.split(film_gammas, film_output_dims, axis=2)
                    film_betas = tf.split(film_betas, film_output_dims, axis=2)

                    # Storing as temporary output
                    self.add_output('_board_state_conv_film_gammas', film_gammas)
                    self.add_output('_board_state_conv_film_betas', film_betas)

                # Creating graph convolution
                with tf.variable_scope('graph_conv_scope'):
                    assert hps('nb_graph_conv') >= 2

                    # Encoding board state
                    board_state_0yr_conv = self.encode_board(board_state, name='board_state_conv')
                    board_state_conv = self.get_board_state_conv(board_state_0yr_conv, is_training)

                # Creating word embedding vector (to embed word_ix)
                # Embeddings needs to be cached locally on the worker, otherwise TF can't compute their gradients
                with tf.variable_scope('word_embedding_scope'):
                    # embedding:    (voc_size, 256)
                    caching_device = self.cluster_config.caching_device if self.cluster_config else None
                    word_embedding = uniform(name='word_embedding',
                                             shape=[VOCABULARY_SIZE, hps('word_emb_size')],
                                             scale=1.,
                                             caching_device=caching_device)

        # Building output tags
        outputs = {'batch_size': batch_size,
                   'board_alignments': board_alignments,
                   'decoder_inputs': decoder_inputs,
                   'decoder_mask': decoder_mask,
                   'decoder_type': decoder_type,
                   'raw_decoder_lengths': raw_decoder_lengths,
                   'decoder_lengths': decoder_lengths,
                   'board_state_conv': board_state_conv,
                   'board_state_0yr_conv': board_state_0yr_conv,
                   'word_embedding': word_embedding,
                   'in_retreat_phase': tf.math.logical_and(         # 1) board not empty, 2) disl. units present
                       tf.reduce_sum(board_state[:], axis=[1, 2]) > 0,
                       tf.math.logical_not(to_bool(tf.reduce_min(board_state[:, :, 23], -1))))}

        # Adding to graph
        self.add_meta_information(outputs)
Esempio n. 9
0
    def _build_draw_initial(self):
        """ Builds the draw model (initial step)

            The board state goes through one graph convolution and one dense layer, then through a small
            feed-forward network that outputs one sigmoid per power. The value of the current power is kept as
            'draw_prob', and 'draw_loss' is the mean squared error between 'draw_prob' and the 'draw_target'
            feature. Both are registered (with the model tag) through `add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.graph_convolution import GraphConvolution, preprocess_adjacency
        from diplomacy_research.utils.tensorflow import to_float

        # Merging the placeholders with the ones already registered (if any)
        if self.placeholders:
            self.placeholders.update(self.get_placeholders())
        else:
            self.placeholders = self.get_placeholders()

        hparams = self.hparams
        worker_device = self.cluster_config.worker_device if self.cluster_config else None

        # Graph construction
        with tf.variable_scope('draw', reuse=tf.AUTO_REUSE), tf.device(worker_device):

            # Features
            board_state = to_float(self.features['board_state'])        # tf.float32 - (b, NB_NODES, NB_FEATURES)
            current_power = self.features['current_power']              # tf.int32   - (b,)
            draw_target = self.features['draw_target']                  # tf.float32 - (b,)

            # Placeholders
            stop_gradient_all = self.placeholders['stop_gradient_all']

            # Normalized adjacency matrix, repeated once per batch item
            batch_size = tf.shape(board_state)[0]
            adjacency = tf.expand_dims(preprocess_adjacency(get_adjacency_matrix()), axis=0)
            norm_adjacency = tf.tile(adjacency, [batch_size, 1, 1])

            # Graph embeddings
            with tf.variable_scope('graph_conv_scope'):
                gcn_output_size = hparams['draw_gcn_1_output_size']
                graph_conv_layer = GraphConvolution(input_dim=NB_FEATURES,
                                                    output_dim=gcn_output_size,
                                                    norm_adjacency=norm_adjacency,
                                                    activation_fn=tf.nn.relu,
                                                    bias=True)
                node_embeddings = graph_conv_layer(board_state)

                # Flattening the nodes to (b, NB_NODES * draw_gcn_1_output_size) before the dense projection
                flat_embeddings = tf.reshape(node_embeddings, shape=[-1, NB_NODES * gcn_output_size])
                embedding_layer = tf.layers.Dense(units=hparams['draw_embedding_size'],
                                                  activation=tf.nn.relu,
                                                  use_bias=True)
                board_embedding = embedding_layer(flat_embeddings)

            # Draw probability of every power, then selection of the current power
            with tf.variable_scope('draw_scope'):
                current_power_mask = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)

                # Two hidden relu layers, created in the same order as their hparams are listed
                hidden = board_embedding
                for size_hparam in ('draw_h1_size', 'draw_h2_size'):
                    hidden = tf.layers.Dense(units=hparams[size_hparam],
                                             activation=tf.nn.relu,
                                             use_bias=True)(hidden)

                draw_probs = tf.layers.Dense(units=NB_POWERS,                       # (b, NB_POWERS)
                                             activation=tf.nn.sigmoid,
                                             use_bias=True)(hidden)
                draw_prob = tf.reduce_sum(draw_probs * current_power_mask, axis=1)  # (b,)

            # Draw loss (mean squared error), detached from the gradients when 'stop_gradient_all' is set
            with tf.variable_scope('draw_loss'):
                mse_loss = tf.reduce_mean(tf.square(draw_target - draw_prob))
                draw_loss = tf.cond(stop_gradient_all,
                                    lambda: tf.stop_gradient(mse_loss),
                                    lambda: mse_loss)

        # Adding features, placeholders and outputs to graph
        self.add_meta_information({'tag/draw/v001_draw_relu': True,
                                   'draw_prob': draw_prob,
                                   'draw_loss': draw_loss})
Esempio n. 10
0
    def build(self):
        """ Builds the RL model using the correct optimizer

            Assembles the PPO loss from the model outputs and the rollout features, creates the optimizer op,
            stores the individual losses and the optimizer op as outputs, and registers the
            'ppo_policy_baseline' and 'ppo_increase_version' queues on the queue dataset.

            The total loss is:
                surrogate_loss + value_coeff * baseline_mse_loss + draw_coeff * draw_gradient_loss
                + entropy_coeff * entropy_loss
        """
        from diplomacy_research.utils.tensorflow import tf, tfp, normalize, to_float
        from diplomacy_research.models.layers.avg_grad_optimizer import AvgGradOptimizer

        # Quick function to retrieve hparams
        hps = lambda hparam_name: self.model.hparams[hparam_name]

        # The draw probability is clipped to [eps, 1 - eps] before taking its log. Without it, a probability of
        # exactly 0. or 1. yields log(0.) = -inf, and 0. * -inf = NaN then propagates to the total loss.
        # 1e-7 is used because 1. - 1e-8 rounds back to 1. in float32 (assumes draw_prob is tf.float32).
        draw_prob_epsilon = 1e-7

        # Building the loss and the optimizer op
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Placeholders
                stop_gradient_all = self.model.placeholders['stop_gradient_all']

                # Features
                decoder_lengths = self.model.features['decoder_lengths']            # tf.int32   - (b,)
                draw_action = self.model.features['draw_action']                    # tf.bool    - (b,)
                reward_target = self.model.features['reward_target']                # tf.float32 - (b,)
                value_target = self.model.features['value_target']                  # tf.float32 - (b,)
                old_log_probs = self.model.features['old_log_probs']                # tf.float32 - (b, dec_len)

                # Making sure all RNN lengths are at least 1
                # The raw lengths are kept to build the mask, so that empty sequences are fully masked
                raw_decoder_lengths = decoder_lengths
                decoder_lengths = tf.math.maximum(1, decoder_lengths)

                # Retrieving model outputs
                baseline = values = self.model.outputs['state_value']               # (b,)
                logits = self.model.outputs['logits']                               # (b, dec, VOCAB)
                sequence_mask = tf.sequence_mask(raw_decoder_lengths,               # (b, dec)
                                                 maxlen=tf.reduce_max(decoder_lengths),
                                                 dtype=tf.float32)

                # Computing Baseline Mean Square Error Loss
                # The squared error of each item is capped at 'clip_value_threshold' before being summed
                with tf.variable_scope('baseline_scope'):
                    baseline_mse_loss = tf.minimum(tf.square(value_target - values), hps('clip_value_threshold'))
                    baseline_mse_loss = tf.reduce_sum(baseline_mse_loss)            # ()

                # Calculating surrogate loss
                with tf.variable_scope('policy_gradient_scope'):
                    new_policy_log_probs = self.model.outputs['log_probs'] * sequence_mask      # (b, dec_len)
                    old_policy_log_probs = old_log_probs * sequence_mask                        # (b, dec_len)

                    new_sum_log_probs = tf.reduce_sum(new_policy_log_probs, axis=-1)            # (b,)
                    old_sum_log_probs = tf.reduce_sum(old_policy_log_probs, axis=-1)            # (b,)

                    # Probability ratio between the new and the old policy for the full sequence
                    ratio = tf.math.exp(new_sum_log_probs - old_sum_log_probs)                  # (b,)
                    clipped_ratio = tf.clip_by_value(ratio, 1. - hps('epsilon'), 1. + hps('epsilon'))   # (b,)

                    # Advantages are treated as constants (no gradient flows to the value head from here)
                    advantages = tf.stop_gradient(normalize(reward_target - baseline))          # (b,)

                    surrogate_loss_1 = ratio * advantages                                       # (b,)
                    surrogate_loss_2 = clipped_ratio * advantages                               # (b,)
                    surrogate_loss = -tf.reduce_mean(tf.math.minimum(surrogate_loss_1, surrogate_loss_2))   # ()

                # Calculating policy gradient for draw action
                with tf.variable_scope('draw_gradient_scope'):
                    draw_action = to_float(draw_action)                                         # (b,)
                    draw_prob = tf.clip_by_value(self.model.outputs['draw_prob'],               # (b,)
                                                 draw_prob_epsilon,
                                                 1. - draw_prob_epsilon)
                    log_prob_of_draw = draw_action * tf.log(draw_prob) \
                                       + (1. - draw_action) * tf.log(1. - draw_prob)            # (b,)
                    draw_gradient_loss = -1. * log_prob_of_draw * advantages                    # (b,)
                    draw_gradient_loss = tf.reduce_mean(draw_gradient_loss)                     # ()

                # Calculating entropy loss
                # NOTE(review): the entropy is averaged over every position of the logits, including the
                # positions beyond raw_decoder_lengths - confirm whether it should be weighted by sequence_mask
                with tf.variable_scope('entropy_scope'):
                    entropy = tfp.distributions.Categorical(logits=logits).entropy()
                    entropy_loss = -tf.reduce_mean(entropy)                                     # ()

                # Scopes
                scope = ['policy', 'value', 'draw']
                global_ignored_scope = hps('ignored_scope').split(',') if hps('ignored_scope') else None

                # Creating PPO loss
                total_loss = surrogate_loss \
                             + hps('value_coeff') * baseline_mse_loss \
                             + hps('draw_coeff') * draw_gradient_loss \
                             + hps('entropy_coeff') * entropy_loss
                ppo_loss = tf.cond(stop_gradient_all,
                                   lambda: tf.stop_gradient(total_loss),
                                   lambda: total_loss)
                cost_and_scope = [(ppo_loss, scope, None)]

                # Creating optimizer op
                ppo_op = self.model.create_optimizer_op(cost_and_scope=cost_and_scope,
                                                        ignored_scope=global_ignored_scope,
                                                        max_gradient_norm=hps('max_gradient_norm'))

                # Making sure we are not using the AvgGradOptimizer, but directly the AdamOptimizer
                assert not isinstance(self.model.optimizer, AvgGradOptimizer), 'PPO does not use AvgGradOptimizer'

        # Storing outputs
        self._add_output('rl_policy_loss', surrogate_loss)
        self._add_output('rl_value_loss', baseline_mse_loss)
        self._add_output('rl_draw_loss', draw_gradient_loss)
        self._add_output('rl_entropy_loss', entropy_loss)
        self._add_output('rl_total_loss', ppo_loss)
        self._add_output('optimizer_op', ppo_op)

        # --------------------------------------
        #               Hooks
        # --------------------------------------
        def hook_baseline_pre_condition(dataset):
            """ Pre-Condition: First queue to run (no queue has been recorded yet on the dataset) """
            return not hasattr(dataset, 'last_queue') or dataset.last_queue == ''

        def hook_baseline_post_queue(dataset):
            """ Post-Queue: Marks the baseline queue as processed """
            dataset.last_queue = 'ppo_policy_baseline'

        # --------------------------------------
        #               Queues
        # --------------------------------------
        # Training queue - Runs the optimizer op and the evaluation tags with the training decoder
        self.queue_dataset.create_queue('ppo_policy_baseline',
                                        placeholders={self.model.placeholders['decoder_type']: [TRAINING_DECODER]},
                                        outputs=[self.model.outputs[output_name]
                                                 for output_name in ['optimizer_op'] + self.get_evaluation_tags()],
                                        pre_condition=hook_baseline_pre_condition,
                                        post_queue=hook_baseline_post_queue)

        # Version queue - Increments the version step by 1
        self.queue_dataset.create_queue('ppo_increase_version',
                                        placeholders={self.model.placeholders['decoder_type']: [GREEDY_DECODER]},
                                        outputs=[tf.assign_add(self.version_step, 1)],
                                        with_status=True)
Esempio n. 11
0
    def _build_policy_initial(self):
        """ Builds the policy model (initial step)

            Reads the features, pads the decoder inputs, builds the dense decoder mask from its sparse indices,
            creates the word embedding variable, and registers the tensors the next steps read through
            `add_meta_information`. The board state encodings are registered as empty (b, NB_NODES, 0) tensors.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.initializers import uniform
        from diplomacy_research.utils.tensorflow import build_sparse_batched_tensor, pad_axis, to_float, to_bool

        if not self.placeholders:
            self.placeholders = self.get_placeholders()

        worker_device = self.cluster_config.worker_device if self.cluster_config else None

        # Graph construction
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE), tf.device(worker_device):

            # Features
            board_state = to_float(self.features['board_state'])        # tf.flt32 - (b, NB_NODES, NB_FEATURES)
            decoder_inputs = self.features['decoder_inputs']            # tf.int32 - (b, <= 1 + TOK/ORD * NB_SCS)
            decoder_lengths = self.features['decoder_lengths']          # tf.int32 - (b,)
            feature_dropout_rates = self.features['dropout_rate']       # tf.flt32 - (b,)

            # Batch size
            batch_size = tf.shape(board_state)[0]

            # Sparse indices and dense shape of the decoder mask
            decoder_mask_indices = self.features['decoder_mask_indices']    # tf.int64 - (b, 3 * len)
            decoder_mask_shape = self.proto_fields['decoder_mask'].shape
            nb_mask_dims = len(decoder_mask_shape)

            # A strictly positive 'dropout_rate' placeholder replaces the dropout rate of every item
            # (the resulting tensor is not referenced again in this step)
            dropout_pholder = self.placeholders['dropout_rate']
            dropout_rates = tf.cond(tf.greater(dropout_pholder, 0.),
                                    true_fn=lambda: tf.zeros_like(feature_dropout_rates) + dropout_pholder,
                                    false_fn=lambda: feature_dropout_rates)

            # Padding inputs to their minimum size
            decoder_inputs = pad_axis(decoder_inputs, axis=-1, min_size=2)
            decoder_mask_indices = pad_axis(decoder_mask_indices, axis=-1, min_size=nb_mask_dims)

            # Reshaping the indices to (b, len, 3) and building the dense mask
            # decoder_mask is -- tf.bool (batch, TOK/ORD * NB_SC, VOCAB_SIZE, VOCAB_SIZE)
            decoder_mask_indices = tf.reshape(decoder_mask_indices, [batch_size, -1, nb_mask_dims])
            decoder_mask = build_sparse_batched_tensor(decoder_mask_indices,
                                                       value=True,
                                                       dtype=tf.bool,
                                                       dense_shape=decoder_mask_shape)

            # RNN lengths must be at least 1 - The raw lengths are kept as a separate output
            # No need to trim, because the fields are variable length
            raw_decoder_lengths = decoder_lengths
            decoder_lengths = tf.math.maximum(1, decoder_lengths)

            # Placeholders
            decoder_type = tf.reduce_max(self.placeholders['decoder_type'])

            # Word embedding (to embed word_ix) - (voc_size, word_emb_size)
            # Embeddings needs to be cached locally on the worker, otherwise TF can't compute their gradients
            with tf.variable_scope('word_embedding_scope'):
                caching_device = self.cluster_config.caching_device if self.cluster_config else None
                word_embedding = uniform(name='word_embedding',
                                         shape=[VOCABULARY_SIZE, self.hparams['word_emb_size']],
                                         scale=1.,
                                         caching_device=caching_device)

        # Building output tags (in the same order as they are created)
        outputs = {'batch_size': batch_size,
                   'decoder_inputs': decoder_inputs,
                   'decoder_mask': decoder_mask,
                   'decoder_type': decoder_type,
                   'raw_decoder_lengths': raw_decoder_lengths,
                   'decoder_lengths': decoder_lengths}
        outputs['board_state_conv'] = tf.zeros([batch_size, NB_NODES, 0], dtype=tf.float32)
        outputs['board_state_0yr_conv'] = tf.zeros([batch_size, NB_NODES, 0], dtype=tf.float32)
        outputs['word_embedding'] = word_embedding

        # In retreat phase if: 1) board not empty, 2) disl. units present
        # NOTE(review): feature index 23 is hard-coded - confirm it matches the board state feature layout
        board_not_empty = tf.reduce_sum(board_state[:], axis=[1, 2]) > 0
        dislodged_present = tf.math.logical_not(to_bool(tf.reduce_min(board_state[:, :, 23], -1)))
        outputs['in_retreat_phase'] = tf.math.logical_and(board_not_empty, dislodged_present)

        # Adding to graph
        self.add_meta_information(outputs)
Esempio n. 12
0
    def _build_policy_final(self):
        """ Builds the policy model (final step)

            Reads the tensors registered by the previous steps from `self.outputs`, then builds an LSTM decoder
            with attention over `board_state_conv` twice on the same cell:
                1) a regular decoder (driven by `CustomHelper`), whose results are used to compute the selected
                   tokens, the argmax tokens, the log probs and the policy loss
                2) a diverse beam search decoder, which must not create any new global variable

            The results are registered (with the model tag) through `add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.attention import AttentionWrapper, BahdanauAttention
        from diplomacy_research.models.layers.beam_decoder import DiverseBeamSearchDecoder
        from diplomacy_research.models.layers.decoder import CandidateBasicDecoder
        from diplomacy_research.models.layers.dropout import SeededDropoutWrapper
        from diplomacy_research.models.layers.dynamic_decode import dynamic_decode
        from diplomacy_research.models.policy.order_based.helper import CustomHelper, CustomBeamHelper
        from diplomacy_research.utils.tensorflow import cross_entropy, sequence_loss, to_int32, to_float, get_tile_beam

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.hparams[hparam_name]
        pholder = lambda placeholder_name: self.placeholders[placeholder_name]

        # Graph construction
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Features
                player_seeds = self.features['player_seed']                 # tf.int32 - (b,)
                temperature = self.features['temperature']                  # tf.flt32 - (b,)
                dropout_rates = self.features['dropout_rate']               # tf.flt32 - (b,)

                # Placeholders
                stop_gradient_all = pholder('stop_gradient_all')

                # Outputs (from initial steps)
                batch_size = self.outputs['batch_size']
                decoder_inputs = self.outputs['decoder_inputs']
                decoder_type = self.outputs['decoder_type']
                raw_decoder_lengths = self.outputs['raw_decoder_lengths']
                decoder_lengths = self.outputs['decoder_lengths']
                board_state_conv = self.outputs['board_state_conv']
                order_embedding = self.outputs['order_embedding']
                candidate_embedding = self.outputs['candidate_embedding']
                candidates = self.outputs['candidates']
                max_candidate_length = self.outputs['max_candidate_length']

                # --- Decoding ---
                with tf.variable_scope('decoder_scope', reuse=tf.AUTO_REUSE):
                    # The same LSTM cell is wrapped by both the regular and the beam search decoders
                    lstm_cell = tf.contrib.rnn.LSTMBlockCell(hps('lstm_size'))

                    # ======== Regular Decoding ========
                    # Applying dropout to input + attention and to output layer
                    decoder_cell = SeededDropoutWrapper(cell=lstm_cell,
                                                        seeds=player_seeds,
                                                        input_keep_probs=1. - dropout_rates,
                                                        output_keep_probs=1. - dropout_rates,
                                                        variational_recurrent=hps('use_v_dropout'),
                                                        input_size=hps('order_emb_size') + hps('attn_size'),
                                                        dtype=tf.float32)

                    # apply attention over location
                    # curr_state [batch, NB_NODES, attn_size]
                    # The attention scope is created explicitly, so it can be passed again to the beam decoder
                    attention_scope = tf.VariableScope(name='policy/decoder_scope/Attention', reuse=tf.AUTO_REUSE)
                    attention_mechanism = BahdanauAttention(num_units=hps('attn_size'),
                                                            memory=board_state_conv,
                                                            normalize=True,
                                                            name_or_scope=attention_scope)
                    decoder_cell = AttentionWrapper(cell=decoder_cell,
                                                    attention_mechanism=attention_mechanism,
                                                    output_attention=False,
                                                    name_or_scope=attention_scope)

                    # Setting initial state
                    # The initial attention is the mean of the board state encoding over the nodes axis
                    decoder_init_state = decoder_cell.zero_state(batch_size, tf.float32)
                    decoder_init_state = decoder_init_state.clone(attention=tf.reduce_mean(board_state_conv, axis=1))

                    # ---- Helper ----
                    # The helper receives the decoder inputs without their last position
                    helper = CustomHelper(decoder_type=decoder_type,
                                          inputs=decoder_inputs[:, :-1],
                                          order_embedding=order_embedding,
                                          candidate_embedding=candidate_embedding,
                                          sequence_length=decoder_lengths,
                                          candidates=candidates,
                                          time_major=False,
                                          softmax_temperature=temperature)

                    # ---- Decoder ----
                    # The mask is built from the raw lengths, but its width uses the lengths clipped to >= 1
                    sequence_mask = tf.sequence_mask(raw_decoder_lengths,
                                                     maxlen=tf.reduce_max(decoder_lengths),
                                                     dtype=tf.float32)
                    maximum_iterations = NB_SUPPLY_CENTERS
                    model_decoder = CandidateBasicDecoder(cell=decoder_cell,
                                                          helper=helper,
                                                          initial_state=decoder_init_state,
                                                          max_candidate_length=max_candidate_length,
                                                          extract_state=True)
                    training_results, _, _ = dynamic_decode(decoder=model_decoder,
                                                            output_time_major=False,
                                                            maximum_iterations=maximum_iterations,
                                                            swap_memory=hps('swap_memory'))

                    # Snapshot of the global variables, compared against after building the beam search decoder
                    global_vars_after_decoder = set(tf.global_variables())

                    # ======== Beam Search Decoding ========
                    # tile_beam is applied below to every per-batch tensor given to the beam search decoder
                    tile_beam = get_tile_beam(hps('beam_width'))

                    # Applying dropout to input + attention and to output layer
                    decoder_cell = SeededDropoutWrapper(cell=lstm_cell,
                                                        seeds=tile_beam(player_seeds),
                                                        input_keep_probs=tile_beam(1. - dropout_rates),
                                                        output_keep_probs=tile_beam(1. - dropout_rates),
                                                        variational_recurrent=hps('use_v_dropout'),
                                                        input_size=hps('order_emb_size') + hps('attn_size'),
                                                        dtype=tf.float32)

                    # apply attention over location
                    # curr_state [batch, NB_NODES, attn_size]
                    attention_mechanism = BahdanauAttention(num_units=hps('attn_size'),
                                                            memory=tile_beam(board_state_conv),
                                                            normalize=True,
                                                            name_or_scope=attention_scope)
                    decoder_cell = AttentionWrapper(cell=decoder_cell,
                                                    attention_mechanism=attention_mechanism,
                                                    output_attention=False,
                                                    name_or_scope=attention_scope)

                    # Setting initial state (zero state is sized batch_size * beam_width)
                    decoder_init_state = decoder_cell.zero_state(batch_size * hps('beam_width'), tf.float32)
                    decoder_init_state = decoder_init_state.clone(attention=tf.reduce_mean(tile_beam(board_state_conv),
                                                                                           axis=1))

                    # ---- Beam Helper and Decoder ----
                    beam_helper = CustomBeamHelper(cell=decoder_cell,
                                                   order_embedding=order_embedding,
                                                   candidate_embedding=candidate_embedding,
                                                   candidates=candidates,
                                                   sequence_length=decoder_lengths,
                                                   initial_state=decoder_init_state,
                                                   beam_width=hps('beam_width'))
                    beam_decoder = DiverseBeamSearchDecoder(beam_helper=beam_helper,
                                                            sequence_length=decoder_lengths,
                                                            nb_groups=hps('beam_groups'))
                    beam_results, beam_state, _ = dynamic_decode(decoder=beam_decoder,
                                                                 output_time_major=False,
                                                                 maximum_iterations=maximum_iterations,
                                                                 swap_memory=hps('swap_memory'))

                    # Making sure we haven't created new global variables
                    assert not set(tf.global_variables()) - global_vars_after_decoder, 'New global vars were created'

                    # Processing results
                    # The targets are the decoder inputs shifted by one position, trimmed to the decoded length
                    candidate_logits = training_results.rnn_output                  # (b, dec_len, max_cand_len)
                    logits_length = tf.shape(candidate_logits)[1]                   # dec_len
                    decoder_target = decoder_inputs[:, 1:1 + logits_length]

                    # Selected tokens are the token that was actually fed at the next position
                    # sample_mask is 1. where sample_id is -1 - the decoder target is used at those positions
                    sample_mask = to_float(tf.math.equal(training_results.sample_id, -1))
                    selected_tokens = to_int32(
                        sequence_mask * (sample_mask * to_float(decoder_target)
                                         + (1. - sample_mask) * to_float(training_results.sample_id)))

                    # Computing ArgMax tokens
                    # The one-hot of the argmax position selects the matching token id in `candidates`
                    argmax_id = to_int32(tf.argmax(candidate_logits, axis=-1))
                    max_nb_candidate = tf.shape(candidate_logits)[2]
                    candidate_ids = \
                        tf.reduce_sum(tf.one_hot(argmax_id, max_nb_candidate, dtype=tf.int32) * candidates, axis=-1)
                    argmax_tokens = to_int32(to_float(candidate_ids) * sequence_mask)

                    # Extracting the position of the target candidate
                    # (position along the last axis of `candidates` where the token is found)
                    tokens_labels = tf.argmax(to_int32(tf.math.equal(selected_tokens[:, :, None], candidates)), -1)
                    target_labels = tf.argmax(to_int32(tf.math.equal(decoder_target[:, :, None], candidates)), -1)

                    # Log Probs (negative cross-entropy of the selected tokens, zeroed outside the sequence mask)
                    log_probs = -1. * cross_entropy(logits=candidate_logits, labels=tokens_labels) * sequence_mask

                # Computing policy loss
                # The loss is detached from the gradients when 'stop_gradient_all' is set
                with tf.variable_scope('policy_loss'):
                    policy_loss = sequence_loss(logits=candidate_logits,
                                                targets=target_labels,
                                                weights=sequence_mask,
                                                average_across_batch=True,
                                                average_across_timesteps=True)
                    policy_loss = tf.cond(stop_gradient_all,
                                          lambda: tf.stop_gradient(policy_loss),                                        # pylint: disable=cell-var-from-loop
                                          lambda: policy_loss)                                                          # pylint: disable=cell-var-from-loop

        # Building output tags
        # 'draw_prob' falls back to zeros shaped like the 'draw_target' feature if it is not already an output
        outputs = {'tag/policy/order_based/v001_markovian_no_film': True,
                   'targets': decoder_inputs[:, 1:],
                   'selected_tokens': selected_tokens,
                   'argmax_tokens': argmax_tokens,
                   'logits': candidate_logits,
                   'log_probs': log_probs,
                   'beam_tokens': tf.transpose(beam_results.predicted_ids, perm=[0, 2, 1]),     # [batch, beam, steps]
                   'beam_log_probs': beam_state.log_probs,
                   'rnn_states': training_results.rnn_state,
                   'policy_loss': policy_loss,
                   'draw_prob': self.outputs.get('draw_prob', tf.zeros_like(self.features['draw_target'])),
                   'learning_rate': self.learning_rate}

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
Esempio n. 13
0
    def _build_policy_initial(self):
        """ Builds the policy model (initial step)

            Loads the features and placeholders, pads and reshapes the decoder inputs / candidates, encodes the
            board state, creates the order and candidate embeddings, then registers the resulting tensors with
            `add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.initializers import uniform
        from diplomacy_research.utils.tensorflow import pad_axis, to_int32, to_float, to_bool

        if not self.placeholders:
            self.placeholders = self.get_placeholders()

        # Local aliases (plain Python lookups, no graph op is created here)
        hparams = self.hparams
        placeholders = self.placeholders
        features = self.features
        cluster_config = self.cluster_config
        worker_device = cluster_config.worker_device if cluster_config else None
        caching_device = cluster_config.caching_device if cluster_config else None

        def _vocabulary_embedding(scope_name, variable_name, embedding_size):
            """ Creates a (ORDER_VOCABULARY_SIZE, embedding_size) embedding inside its own variable scope
                Embeddings needs to be cached locally on the worker, otherwise TF can't compute their gradients
            """
            with tf.variable_scope(scope_name):
                partitioner = None
                if hparams['use_partitioner']:
                    partitioner = tf.fixed_size_partitioner(NB_PARTITIONS)
                return uniform(name=variable_name,
                               shape=[ORDER_VOCABULARY_SIZE, embedding_size],
                               scale=1.,
                               partitioner=partitioner,
                               caching_device=caching_device)

        # Training loop
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE), tf.device(worker_device):

            # Features
            board_state = to_float(features['board_state'])             # tf.flt32 - (b, NB_NODES, NB_FEATURES)
            unpadded_decoder_inputs = features['decoder_inputs']        # tf.int32 - (b, <= 1 + NB_SCS)
            raw_decoder_lengths = features['decoder_lengths']           # tf.int32 - (b,)
            flat_candidates = features['candidates']                    # tf.int32 - (b, nb_locs * MAX_CANDIDATES)
            feature_dropout_rates = features['dropout_rate']            # tf.flt32 - (b,)

            # Batch size
            batch_size = tf.shape(board_state)[0]

            # A strictly positive 'dropout_rate' placeholder overrides the per-item dropout rates
            # The result is not read again in this step, but the op is still added to the graph
            dropout_override = placeholders['dropout_rate']
            dropout_rates = tf.cond(tf.greater(dropout_override, 0.),
                                    true_fn=lambda: tf.zeros_like(feature_dropout_rates) + dropout_override,
                                    false_fn=lambda: feature_dropout_rates)

            # Padding decoder_inputs and candidates to their minimum sizes
            decoder_inputs = pad_axis(unpadded_decoder_inputs, axis=-1, min_size=2)
            padded_candidates = pad_axis(flat_candidates, axis=-1, min_size=MAX_CANDIDATES)

            # All RNN lengths must be at least 1 (the raw lengths are kept and exported as well)
            # No need to trim, because the fields are variable length
            decoder_lengths = tf.math.maximum(1, raw_decoder_lengths)

            # Placeholders
            decoder_type = tf.reduce_max(placeholders['decoder_type'])
            is_training = placeholders['is_training']

            # Candidates: (b, nb_locs * MAX_CAN) -> (b, nb_locs, MAX_CAN), cut at the longest decoder length
            candidates = tf.reshape(padded_candidates, [batch_size, -1, MAX_CANDIDATES])
            max_decoder_length = tf.reduce_max(decoder_lengths)
            candidates = candidates[:, :max_decoder_length, :]          # tf.int32 - (b, nb_locs, MAX_CAN)

            # Graph convolution over the board state
            with tf.variable_scope('graph_conv_scope'):
                assert hparams['nb_graph_conv'] >= 2
                board_state_0yr_conv = self.encode_board(board_state, name='board_state_conv')
                board_state_conv = self.get_board_state_conv(board_state_0yr_conv, is_training)

            # Order embedding (to embed order_ix), then candidate embedding
            order_embedding = _vocabulary_embedding('order_embedding_scope',
                                                    'order_embedding',
                                                    hparams['order_emb_size'])
            candidate_embedding = _vocabulary_embedding('candidate_embedding_scope',
                                                        'candidate_embedding',
                                                        hparams['lstm_size'] + 1)

            # Trimming the last axis to the largest number of non-padding candidates (at least 1)
            is_real_candidate = tf.math.greater(candidates, PAD_ID)
            candidate_lengths = tf.reduce_sum(to_int32(is_real_candidate), -1)      # int32 - (b, nb_locs)
            max_candidate_length = tf.math.maximum(1, tf.reduce_max(candidate_lengths))
            candidates = candidates[:, :, :max_candidate_length]

        # Retreat phase detection (built outside of the 'policy' scope)
        # 1) board not empty, 2) disl. units present
        # NOTE(review): feature index 23 is presumably the dislodged-unit flag of the board state - confirm
        board_not_empty = tf.reduce_sum(board_state[:], axis=[1, 2]) > 0
        min_of_feature_23 = tf.reduce_min(board_state[:, :, 23], -1)
        in_retreat_phase = tf.math.logical_and(board_not_empty,
                                               tf.math.logical_not(to_bool(min_of_feature_23)))

        # Building output tags
        outputs = {}
        outputs['batch_size'] = batch_size
        outputs['decoder_inputs'] = decoder_inputs
        outputs['decoder_type'] = decoder_type
        outputs['raw_decoder_lengths'] = raw_decoder_lengths
        outputs['decoder_lengths'] = decoder_lengths
        outputs['board_state_conv'] = board_state_conv
        outputs['board_state_0yr_conv'] = board_state_0yr_conv
        outputs['order_embedding'] = order_embedding
        outputs['candidate_embedding'] = candidate_embedding
        outputs['candidates'] = candidates
        outputs['max_candidate_length'] = max_candidate_length
        outputs['in_retreat_phase'] = in_retreat_phase

        # Adding to graph
        self.add_meta_information(outputs)
Esempio n. 14
0
    def build(self):
        """ Builds the RL model using the correct optimizer

            Creates the REINFORCE loss (policy gradient + draw gradient + entropy terms), the optimizer, update and
            init ops, stores them as outputs, and registers the 'reinforce_policy', 'reinforce_update' and
            'optimizer_init' queues on the queue dataset.
        """
        from diplomacy_research.utils.tensorflow import tf, tfp, normalize, to_float
        from diplomacy_research.models.layers.avg_grad_optimizer import AvgGradOptimizer

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.model.hparams[hparam_name]

        # The draw probability is kept inside [eps, 1 - eps] before taking its log
        # Otherwise a probability of exactly 0. or 1. yields 0 * log(0) = NaN, which poisons the total loss
        draw_prob_epsilon = 1e-6

        # Training loop
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Placeholders
                stop_gradient_all = self.model.placeholders['stop_gradient_all']

                # Features
                decoder_lengths = self.model.features['decoder_lengths']            # tf.int32   - (b,)
                draw_action = self.model.features['draw_action']                    # tf.bool    - (b,)
                reward_target = self.model.features['reward_target']                # tf.float32 - (b,)
                value_target = self.model.features['value_target']                  # tf.float32 - (b,)
                # current_power = self.model.features['current_power']              # tf.int32   - (b,)

                # Making sure all RNN lengths are at least 1
                # Trimming to the maximum decoder length in the batch
                raw_decoder_lengths = decoder_lengths
                decoder_lengths = tf.math.maximum(1, decoder_lengths)

                # Retrieving model outputs
                # Using a fixed baseline (e.g. moving average) rather than a parameterized value function
                baseline = value_target                                             # (b,)
                logits = self.model.outputs['logits']                               # (b, dec_len, VOCAB_SIZE)

                # The mask is built from the raw lengths, so items with a length of 0 are fully masked
                sequence_mask = tf.sequence_mask(raw_decoder_lengths,               # (b, dec)
                                                 maxlen=tf.reduce_max(decoder_lengths),
                                                 dtype=tf.float32)

                # Calculating policy gradient loss
                with tf.variable_scope('policy_gradient_scope'):
                    log_prob_of_tokens = self.model.outputs['log_probs'] * sequence_mask    # (b, dec_len)

                    # Calculating loss and optimizer op
                    # No gradient flows through the advantages
                    advantages = tf.stop_gradient(normalize(reward_target - baseline))      # (b,)
                    policy_gradient_loss = -tf.reduce_sum(log_prob_of_tokens, axis=-1) * advantages     # (b,)
                    policy_gradient_loss = tf.reduce_mean(policy_gradient_loss)             # ()

                # Calculating policy gradient for draw action
                with tf.variable_scope('draw_gradient_scope'):
                    draw_action = to_float(draw_action)                                     # (b,)
                    draw_prob = tf.clip_by_value(self.model.outputs['draw_prob'],           # (b,)
                                                 draw_prob_epsilon,
                                                 1. - draw_prob_epsilon)
                    log_prob_of_draw = draw_action * tf.log(draw_prob) \
                                       + (1. - draw_action) * tf.log(1. - draw_prob)
                    draw_gradient_loss = -1. * log_prob_of_draw * advantages                # (b,)
                    draw_gradient_loss = tf.reduce_mean(draw_gradient_loss)                 # ()

                # Calculating entropy loss
                with tf.variable_scope('entropy_scope'):
                    categorial_dist = tfp.distributions.Categorical(logits=logits)
                    entropy = categorial_dist.entropy()
                    entropy_loss = -tf.reduce_mean(entropy)                                 # ()

                # Scopes
                scope = ['policy', 'draw']
                global_ignored_scope = [] if not hps('ignored_scope') else hps('ignored_scope').split(',')
                global_ignored_scope += ['value']

                # Creating REINFORCE loss with baseline
                reinforce_loss = policy_gradient_loss \
                                 + hps('draw_coeff') * draw_gradient_loss \
                                 + hps('entropy_coeff') * entropy_loss
                reinforce_loss = tf.cond(stop_gradient_all,
                                         lambda: tf.stop_gradient(reinforce_loss),          # pylint: disable=cell-var-from-loop
                                         lambda: reinforce_loss)                            # pylint: disable=cell-var-from-loop
                cost_and_scope = [(reinforce_loss, scope, None)]

                # Creating optimizer op
                reinforce_op = self.model.create_optimizer_op(cost_and_scope=cost_and_scope,
                                                              ignored_scope=global_ignored_scope,
                                                              max_gradient_norm=None)       # AvgGradOptimizer will clip

                # Getting AvgGradOptimizer.update(version_step)
                assert isinstance(self.model.optimizer, AvgGradOptimizer), 'REINFORCE requires gradient averaging'
                update_op = self.model.optimizer.update(self.version_step)
                init_op = self.model.optimizer.init()

        # Storing outputs
        self._add_output('rl_policy_loss', policy_gradient_loss)
        self._add_output('rl_draw_loss', draw_gradient_loss)
        self._add_output('rl_entropy_loss', entropy_loss)
        self._add_output('rl_total_loss', reinforce_loss)
        self._add_output('optimizer_op', reinforce_op)
        self._add_output('update_op', update_op)
        self._add_output('init_op', init_op)

        # --------------------------------------
        #               Hooks
        # --------------------------------------
        def hook_baseline_pre_condition(dataset):
            """ Pre-Condition: First queue to run (last_queue is missing or empty) """
            return not hasattr(dataset, 'last_queue') or dataset.last_queue == ''

        def hook_baseline_post_queue(dataset):
            """ Post-Queue: Marks the 'reinforce_policy' queue as processed """
            dataset.last_queue = 'reinforce_policy'

        def hook_update_pre_condition(dataset):
            """ Pre-Condition: last_queue must be 'reinforce_policy' """
            return hasattr(dataset, 'last_queue') and dataset.last_queue == 'reinforce_policy'

        def hook_update_pre_queue(dataset):
            """ Pre-Queue: Restricts the queue to 1 dequeue maximum """
            dataset.nb_items_to_pull_from_queue = min(dataset.nb_items_to_pull_from_queue, 1)

        def hook_update_post_queue(dataset):
            """ Post-Queue: Marks the update as processed """
            dataset.last_queue = 'reinforce_update'

        # --------------------------------------
        #               Queues
        # --------------------------------------
        # Runs the optimizer op (and the evaluation tags) with the training decoder
        self.queue_dataset.create_queue('reinforce_policy',
                                        placeholders={self.model.placeholders['decoder_type']: [TRAINING_DECODER]},
                                        outputs=[self.model.outputs[output_name]
                                                 for output_name in ['optimizer_op'] + self.get_evaluation_tags()],
                                        with_status=True,
                                        pre_condition=hook_baseline_pre_condition,
                                        post_queue=hook_baseline_post_queue)

        # Runs the update op, only after 'reinforce_policy' and with at most 1 item pulled from the queue
        self.queue_dataset.create_queue('reinforce_update',
                                        placeholders={self.model.placeholders['decoder_type']: [GREEDY_DECODER]},
                                        outputs=[self.model.outputs['update_op']],
                                        with_status=True,
                                        pre_condition=hook_update_pre_condition,
                                        pre_queue=hook_update_pre_queue,
                                        post_queue=hook_update_post_queue)

        # Runs the optimizer's init op
        self.queue_dataset.create_queue('optimizer_init',
                                        placeholders={self.model.placeholders['decoder_type']: [GREEDY_DECODER]},
                                        outputs=[self.model.outputs['init_op']],
                                        with_status=True)
Esempio n. 15
0
    def _build_policy_initial(self):
        """ Builds the policy model (initial step)

            Reads the features and placeholders, computes the FiLM gammas / betas (conditioned on the current power
            and the current season), encodes the board state and the previous orders, and creates the order and
            candidate embeddings. The FiLM gammas / betas are stored as temporary outputs with `add_output`;
            the other tensors are registered with `add_meta_information`.
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.initializers import uniform
        from diplomacy_research.utils.tensorflow import pad_axis, to_int32, to_float, to_bool

        if not self.placeholders:
            self.placeholders = self.get_placeholders()

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.hparams[hparam_name]
        pholder = lambda placeholder_name: self.placeholders[placeholder_name]

        # Training loop
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            with tf.device(self.cluster_config.worker_device if self.cluster_config else None):

                # Features
                board_state = to_float(self.features['board_state'])        # tf.flt32 - (b, NB_NODES, NB_FEATURES)
                board_alignments = to_float(self.features['board_alignments'])      # (b, NB_NODES * len)
                prev_orders_state = to_float(self.features['prev_orders_state'])    # (b, NB_PRV_OD, NB_ND, NB_OD_FT)
                decoder_inputs = self.features['decoder_inputs']            # tf.int32 - (b, <= 1 + NB_SCS)
                decoder_lengths = self.features['decoder_lengths']          # tf.int32 - (b,)
                candidates = self.features['candidates']                    # tf.int32 - (b, nb_locs * MAX_CANDIDATES)
                current_power = self.features['current_power']              # tf.int32 - (b,)
                current_season = self.features['current_season']            # tf.int32 - (b,)
                dropout_rates = self.features['dropout_rate']               # tf.flt32 - (b,)

                # Batch size
                batch_size = tf.shape(board_state)[0]

                # Reshaping board alignments to (b, len, NB_NODES)
                # Each row is divided by its sum (or by 1. if the sum is smaller, so empty rows stay at 0)
                board_alignments = tf.reshape(board_alignments, [batch_size, -1, NB_NODES])
                board_alignments /= tf.math.maximum(1., tf.reduce_sum(board_alignments, axis=-1, keepdims=True))

                # Overriding dropout_rates if pholder('dropout_rate') > 0
                dropout_rates = tf.cond(tf.greater(pholder('dropout_rate'), 0.),
                                        true_fn=lambda: tf.zeros_like(dropout_rates) + pholder('dropout_rate'),
                                        false_fn=lambda: dropout_rates)

                # Padding board_alignments, decoder_inputs and candidates
                board_alignments = pad_axis(board_alignments, axis=1, min_size=tf.reduce_max(decoder_lengths))
                decoder_inputs = pad_axis(decoder_inputs, axis=-1, min_size=2)
                candidates = pad_axis(candidates, axis=-1, min_size=MAX_CANDIDATES)

                # Making sure all RNN lengths are at least 1
                # No need to trim, because the fields are variable length
                raw_decoder_lengths = decoder_lengths
                decoder_lengths = tf.math.maximum(1, decoder_lengths)

                # Placeholders
                decoder_type = tf.reduce_max(pholder('decoder_type'))
                is_training = pholder('is_training')

                # Reshaping candidates
                candidates = tf.reshape(candidates, [batch_size, -1, MAX_CANDIDATES])
                candidates = candidates[:, :tf.reduce_max(decoder_lengths), :]      # tf.int32 - (b, nb_locs, MAX_CAN)

                # Computing FiLM Gammas and Betas
                with tf.variable_scope('film_scope'):
                    power_embedding = uniform(name='power_embedding',
                                              shape=[NB_POWERS, hps('power_emb_size')],
                                              scale=1.)
                    current_power_mask = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)
                    current_power_embedding = tf.reduce_sum(power_embedding[None]           # (b, power_emb)
                                                            * current_power_mask[:, :, None], axis=1)
                    film_embedding_input = current_power_embedding

                    # Also conditioning on current_season
                    season_embedding = uniform(name='season_embedding',
                                               shape=[NB_SEASONS, hps('season_emb_size')],
                                               scale=1.)
                    current_season_mask = tf.one_hot(current_season, NB_SEASONS, dtype=tf.float32)
                    current_season_embedding = tf.reduce_sum(season_embedding[None]         # (b, season_emb)
                                                             * current_season_mask[:, :, None], axis=1)
                    film_embedding_input = tf.concat([film_embedding_input, current_season_embedding], axis=1)

                    # One (gamma, beta) pair per graph conv layer
                    # The first (nb_graph_conv - 1) layers are of size gcn_size, the last one of attn_size // 2
                    film_output_dims = [hps('gcn_size')] * (hps('nb_graph_conv') - 1) + [hps('attn_size') // 2]
                    film_total_units = 2 * sum(film_output_dims)

                    # For board_state
                    board_film_weights = tf.layers.Dense(units=film_total_units,            # (b, 1, film_total_units)
                                                         use_bias=True,
                                                         activation=None)(film_embedding_input)[:, None, :]
                    board_film_gammas, board_film_betas = tf.split(board_film_weights, 2, axis=2)
                    board_film_gammas = tf.split(board_film_gammas, film_output_dims, axis=2)
                    board_film_betas = tf.split(board_film_betas, film_output_dims, axis=2)

                    # For prev_orders
                    prev_ord_film_weights = tf.layers.Dense(units=film_total_units,         # (b, 1, film_total_units)
                                                            use_bias=True,
                                                            activation=None)(film_embedding_input)[:, None, :]

                    # Repeating the weights of each batch item NB_PREV_ORDERS times *consecutively*, so that row
                    # (batch_ix * NB_PREV_ORDERS + prev_ix) lines up with the reshape of prev_orders_state done
                    # in 'graph_conv_scope' below. Tiling on axis 0 would repeat the whole batch instead, and
                    # pair items with the weights of another batch item whenever NB_PREV_ORDERS > 1.
                    prev_ord_film_weights = tf.tile(prev_ord_film_weights,
                                                    [1, NB_PREV_ORDERS, 1])                 # (b, n_pr, film_total_units)
                    prev_ord_film_weights = tf.reshape(prev_ord_film_weights,               # (b * n_pr, 1, film_total_units)
                                                       [batch_size * NB_PREV_ORDERS, 1, film_total_units])
                    prev_ord_film_gammas, prev_ord_film_betas = tf.split(prev_ord_film_weights, 2, axis=2)
                    prev_ord_film_gammas = tf.split(prev_ord_film_gammas, film_output_dims, axis=2)
                    prev_ord_film_betas = tf.split(prev_ord_film_betas, film_output_dims, axis=2)

                    # Storing as temporary output
                    self.add_output('_board_state_conv_film_gammas', board_film_gammas)
                    self.add_output('_board_state_conv_film_betas', board_film_betas)
                    self.add_output('_prev_orders_conv_film_gammas', prev_ord_film_gammas)
                    self.add_output('_prev_orders_conv_film_betas', prev_ord_film_betas)

                # Creating graph convolution
                with tf.variable_scope('graph_conv_scope'):
                    assert hps('nb_graph_conv') >= 2
                    assert hps('attn_size') % 2 == 0

                    # Encoding board state
                    board_state_0yr_conv = self.encode_board(board_state, name='board_state_conv')

                    # Encoding prev_orders
                    # Folding the prev orders axis into the batch axis - (b * n_pr, NB_NODES, NB_ORDERS_FEATURES)
                    prev_orders_state = tf.reshape(prev_orders_state,
                                                   [batch_size * NB_PREV_ORDERS, NB_NODES, NB_ORDERS_FEATURES])
                    prev_ord_conv = self.encode_board(prev_orders_state, name='prev_orders_conv')

                    # Splitting back into (b, nb_prev, NB_NODES, attn_size // 2)
                    # Reducing the prev ord conv using avg
                    prev_ord_conv = tf.reshape(prev_ord_conv,
                                               [batch_size, NB_PREV_ORDERS, NB_NODES, hps('attn_size') // 2])
                    prev_ord_conv = tf.reduce_mean(prev_ord_conv, axis=1)

                    # Concatenating the current board conv with the prev ord conv
                    # The final board_state_conv should be of dimension (b, NB_NODE, attn_size)
                    board_state_conv = self.get_board_state_conv(board_state_0yr_conv, is_training, prev_ord_conv)

                # Creating order embedding vector (to embed order_ix)
                # Embeddings needs to be cached locally on the worker, otherwise TF can't compute their gradients
                with tf.variable_scope('order_embedding_scope'):
                    # embedding:    (order_vocab_size, order_emb_size)
                    caching_device = self.cluster_config.caching_device if self.cluster_config else None
                    partitioner = tf.fixed_size_partitioner(NB_PARTITIONS) if hps('use_partitioner') else None
                    order_embedding = uniform(name='order_embedding',
                                              shape=[ORDER_VOCABULARY_SIZE, hps('order_emb_size')],
                                              scale=1.,
                                              partitioner=partitioner,
                                              caching_device=caching_device)

                # Creating candidate embedding
                with tf.variable_scope('candidate_embedding_scope'):
                    # embedding:    (order_vocab_size, lstm_size + 1)
                    caching_device = self.cluster_config.caching_device if self.cluster_config else None
                    partitioner = tf.fixed_size_partitioner(NB_PARTITIONS) if hps('use_partitioner') else None
                    candidate_embedding = uniform(name='candidate_embedding',
                                                  shape=[ORDER_VOCABULARY_SIZE, hps('lstm_size') + 1],
                                                  scale=1.,
                                                  partitioner=partitioner,
                                                  caching_device=caching_device)

                # Trimming to the maximum number of candidates
                candidate_lengths = tf.reduce_sum(to_int32(tf.math.greater(candidates, PAD_ID)), -1)    # (b, nb_locs)
                max_candidate_length = tf.math.maximum(1, tf.reduce_max(candidate_lengths))
                candidates = candidates[:, :, :max_candidate_length]

        # Building output tags
        outputs = {'batch_size': batch_size,
                   'board_alignments': board_alignments,
                   'decoder_inputs': decoder_inputs,
                   'decoder_type': decoder_type,
                   'raw_decoder_lengths': raw_decoder_lengths,
                   'decoder_lengths': decoder_lengths,
                   'board_state_conv': board_state_conv,
                   'board_state_0yr_conv': board_state_0yr_conv,
                   'prev_ord_conv': prev_ord_conv,
                   'order_embedding': order_embedding,
                   'candidate_embedding': candidate_embedding,
                   'candidates': candidates,
                   'max_candidate_length': max_candidate_length,
                   'in_retreat_phase': tf.math.logical_and(             # 1) board not empty, 2) disl. units present
                       tf.reduce_sum(board_state[:], axis=[1, 2]) > 0,
                       tf.math.logical_not(to_bool(tf.reduce_min(board_state[:, :, 23], -1))))}

        # Adding to graph
        self.add_meta_information(outputs)
Esempio n. 16
0
    def _build_value_final(self):
        """ Builds the value model (final step)

            Takes the 'rnn_states' output of the policy model at position 0, runs it through a relu dense layer and
            a linear dense layer with one unit per power, and keeps the value of the current power.
            The state value and its squared-error loss against 'value_target' are registered as outputs.
        """
        from diplomacy_research.utils.tensorflow import tf

        # Merging the placeholders of this model with the ones already set (if any)
        if self.placeholders:
            self.placeholders.update(self.get_placeholders())
        else:
            self.placeholders = self.get_placeholders()

        # Local aliases
        hparams = self.hparams
        worker_device = self.cluster_config.worker_device if self.cluster_config else None

        # Training loop
        with tf.variable_scope('value', reuse=tf.AUTO_REUSE), tf.device(worker_device):

            # Outputs from the policy model
            assert 'rnn_states' in self.outputs

            # Inputs, features and placeholders
            rnn_states = self.outputs['rnn_states']
            current_power = self.features['current_power']              # tf.int32   - (b,)
            value_target = self.features['value_target']                # tf.float32 - (b,)
            stop_gradient_all = self.placeholders['stop_gradient_all']

            # Optionally preventing the value gradients from flowing back into the rnn states
            if hparams['stop_gradient_value']:
                value_h0 = tf.stop_gradient(rnn_states)
            else:
                value_h0 = rnn_states
            first_position_state = value_h0[:, 0, :]                    # (b, lstm_size)

            # Linear with relu, then linear without relu
            hidden_values = tf.layers.Dense(units=hparams['value_h1_size'],                 # (b, value_h1_size)
                                            use_bias=True,
                                            activation=tf.nn.relu)(first_position_state)
            values_per_power = tf.layers.Dense(units=NB_POWERS,                             # (b, NB_POWERS)
                                               use_bias=True,
                                               activation=None)(hidden_values)

            # Keeping only the value of the current power
            power_selector = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)
            state_value = tf.reduce_sum(power_selector * values_per_power, axis=-1)         # (b,)

            # Computing value loss (mean squared error, with its gradient stopped if requested)
            with tf.variable_scope('value_loss'):
                mean_squared_error = tf.reduce_mean(tf.square(value_target - state_value))
                value_loss = tf.cond(stop_gradient_all,
                                     lambda: tf.stop_gradient(mean_squared_error),
                                     lambda: mean_squared_error)

        # Building output tags
        outputs = {}
        outputs['tag/value/v003_rnn_step_0'] = True
        outputs['state_value'] = state_value
        outputs['value_loss'] = value_loss

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
Esempio n. 17
0
    def _build_policy_final(self):
        """ Builds the policy model (final step)

            Reads the features, the placeholders and the outputs stored by the earlier steps, then builds under the
            'policy' variable scope:
                - a decoder driven by a `CustomHelper` (its results are kept in `training_results`),
                - a diverse beam search decoder using a second transformer cell created with `reuse=True`,
                - the policy loss (a sequence loss over the candidate logits).
            The resulting tensors are registered through `self.add_meta_information`.
            :return: Nothing
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.attention import StaticAttentionWrapper
        from diplomacy_research.models.layers.beam_decoder import DiverseBeamSearchDecoder
        from diplomacy_research.models.layers.decoder import CandidateBasicDecoder
        from diplomacy_research.models.layers.dropout import SeededDropoutWrapper
        from diplomacy_research.models.layers.dynamic_decode import dynamic_decode
        from diplomacy_research.models.layers.initializers import uniform
        from diplomacy_research.models.layers.transformer import TransformerCell
        from diplomacy_research.models.layers.wrappers import IdentityCell
        from diplomacy_research.models.policy.order_based.helper import CustomHelper, CustomBeamHelper
        from diplomacy_research.utils.tensorflow import cross_entropy, sequence_loss, to_int32, to_float, get_tile_beam

        # Quick function to retrieve hparams and placeholders and function shorthands
        hps = lambda hparam_name: self.hparams[hparam_name]
        pholder = lambda placeholder_name: self.placeholders[placeholder_name]

        # Building the policy graph (final step)
        with tf.variable_scope('policy', reuse=tf.AUTO_REUSE):
            # Ops are placed on the worker device when a cluster config is available (default placement otherwise)
            with tf.device(self.cluster_config.worker_device if self.
                           cluster_config else None):

                # Features
                player_seeds = self.features['player_seed']  # tf.int32 - (b,)
                temperature = self.features['temperature']  # tf.flt32 - (b,)
                dropout_rates = self.features[
                    'dropout_rate']  # tf.flt32 - (b,)

                # Placeholders
                stop_gradient_all = pholder('stop_gradient_all')

                # Outputs (from initial steps)
                batch_size = self.outputs['batch_size']
                board_alignments = self.outputs['board_alignments']
                decoder_inputs = self.outputs['decoder_inputs']
                decoder_type = self.outputs['decoder_type']
                raw_decoder_lengths = self.outputs['raw_decoder_lengths']
                decoder_lengths = self.outputs['decoder_lengths']
                board_state_conv = self.outputs['board_state_conv']
                order_embedding = self.outputs['order_embedding']
                candidate_embedding = self.outputs['candidate_embedding']
                candidates = self.outputs['candidates']
                max_candidate_length = self.outputs['max_candidate_length']

                # Creating the position embedding - [NB_SUPPLY_CENTERS, trsf_emb_size]
                # (always created here; self.outputs is not checked for an existing one)
                # Embeddings needs to be cached locally on the worker, otherwise TF can't compute their gradients
                with tf.variable_scope('position_embedding_scope'):
                    caching_device = self.cluster_config.caching_device if self.cluster_config else None
                    position_embedding = uniform(
                        name='position_embedding',
                        shape=[NB_SUPPLY_CENTERS,
                               hps('trsf_emb_size')],
                        scale=1.,
                        caching_device=caching_device)

                # Past Attentions
                # Both are None in this model; they are passed as-is to the transformer cells below
                past_attentions, message_lengths = None, None

                # --- Decoding ---
                with tf.variable_scope('decoder_scope', reuse=tf.AUTO_REUSE):
                    feeder_cell = IdentityCell(
                        output_size=hps('trsf_emb_size') + hps('attn_size'))

                    # ======== Regular Decoding ========
                    # Applying Dropout to input, attention and output
                    feeder_cell = SeededDropoutWrapper(
                        cell=feeder_cell,
                        seeds=player_seeds,
                        input_keep_probs=1. - dropout_rates,
                        variational_recurrent=hps('use_v_dropout'),
                        input_size=hps('trsf_emb_size') + hps('attn_size'),
                        dtype=tf.float32)

                    # Apply attention over orderable location at each position
                    feeder_cell = StaticAttentionWrapper(
                        cell=feeder_cell,
                        memory=board_state_conv,
                        alignments=board_alignments,
                        sequence_length=raw_decoder_lengths,
                        output_attention=False)

                    # Setting initial state
                    feeder_cell_init_state = feeder_cell.zero_state(
                        batch_size, tf.float32)

                    # ---- Helper ----
                    # The helper is fed every decoder input except the last one along axis 1
                    # (the targets further below are the same tensor shifted by one position)
                    helper = CustomHelper(
                        decoder_type=decoder_type,
                        inputs=decoder_inputs[:, :-1],
                        order_embedding=order_embedding,
                        candidate_embedding=candidate_embedding,
                        sequence_length=decoder_lengths,
                        candidates=candidates,
                        time_major=False,
                        softmax_temperature=temperature)

                    # ---- Transformer Cell ----
                    # First cell built under this scope name (reuse=False); the beam search cell below uses the
                    # same scope name with reuse=True
                    trsf_scope = tf.VariableScope(
                        name='policy/training_scope/transformer', reuse=False)
                    transformer_cell = TransformerCell(
                        nb_layers=hps('trsf_nb_layers'),
                        nb_heads=hps('trsf_nb_heads'),
                        word_embedding=order_embedding,
                        position_embedding=position_embedding,
                        batch_size=batch_size,
                        feeder_cell=feeder_cell,
                        feeder_init_state=feeder_cell_init_state,
                        past_attentions=past_attentions,
                        past_seq_lengths=message_lengths,
                        scope=trsf_scope,
                        name='transformer')
                    transformer_cell_init_state = transformer_cell.zero_state(
                        batch_size, tf.float32)

                    # ---- Invariants ----
                    # Shape passed to dynamic_decode for 'past_attentions' (batch and sequence length are unknown)
                    invariants_map = {
                        'past_attentions':
                        tf.TensorShape([
                            None,  # batch size
                            hps('trsf_nb_layers'),  # nb_layers
                            2,  # key, value
                            hps('trsf_nb_heads'),  # nb heads
                            None,  # Seq len
                            hps('trsf_emb_size') // hps('trsf_nb_heads')  # head size
                        ])
                    }

                    # ---- Decoder ----
                    # Mask built from the raw lengths, with a width equal to the largest value in decoder_lengths
                    sequence_mask = tf.sequence_mask(
                        raw_decoder_lengths,
                        maxlen=tf.reduce_max(decoder_lengths),
                        dtype=tf.float32)
                    maximum_iterations = NB_SUPPLY_CENTERS
                    model_decoder = CandidateBasicDecoder(
                        cell=transformer_cell,
                        helper=helper,
                        initial_state=transformer_cell_init_state,
                        max_candidate_length=max_candidate_length,
                        extract_state=True)
                    training_results, _, _ = dynamic_decode(
                        decoder=model_decoder,
                        output_time_major=False,
                        maximum_iterations=maximum_iterations,
                        invariants_map=invariants_map,
                        swap_memory=hps('swap_memory'))
                    # Snapshot of the global variables, compared after the beam search decoder is built
                    global_vars_after_decoder = set(tf.global_variables())

                    # ======== Beam Search Decoding ========
                    # Batch-level tensors are passed through tile_beam and batch sizes are multiplied by beam_width
                    tile_beam = get_tile_beam(hps('beam_width'))
                    beam_feeder_cell = IdentityCell(
                        output_size=hps('trsf_emb_size') + hps('attn_size'))

                    # Applying Dropout to input, attention and output
                    beam_feeder_cell = SeededDropoutWrapper(
                        cell=beam_feeder_cell,
                        seeds=tile_beam(player_seeds),
                        input_keep_probs=tile_beam(1. - dropout_rates),
                        variational_recurrent=hps('use_v_dropout'),
                        input_size=hps('trsf_emb_size') + hps('attn_size'),
                        dtype=tf.float32)

                    # Apply attention over orderable location at each position
                    beam_feeder_cell = StaticAttentionWrapper(
                        cell=beam_feeder_cell,
                        memory=tile_beam(board_state_conv),
                        alignments=tile_beam(board_alignments),
                        sequence_length=tile_beam(raw_decoder_lengths),
                        output_attention=False)

                    # Setting initial state
                    beam_feeder_init_state = beam_feeder_cell.zero_state(
                        batch_size * hps('beam_width'), tf.float32)

                    # ---- Transformer Cell ----
                    # Same scope name as the cell above, this time with reuse=True
                    # NOTE(review): tile_beam receives None for past_attentions / message_lengths here - this relies
                    # on tile_beam accepting None; confirm against its implementation
                    trsf_scope = tf.VariableScope(
                        name='policy/training_scope/transformer', reuse=True)
                    beam_trsf_cell = TransformerCell(
                        nb_layers=hps('trsf_nb_layers'),
                        nb_heads=hps('trsf_nb_heads'),
                        word_embedding=order_embedding,
                        position_embedding=position_embedding,
                        batch_size=batch_size * hps('beam_width'),
                        feeder_cell=beam_feeder_cell,
                        feeder_init_state=beam_feeder_init_state,
                        past_attentions=tile_beam(past_attentions),
                        past_seq_lengths=tile_beam(message_lengths),
                        scope=trsf_scope,
                        name='transformer')
                    beam_trsf_cell_init_state = beam_trsf_cell.zero_state(
                        batch_size * hps('beam_width'), tf.float32)

                    # ---- Beam Helper and Decoder ----
                    beam_helper = CustomBeamHelper(
                        cell=beam_trsf_cell,
                        order_embedding=order_embedding,
                        candidate_embedding=candidate_embedding,
                        candidates=candidates,
                        sequence_length=decoder_lengths,
                        initial_state=beam_trsf_cell_init_state,
                        beam_width=hps('beam_width'))
                    beam_decoder = DiverseBeamSearchDecoder(
                        beam_helper=beam_helper,
                        sequence_length=decoder_lengths,
                        nb_groups=hps('beam_groups'))
                    beam_results, beam_state, _ = dynamic_decode(
                        decoder=beam_decoder,
                        output_time_major=False,
                        maximum_iterations=maximum_iterations,
                        invariants_map=invariants_map,
                        swap_memory=hps('swap_memory'))

                    # Making sure we haven't created new global variables
                    # (i.e. building the beam search decoder did not add variables to the graph)
                    assert not set(
                        tf.global_variables()
                    ) - global_vars_after_decoder, 'New global vars were created'

                    # Processing results
                    candidate_logits = training_results.rnn_output  # (b, dec_len, max_cand_len)
                    logits_length = tf.shape(candidate_logits)[1]  # dec_len
                    # Targets are the decoder inputs shifted by one position, cut to the length of the logits
                    decoder_target = decoder_inputs[:, 1:1 + logits_length]

                    # Selected tokens are the token that was actually fed at the next position
                    # sample_mask is 1. where sample_id == -1; at those positions the target token is used,
                    # elsewhere the sample_id itself is used. Positions outside sequence_mask are set to 0.
                    sample_mask = to_float(
                        tf.math.equal(training_results.sample_id, -1))
                    selected_tokens = to_int32(
                        sequence_mask *
                        (sample_mask * to_float(decoder_target) +
                         (1. - sample_mask) *
                         to_float(training_results.sample_id)))

                    # Computing ArgMax tokens
                    # The one-hot of the argmax index multiplied by `candidates` and summed over the last axis
                    # picks the candidate stored at the argmax index
                    argmax_id = to_int32(tf.argmax(candidate_logits, axis=-1))
                    max_nb_candidate = tf.shape(candidate_logits)[2]
                    candidate_ids = \
                        tf.reduce_sum(tf.one_hot(argmax_id, max_nb_candidate, dtype=tf.int32) * candidates, axis=-1)
                    argmax_tokens = to_int32(
                        to_float(candidate_ids) * sequence_mask)

                    # Extracting the position of the target candidate
                    # (argmax over the last axis of the equality between the token and `candidates`)
                    # NOTE(review): presumably yields index 0 when no candidate matches - confirm
                    tokens_labels = tf.argmax(
                        to_int32(
                            tf.math.equal(selected_tokens[:, :, None],
                                          candidates)), -1)
                    target_labels = tf.argmax(
                        to_int32(
                            tf.math.equal(decoder_target[:, :, None],
                                          candidates)), -1)

                    # Log Probs
                    # Negative cross-entropy of the selected tokens, multiplied by sequence_mask
                    log_probs = -1. * cross_entropy(
                        logits=candidate_logits,
                        labels=tokens_labels) * sequence_mask

                # Computing policy loss
                with tf.variable_scope('policy_loss'):
                    policy_loss = sequence_loss(logits=candidate_logits,
                                                targets=target_labels,
                                                weights=sequence_mask,
                                                average_across_batch=True,
                                                average_across_timesteps=True)
                    # When the 'stop_gradient_all' placeholder is true, the loss is wrapped in tf.stop_gradient
                    policy_loss = tf.cond(
                        stop_gradient_all,
                        lambda: tf.stop_gradient(policy_loss),  # pylint: disable=cell-var-from-loop
                        lambda: policy_loss)  # pylint: disable=cell-var-from-loop

        # Building output tags
        # 'draw_prob' falls back to zeros shaped like the 'draw_target' feature when no earlier step stored it
        outputs = {
            'tag/policy/order_based/v015_film_transformer_gpt':
            True,
            'targets':
            decoder_inputs[:, 1:],
            'selected_tokens':
            selected_tokens,
            'argmax_tokens':
            argmax_tokens,
            'logits':
            candidate_logits,
            'log_probs':
            log_probs,
            'beam_tokens':
            tf.transpose(beam_results.predicted_ids,
                         perm=[0, 2, 1]),  # [batch, beam, steps]
            'beam_log_probs':
            beam_state.log_probs,
            'rnn_states':
            training_results.rnn_state,
            'policy_loss':
            policy_loss,
            'draw_prob':
            self.outputs.get('draw_prob',
                             tf.zeros_like(self.features['draw_target'])),
            'learning_rate':
            self.learning_rate
        }

        # Adding features, placeholders and outputs to graph
        self.add_meta_information(outputs)
Esempio n. 18
0
    def _get_board_value(self,
                         board_state,
                         current_power,
                         name='board_state_value',
                         reuse=None):
        """ Estimates the value of a board state from the point of view of a given power
            :param board_state: The board state - (batch, NB_NODES, NB_FEATURES)
            :param current_power: The power for which we want the board value - (batch,)
            :param name: The name to use for the operation (the variable scope is 'value/<name>')
            :param reuse: Whether to reuse or not the weights from another operation
            :return: The value of the board state for the specified power - (batch,)
        """
        from diplomacy_research.utils.tensorflow import tf
        from diplomacy_research.models.layers.graph_convolution import GraphConvolution, preprocess_adjacency

        def hparam(key):
            """ Shorthand to look up a hyper-parameter """
            return self.hparams[key]

        activation = tf.nn.relu

        # Normalized adjacency matrix, repeated once for every item of the batch
        adjacency = tf.expand_dims(preprocess_adjacency(get_adjacency_matrix()), axis=0)
        nb_items = tf.shape(board_state)[0]
        tiled_adjacency = tf.tile(adjacency, [nb_items, 1, 1])

        # No need to use 'stop_gradient_value' - Because this model does not share parameters.
        outer_scope = tf.VariableScope(name='value/%s' % name, reuse=reuse)
        with tf.variable_scope(outer_scope):

            # One graph convolution over the board, flattened and projected to the value embedding
            with tf.variable_scope('graph_conv_scope'):
                nb_features = board_state.shape[-1].value
                gcn_size = hparam('value_gcn_1_output_size')
                gcn_layer = GraphConvolution(input_dim=nb_features,
                                             output_dim=gcn_size,
                                             norm_adjacency=tiled_adjacency,
                                             activation_fn=activation,
                                             bias=True)
                node_features = gcn_layer(board_state)                          # (b, NB_NODES, gcn_1)
                flat_features = tf.reshape(node_features, shape=[-1, NB_NODES * gcn_size])
                embedding_layer = tf.layers.Dense(units=hparam('value_embedding_size'),
                                                  activation=activation,
                                                  use_bias=True)
                board_embedding = embedding_layer(flat_features)                # (b, value_emb_size)

            # Two hidden layers, then one output per power; only the requested power's output is kept
            with tf.variable_scope('value_scope'):
                power_mask = tf.one_hot(current_power, NB_POWERS, dtype=tf.float32)

                hidden = board_embedding                                        # (b, value_emb_size)
                for size_key in ('value_h1_size', 'value_h2_size'):
                    hidden_layer = tf.layers.Dense(units=hparam(size_key),
                                                   activation=activation,
                                                   use_bias=True)
                    hidden = hidden_layer(hidden)                               # (b, value_h1_size / value_h2_size)

                output_layer = tf.layers.Dense(units=NB_POWERS,
                                               activation=None,
                                               use_bias=True)
                per_power_value = output_layer(hidden)                          # (b, NB_POWERS)
                power_value = tf.reduce_sum(per_power_value * power_mask, axis=1)   # (b,)

        # Returning
        return power_value