Example #1
 def build_ac():
     # make body
     with tf.variable_scope('body'):
         x = tfc_layers.fully_connected(X, nc.fc_ch_dim, scope='fc0')
         hs = None
         if nc.use_lstm:
             with tf.variable_scope('lstm_embed'):
                 x, hs = tp_layers.lstm_embed_block(inputs_x=x,
                                                    inputs_hs=inputs.S,
                                                    inputs_mask=inputs.M,
                                                    nc=nc)
         # make action head
         with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
             size = ac_space.shape[0]
             mean = tfc_layers.fully_connected(
                 x,
                 size,
                 activation_fn=tf.tanh,
                 normalizer_fn=None,
                 scope='mean',
             )
             mean = (ac_space.high + ac_space.low) * 0.5 + mean * (
                 ac_space.high - ac_space.low) * 0.5
             logstd = tf.get_variable(name='logstd',
                                      shape=[1, size],
                                      initializer=tf.zeros_initializer())
             pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
             head = tp_layers.to_action_head(pdparam, DiagGaussianPdType)
             head = head._replace(sam=tf.clip_by_value(
                 head.sam, ac_space.low, ac_space.high))
     # make value head
     self_vf = None
     outer_vf = None
     if nc.use_value_head:
         with tf.variable_scope('vf'):
             self_vf = tfc_layers.fully_connected(
                 tf.concat([X, head.argmax], axis=-1), nc.fc_ch_dim)
             self_vf = tfc_layers.fully_connected(self_vf,
                                                  nc.n_v,
                                                  activation_fn=None,
                                                  normalizer_fn=None)
         with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
             outer_vf = tfc_layers.fully_connected(
                 tf.concat([X, inputs.A], axis=-1), nc.fc_ch_dim)
             outer_vf = tfc_layers.fully_connected(outer_vf,
                                                   nc.n_v,
                                                   activation_fn=None,
                                                   normalizer_fn=None)
     return head, self_vf, outer_vf, hs, logstd
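
The action head above follows a common pattern for continuous (Box) action spaces: squash the mean into (-1, 1) with tanh, rescale it into the action bounds, pair it with a single state-independent logstd variable broadcast over the batch, and clip the sampled action back into the bounds. A minimal standalone sketch of that parameterization, assuming TF 1.x graph mode and made-up bounds (tf.layers.dense stands in for tfc_layers.fully_connected; all names here are illustrative only):

import numpy as np
import tensorflow as tf

low = np.array([-1.0, -2.0], dtype=np.float32)    # assumed Box bounds
high = np.array([1.0, 2.0], dtype=np.float32)
size = low.shape[0]

feat = tf.placeholder(tf.float32, [None, 16], name='features')
raw_mean = tf.layers.dense(feat, size, activation=tf.tanh)        # mean in (-1, 1)
mean = (high + low) * 0.5 + raw_mean * (high - low) * 0.5         # rescale into [low, high]
logstd = tf.get_variable('logstd', shape=[1, size],
                         initializer=tf.zeros_initializer())      # state-independent std
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)          # broadcast logstd per batch row
sample = mean + tf.exp(logstd) * tf.random_normal(tf.shape(mean))
sample = tf.clip_by_value(sample, low, high)                      # mirrors head._replace(sam=...)
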
Example #2
def mnet_v6d6_heads(inputs: MNetV6Inputs,
                    inputs_embed: MNetV6Embed,
                    embed_sc: MNetV6EmbedScope,
                    consts: MNetV6Consts,
                    coord_sys,
                    nc: MNetV6Config,
                    scope=None):
    # shorter names
    inputs_obs, inputs_act = inputs.X, getattr(inputs, 'A', None)
    embed = inputs_embed

    with tf.variable_scope(scope, default_name='mnet_v6d6_heads'):
        # use or create scalar_context
        if embed.vec_embed.ab_mask_embed is None:
            scalar_context = tfc_layers.fully_connected(
                tp_ops.to_float32(inputs_obs['MASK_AB']), 64)
        else:
            scalar_context = embed.vec_embed.ab_mask_embed
        # update scalar_context
        scalar_context = tf.concat([scalar_context, embed.zstat_embed],
                                   axis=-1)

        # make ability action head: level 1
        with tf.variable_scope('ability'):
            # create embeddings for the action heads
            if nc.embed_for_action_heads == 'int':
                emb_for_heads = embed.int_embed
            elif nc.embed_for_action_heads == 'lstm':
                emb_for_heads = embed.lstm_embed
            else:
                raise NotImplementedError(
                    'Unknown nc.embed_for_action_heads {}'.format(
                        nc.embed_for_action_heads))

            # NOTE: comparable to v5, use layer_norm
            o = _pre_discrete_action_res_block(emb_for_heads,
                                               nc.enc_dim,
                                               n_blk=nc.ab_n_blk,
                                               n_skip=nc.ab_n_skip)
            if nc.use_astar_glu:
                ab_head = tp_layers.discrete_action_head_v2(
                    inputs=o,
                    n_actions=nc.ab_dim,
                    pdtype_cls=CategoricalPdType,
                    context=scalar_context,
                    mask=inputs_obs[
                        'MASK_AB'],  # fine to pass again for hard masking
                    temperature=nc.temperature,
                    scope='action_head')
            else:
                ab_head = tp_layers.discrete_action_head(
                    inputs=o,
                    n_actions=nc.ab_dim,
                    enc_dim=nc.enc_dim,
                    pdtype_cls=CategoricalPdType,
                    mask=inputs_obs['MASK_AB'],
                    embed_scope=None,
                    temperature=nc.temperature,
                    scope='action_head')

        # make noop action head: auto-reg level 2
        ab_taken = (inputs_act['A_AB']
                    if inputs_act is not None else ab_head.sam)
        mw = _action_mask_weights(inputs_ab=ab_taken,
                                  inputs_arg_mask=consts.arg_mask,
                                  weights_include_ab=True)
        structured_mw = tp_utils.pack_sequence_as_structure_like_gym_space(
            nc.ac_space, mw)
        ab_taken_embed = tp_layers.linear_embed(ab_taken,
                                                vocab_size=nc.ab_dim,
                                                enc_size=nc.enc_dim,
                                                scope=embed_sc.ab_embed_sc)

        if nc.use_astar_glu:
            # create regressive embeddings gated on scalar_context
            reg_embed = tp_layers.glu(emb_for_heads, scalar_context, 1024)
            reg_embed += tp_layers.glu(ab_taken_embed, scalar_context, 1024)
        else:
            reg_embed = tfc_layers.fully_connected(emb_for_heads, 1024)
            reg_embed += tfc_layers.fully_connected(ab_taken_embed, 1024)

        # smoothing discrete head for noop
        with tf.variable_scope('noop_num'):
            # NOTE: comparable to v5, use bottleneck
            noop_logits = _pre_discrete_action_fc_block(inputs=reg_embed,
                                                        n_actions=nc.noop_dim,
                                                        enc_dim=nc.enc_dim,
                                                        n_blk=2)
            noop_head = tp_layers.to_action_head(noop_logits,
                                                 CategoricalPdType)

        # make shift action head: auto-reg level 3
        noop_taken = (inputs_act['A_NOOP_NUM']
                      if inputs_act is not None else noop_head.sam)
        noop_taken_embed = tp_layers.linear_embed(
            noop_taken,
            vocab_size=nc.noop_dim,
            enc_size=nc.enc_dim,
            scope=embed_sc.noop_num_embed_sc)
        # reg_embed = tf.concat([reg_embed, noop_taken_embed], axis=-1)
        reg_embed += tfc_layers.fully_connected(noop_taken_embed, 1024)
        with tf.variable_scope('shift'):
            o = _pre_discrete_action_res_block(reg_embed,
                                               nc.enc_dim,
                                               n_blk=1,
                                               n_skip=2)
            sft_head = tp_layers.discrete_action_head(
                inputs=o,
                n_actions=nc.shift_dim,
                enc_dim=nc.enc_dim,
                pdtype_cls=CategoricalPdType,
                embed_scope=None,
                temperature=nc.temperature,
                scope='shift_head')

        # make selection action head: auto-reg level 4
        sft_taken = (inputs_act['A_SHIFT']
                     if inputs_act is not None else sft_head.sam)
        # sft_taken_embed = tp_ops.to_float32(tf.expand_dims(sft_taken, axis=-1))
        # reg_embed = tf.concat([reg_embed, sft_taken_embed], axis=-1)
        sft_taken_embed = tp_layers.linear_embed(sft_taken,
                                                 vocab_size=2,
                                                 enc_size=1024,
                                                 scope="sft_embed")
        reg_embed += sft_taken_embed

        # create func embed
        if nc.use_astar_func_embed:
            with tf.variable_scope('func_embed',
                                   reuse=tf.AUTO_REUSE) as func_embed_sc:
                pass
            # selection func_embed per AStar
            select_func_embed = tf.nn.embedding_lookup(
                consts.select_type_func_mask, ab_taken)
            select_func_embed = tfc_layers.fully_connected(
                tf.cast(select_func_embed, tf.float32),
                nc.enc_dim,
                activation_fn=tf.nn.relu,
                scope=func_embed_sc)
            # target unit func_embed per AStar
            tar_u_func_embed = tf.nn.embedding_lookup(
                consts.tar_u_type_func_mask, ab_taken)
            tar_u_func_embed = tfc_layers.fully_connected(
                tf.cast(tar_u_func_embed, tf.float32),
                nc.enc_dim,
                activation_fn=tf.nn.relu,
                scope=func_embed_sc)

        with tf.variable_scope('select'):
            s_mask = fetch_op(inputs_obs['MASK_SELECTION'], ab_taken)
            s_keys = tfc_layers.fully_connected(embed.units_embed.units_embed,
                                                32,
                                                activation_fn=None,
                                                scope='selection_raw_keys')
            # make ground-truth selection labels (if any)
            selection_labels = (inputs_act['A_SELECT']
                                if inputs_act is not None else None)
            # get the head and the updated s_embed
            if nc.use_astar_func_embed:
                s_head, reg_embed = tp_layers.sequential_selection_head_v2(
                    inputs=reg_embed,
                    inputs_select_mask=s_mask,
                    input_keys=s_keys,
                    input_selections=selection_labels,
                    input_func_embed=select_func_embed,
                    max_num=64,
                    temperature=nc.temperature,
                    scope='selection_head')
            else:
                s_head, reg_embed = tp_layers.sequential_selection_head(
                    inputs=reg_embed,
                    inputs_select_mask=s_mask,
                    input_keys=s_keys,
                    input_selections=selection_labels,
                    max_num=64,
                    temperature=nc.temperature,
                    scope='selection_head')
            # reg_embed = tf.concat([reg_embed, s_embed], axis=-1)

        # make cmd_u action head: auto-reg level 5
        gathered_reg_embed = reg_embed
        gathered_units_embed = embed.units_embed.units_embed
        gathered_map_skip = embed.spa_embed.map_skip
        with tf.variable_scope("cmd_u"):
            # NOTE: comparable with v5
            ind = None
            if nc.gather_batch:
                mask = structured_mw['A_CMD_UNIT']
                ind = tf.cast(tf.where(mask), tf.int32)[:, 0]
                gathered_reg_embed = tf.gather(reg_embed, ind)
                inputs_ptr_mask = tf.gather_nd(
                    inputs_obs['MASK_CMD_UNIT'],
                    tf.stack([ind, tf.gather(ab_taken, ind)], axis=1))
                gathered_units_embed = tf.gather(embed.units_embed.units_embed,
                                                 ind)
                if nc.use_astar_func_embed:
                    tar_u_func_embed = tf.gather(tar_u_func_embed, ind)
            else:
                inputs_ptr_mask = fetch_op(inputs_obs['MASK_CMD_UNIT'],
                                           ab_taken)
            cmd_u_inputs = _pre_ptr_action_res_block(gathered_reg_embed,
                                                     nc.enc_dim,
                                                     n_blk=1,
                                                     n_skip=2)
            if nc.use_astar_func_embed:
                cmd_u_head = tp_layers.ptr_action_head_v2(
                    inputs_query=cmd_u_inputs,
                    inputs_ptr_mask=inputs_ptr_mask,
                    inputs_entity_embed=gathered_units_embed,
                    inputs_func_embed=tar_u_func_embed,
                    ptr_out_dim=nc.tar_unit_dim,
                    pdtype_cls=CategoricalPdType,
                    temperature=nc.temperature,
                    scatter_ind=ind,
                    scatter_bs=nc.batch_size,
                    scope='cmd_u_head')
            else:
                cmd_u_head = tp_layers.ptr_action_head(
                    inputs_query=cmd_u_inputs,
                    inputs_ptr_mask=inputs_ptr_mask,
                    inputs_entity_embed=gathered_units_embed,
                    ptr_out_dim=nc.tar_unit_dim,
                    num_dec_blocks=1,
                    ff_dim=nc.enc_dim,
                    enc_dim=nc.enc_dim,
                    pdtype_cls=CategoricalPdType,
                    temperature=nc.temperature,
                    scatter_ind=ind,
                    scatter_bs=nc.batch_size,
                    scope='cmd_u_head')

        # cmd_pos: auto-reg level 5
        ch_dim = nc.spa_ch_dim
        with tf.variable_scope("pos"):
            # common pos embedding
            ind = None
            if nc.gather_batch:
                mask = structured_mw['A_CMD_POS']
                ind = tf.cast(tf.where(mask), tf.int32)[:, 0]
                gathered_reg_embed = tf.gather(reg_embed, ind)
                gathered_map_skip = [
                    tf.gather(map_skip, ind)
                    for map_skip in embed.spa_embed.map_skip
                ]
                loc_masks = tf.gather_nd(
                    inputs_obs['MASK_CMD_POS'],
                    tf.stack([ind, tf.gather(ab_taken, ind)], axis=1))
            else:
                loc_masks = fetch_op(inputs_obs['MASK_CMD_POS'], ab_taken)
            # pos embedding with shared variables
            with tf.variable_scope('cmd_pos'):
                # TODO: Astar-like pos head
                pos_inputs = _pre_loc_action_astar_like_block_v1(
                    gathered_reg_embed,
                    gathered_map_skip[-1],
                    n_blk=nc.pos_n_blk,
                    n_skip=nc.pos_n_skip)
                pos_head = tp_layers.loc_action_head(
                    inputs=pos_inputs,
                    mask=loc_masks,
                    pdtype_cls=CategoricalPdType,
                    temperature=nc.temperature,
                    logits_mode=nc.pos_logits_mode,
                    scatter_ind=ind,
                    scatter_bs=nc.batch_size,
                    scope='pos_head')
    return tp_utils.pack_sequence_as_structure_like_gym_space(
        nc.ac_space, [
            ab_head,
            noop_head,
            sft_head,
            s_head,
            cmd_u_head,
            pos_head,
        ]), structured_mw
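
The cmd_u and pos branches use nc.gather_batch to avoid running an argument head on batch rows whose chosen ability does not need that argument: tf.where/tf.gather pull out the relevant rows, the head runs on the reduced batch, and scatter_ind/scatter_bs let the tp_layers head scatter its outputs back to full batch size. A toy sketch of that gather/compute/scatter pattern with plain TF ops (shapes and the `relevant` mask are made up; in the code above the scatter step happens inside the head layers):

import tensorflow as tf

batch_size, feat_dim, n_out = 8, 32, 5
reg_embed = tf.random_normal([batch_size, feat_dim])
relevant = tf.constant([True, False, True, True, False, False, True, False])

ind = tf.cast(tf.where(relevant), tf.int32)[:, 0]          # rows that need this head
gathered = tf.gather(reg_embed, ind)                       # compute only on those rows
gathered_logits = tf.layers.dense(gathered, n_out)
full_logits = tf.scatter_nd(tf.expand_dims(ind, 1),        # scatter back to the full batch;
                            gathered_logits,               # skipped rows keep zero logits
                            shape=[batch_size, n_out])
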
Example #3
def cont_nn(inputs: ContNNInputs,
            nc: ContNNConfig,
            scope=None) -> ContNNOutputs:
  """create the whole net for simple MLPs"""
  with tf.variable_scope(scope, default_name='soccer') as sc:
    # NOTE: use name_scope, in case multiple parameter-sharing nets are built
    net_name_scope = tf.get_default_graph().get_name_scope()
    endpoints_collections = net_name_scope + '_endpoints'
    X = inputs.X
    if nc.n_player == 1:
      X = (X,)
      ac_spaces = (nc.ac_space,)
    else:
      ac_spaces = tuple(nc.ac_space.spaces)
    y = []
    heads = []
    for input, ac_space in zip(X, ac_spaces):
      with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
        x = tfc_layers.fully_connected(input, nc.spa_ch_dim, activation_fn=tf.nn.relu, scope="fc1")
        x = tfc_layers.fully_connected(x, nc.spa_ch_dim, activation_fn=tf.nn.relu, scope="fc2")
        x = tfc_layers.fully_connected(x, nc.spa_ch_dim, activation_fn=tf.nn.relu, scope="fc3")
        x = tfc_layers.fully_connected(x, nc.spa_ch_dim, activation_fn=tf.nn.relu, scope="fc4")
        y.append(x)

      # make action head
      with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
        pdtype = make_pdtype(ac_space)
        pdparams = tfc_layers.fully_connected(x,
                                              pdtype.param_shape()[0],
                                              activation_fn=None,
                                              normalizer_fn=None,
                                              scope='pdparams')
        head = tp_layers.to_action_head(pdparams, DiagGaussianPdType)
        heads.append(head)

    y = tf.concat(y, axis=1)
    heads = tp_utils.pack_sequence_as_structure_like_gym_space(nc.ac_space,
                                                               heads)
    if nc.n_player == 1:
      heads = heads[0]
    # make value head
    vf = None
    if nc.use_value_head:
      with tf.variable_scope('vf'):
        vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
        vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
        vf = tfc_layers.fully_connected(vf, nc.n_v, activation_fn=None,
                                        normalizer_fn=None)
    # make loss
    loss = None
    if nc.use_loss_type == 'rl':
      # regularization loss
      total_reg_loss = tf.losses.get_regularization_losses(scope=sc.name)
      with tf.variable_scope('losses'):
        # ppo loss
        assert nc.n_player == 1
        neglogp = head.pd.neglogp(inputs.A)
        ppo_loss, value_loss = tp_losses.ppo_loss(
          neglogp=neglogp,
          oldneglogp=inputs.neglogp,
          vpred=vf,
          R=inputs.R,
          V=inputs.V,
          masks=None,
          reward_weights=None,
          adv_normalize=True,
          sync_statistics=nc.sync_statistics
        )
        # entropy loss
        entropy_loss = head.ent
        loss_endpoints = {}
        loss = ContNNLosses(
          total_reg_loss=total_reg_loss,
          pg_loss=ppo_loss,
          value_loss=value_loss,
          entropy_loss=entropy_loss,
          loss_endpoints=loss_endpoints
        )
        # collect vars, endpoints, etc.
    trainable_vars = _make_vars(sc)
    endpoints = OrderedDict()  # TODO
  return ContNNOutputs(
    self_fed_heads=heads,
    outer_fed_heads=heads,
    loss=loss,
    vars=trainable_vars,
    endpoints=endpoints,
    value_head=vf
  )
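
The PPO loss above needs head.pd.neglogp(inputs.A), i.e. the negative log-likelihood of the taken action under the diagonal Gaussian defined by (mean, logstd). For reference, a sketch of that quantity written with plain TF ops, under the standard parameterization (the library's own Pd class is assumed to compute the equivalent):

import numpy as np
import tensorflow as tf

def diag_gaussian_neglogp(a, mean, logstd):
    # -log N(a | mean, diag(exp(logstd))**2), summed over action dimensions
    return (0.5 * tf.reduce_sum(tf.square((a - mean) / tf.exp(logstd)), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(a)[-1], tf.float32)
            + tf.reduce_sum(logstd, axis=-1))
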
Example #4
def conv_lstm(inputs: ConvLstmInputs,
              nc: ConvLstmConfig,
              scope=None) -> ConvLstmOutputs:
    """create the whole net for conv-lstm"""
    with tf.variable_scope(scope, default_name='pommerman') as sc:
        # NOTE: use name_scope, in case multiple parameter-sharing nets are built
        net_name_scope = tf.get_default_graph().get_name_scope()
        endpoints_collections = net_name_scope + '_endpoints'
        X = inputs.X
        if nc.n_player == 1:
            X = (X, )
            ac_spaces = (nc.ac_space, )
        else:
            ac_spaces = tuple(nc.ac_space.spaces)
        S = tf.split(inputs.S, nc.n_player, axis=1)
        # make body
        y = []
        hs_new = []
        heads = []
        if nc.use_lstm and nc.n_player > 1:
            nc.hs_len //= nc.n_player
            nc.nlstm //= nc.n_player
        for input, s, ac_space in zip(X, S, ac_spaces):
            with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
                x = tfc_layers.conv2d(input[0],
                                      nc.spa_ch_dim, [3, 3],
                                      scope='conv0')
                x = tfc_layers.conv2d(x, nc.spa_ch_dim, [5, 5], scope='conv1')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [3, 3],
                                      scope='conv2')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [5, 5],
                                      scope='conv3')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 4, [3, 3],
                                      scope='conv4')
                pos = tf.to_int32(input[1])
                ind = tf.concat(
                    [tf.expand_dims(tf.range(nc.batch_size), 1), pos], axis=1)
                x = tf.gather_nd(x, ind)
                if nc.use_lstm:
                    with tf.variable_scope('lstm_embed'):
                        x, hs = tp_layers.lstm_embed_block(
                            inputs_x=x,
                            inputs_hs=s,
                            inputs_mask=inputs.M,
                            nc=nc)
                        hs_new.append(hs)
                y.append(x)

            # make action head
            with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
                head_logits = tfc_layers.fully_connected(x,
                                                         ac_space.n,
                                                         activation_fn=None,
                                                         normalizer_fn=None,
                                                         scope='logits')
                if len(input) > 1:
                    head_logits = tp_ops.mask_logits(head_logits, input[2])
                head = tp_layers.to_action_head(head_logits, CategoricalPdType)
                heads.append(head)

        if nc.use_lstm:
            hs_new = tf.concat(hs_new, axis=1)
            if nc.n_player > 1:
                nc.hs_len *= nc.n_player
                nc.nlstm *= nc.n_player
        y = tf.concat(y, axis=1)
        heads = tp_utils.pack_sequence_as_structure_like_gym_space(
            nc.ac_space, heads)
        if nc.n_player == 1:
            heads = heads[0]
        # make value head
        vf = None
        if nc.use_value_head:
            assert nc.n_player == 2
            with tf.variable_scope('vf'):
                vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
                vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
                vf = tfc_layers.fully_connected(vf,
                                                nc.n_v,
                                                activation_fn=None,
                                                normalizer_fn=None)
        # make loss
        loss = None
        if nc.use_loss_type in ['rl', 'rl_ppo', 'rl_vtrace']:
            assert nc.n_player == 2
            with tf.variable_scope('losses'):
                # regularization loss
                total_reg_loss = tf.losses.get_regularization_losses(
                    scope=sc.name)
                # entropy loss
                entropy_loss = nest.map_structure_up_to(
                    ac_spaces, lambda head: tf.reduce_mean(head.ent), heads)
                # ppo loss
                neglogp = nest.map_structure_up_to(
                    ac_spaces, lambda head, ac: head.pd.neglogp(ac), heads,
                    inputs.A)
                loss_endpoints = {}
                for k, v in enumerate(entropy_loss):
                    loss_endpoints['ent_' + str(k)] = v
                if nc.use_loss_type == 'rl' or nc.use_loss_type == 'rl_ppo':
                    pg_loss, value_loss = tp_losses.ppo_loss(
                        neglogp=neglogp,
                        oldneglogp=inputs.neglogp,
                        vpred=vf,
                        R=inputs.R,
                        V=inputs.V,
                        masks=None,
                        reward_weights=nc.reward_weights,
                        adv_normalize=True,
                        sync_statistics=nc.sync_statistics)
                elif nc.use_loss_type == 'rl_vtrace':

                    def _batch_to_TB(tsr):
                        return tf.transpose(
                            tf.reshape(tsr,
                                       shape=(nc.nrollout, nc.rollout_len)))

                    lam = tf.convert_to_tensor(nc.lam, tf.float32)
                    vpred_list = [
                        _batch_to_TB(v) for v in tf.split(vf, nc.n_v, axis=1)
                    ]
                    reward_list = [
                        _batch_to_TB(r)
                        for r in tf.split(inputs.r, nc.n_v, axis=1)
                    ]
                    discounts = _batch_to_TB(inputs.discount)
                    value_loss = []
                    for values, rewards in zip(vpred_list, reward_list):
                        value_loss.append(
                            tp_losses.td_lambda(values,
                                                rewards,
                                                discounts,
                                                lam=lam))
                    value_loss = tf.stack(value_loss)

                    neglogp_list = [
                        _batch_to_TB(neglogp)
                        for neglogp in nest.flatten(neglogp)
                    ]
                    oldneglogp_list = [
                        _batch_to_TB(oldneglogp)
                        for oldneglogp in nest.flatten(inputs.neglogp)
                    ]
                    shaped_values = tf.matmul(vf,
                                              nc.reward_weights,
                                              transpose_b=True)
                    shaped_rewards = tf.matmul(inputs.r,
                                               nc.reward_weights,
                                               transpose_b=True)
                    values = tf.transpose(
                        tf.reshape(shaped_values,
                                   shape=(nc.nrollout, nc.rollout_len)))
                    rewards = tf.transpose(
                        tf.reshape(shaped_rewards,
                                   shape=(nc.nrollout, nc.rollout_len)))
                    pg_loss = tf.reduce_sum([
                        tp_losses.vtrace_loss(neglogp, oldneglogp, None,
                                              values, rewards, discounts, 1.0,
                                              1.0) for oldneglogp, neglogp in
                        zip(oldneglogp_list, neglogp_list)
                    ])
                    upgo_loss = tp_losses.upgo_loss(
                        tf.stack(neglogp_list, axis=-1),
                        tf.stack(oldneglogp_list, axis=-1), None,
                        vpred_list[0], reward_list[0], discounts)
                    loss_endpoints['upgo_loss'] = upgo_loss
                loss_endpoints['pg_loss'] = pg_loss
                if len(value_loss.shape) == 0:
                    loss_endpoints['value_loss'] = value_loss
                else:
                    for i in range(value_loss.shape[0]):
                        loss_endpoints['value_loss_' + str(i)] = value_loss[i]
                loss = ConvLstmLosses(total_reg_loss=total_reg_loss,
                                      pg_loss=pg_loss,
                                      value_loss=value_loss,
                                      entropy_loss=entropy_loss,
                                      loss_endpoints=loss_endpoints)
                # collect vars, endpoints, etc.
        trainable_vars = _make_vars(sc)
        endpoints = OrderedDict()  # TODO
    return ConvLstmOutputs(self_fed_heads=heads,
                           outer_fed_heads=heads,
                           S=hs_new,
                           loss=loss,
                           vars=trainable_vars,
                           endpoints=endpoints,
                           value_head=vf)
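
The rl_vtrace branch relies on _batch_to_TB to turn the rollout-major flat batch into the time-major [rollout_len, nrollout] layout that the td_lambda, vtrace_loss and upgo_loss calls consume. A tiny NumPy check of what that reshape plus transpose does (nrollout=2 and rollout_len=3 are assumed toy values):

import numpy as np

nrollout, rollout_len = 2, 3
flat = np.arange(nrollout * rollout_len)       # [r0t0, r0t1, r0t2, r1t0, r1t1, r1t2]
tb = flat.reshape(nrollout, rollout_len).T     # shape (rollout_len, nrollout), time-major
# tb[:, 0] -> [0, 1, 2] is rollout 0 over time; tb[:, 1] -> [3, 4, 5] is rollout 1
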
Example #5
def conv_lstm(inputs: ConvLstmInputs,
              nc: ConvLstmConfig,
              scope=None) -> ConvLstmOutputs:
    """create the whole net for conv-lstm"""
    with tf.variable_scope(scope, default_name='pommerman') as sc:
        # NOTE: use name_scope, in case multiple parameter-sharing nets are built
        net_name_scope = tf.get_default_graph().get_name_scope()
        endpoints_collections = net_name_scope + '_endpoints'
        X = inputs.X
        if nc.n_player == 1:
            X = (X, )
            ac_spaces = (nc.ac_space, )
        else:
            ac_spaces = tuple(nc.ac_space.spaces)
        S = tf.split(inputs.S, nc.n_player, axis=1)
        # make body
        y = []
        hs_new = []
        heads = []
        for input, s, ac_space in zip(X, S, ac_spaces):
            with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
                x = tfc_layers.conv2d(input[0],
                                      nc.spa_ch_dim, [3, 3],
                                      scope='conv0')
                x = tfc_layers.conv2d(x, nc.spa_ch_dim, [5, 5], scope='conv1')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [3, 3],
                                      scope='conv2')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 2, [5, 5],
                                      scope='conv3')
                x = tfc_layers.conv2d(x,
                                      nc.spa_ch_dim * 4, [3, 3],
                                      scope='conv4')
                pos = tf.to_int32(input[1])
                ind = tf.concat(
                    [tf.expand_dims(tf.range(nc.batch_size), 1), pos], axis=1)
                x = tf.gather_nd(x, ind)
                if nc.use_lstm:
                    with tf.variable_scope('lstm_embed'):
                        x, hs = _lstm_embed_block(inputs_x=x,
                                                  inputs_hs=s,
                                                  inputs_mask=inputs.M,
                                                  nc=nc)
                        hs_new.append(hs)
                y.append(x)

            # make action head
            with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
                head_logits = tfc_layers.fully_connected(x,
                                                         ac_space.n,
                                                         activation_fn=None,
                                                         normalizer_fn=None,
                                                         scope='logits')
                if len(input) > 1:
                    head_logits = tp_ops.mask_logits(head_logits, input[2])
                head = tp_layers.to_action_head(head_logits, CategoricalPdType)
                heads.append(head)

        if nc.use_lstm:
            hs_new = tf.concat(hs_new, axis=1)
        y = tf.concat(y, axis=1)
        heads = tp_utils.pack_sequence_as_structure_like_gym_space(
            nc.ac_space, heads)
        if nc.n_player == 1:
            heads = heads[0]
        # make value head
        vf = None
        if nc.use_value_head:
            with tf.variable_scope('vf'):
                vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
                vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
                vf = tfc_layers.fully_connected(vf,
                                                nc.n_v,
                                                activation_fn=None,
                                                normalizer_fn=None)
        # make loss
        loss = None
        if nc.use_loss_type == 'rl':
            # regularization loss
            total_reg_loss = tf.losses.get_regularization_losses(scope=sc.name)
            with tf.variable_scope('losses'):
                # ppo loss
                neglogp = nest.map_structure_up_to(
                    ac_spaces, lambda head, ac: head.pd.neglogp(ac), heads,
                    inputs.A)
                ppo_loss, value_loss = tp_losses.ppo_loss(
                    neglogp=neglogp,
                    oldneglogp=inputs.neglogp,
                    vpred=vf,
                    R=inputs.R,
                    V=inputs.V,
                    masks=None,
                    reward_weights=None,
                    adv_normalize=True,
                    sync_statistics=nc.sync_statistics)
                # entropy loss
                entropy_loss = nest.map_structure_up_to(
                    ac_spaces, lambda head: tf.reduce_mean(head.ent), heads)
                loss_endpoints = {}
                loss = ConvLstmLosses(total_reg_loss=total_reg_loss,
                                      pg_loss=ppo_loss,
                                      value_loss=value_loss,
                                      entropy_loss=entropy_loss,
                                      loss_endpoints=loss_endpoints)
                # collect vars, endpoints, etc.
        trainable_vars = _make_vars(sc)
        endpoints = OrderedDict()  # TODO
    return ConvLstmOutputs(self_fed_heads=heads,
                           outer_fed_heads=heads,
                           S=hs_new,
                           loss=loss,
                           vars=trainable_vars,
                           endpoints=endpoints,
                           value_head=vf)
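
In both conv-lstm variants, tp_ops.mask_logits hides invalid actions before the categorical head is built. The usual way to do this is to push masked-out entries to a very negative logit so their softmax probability becomes effectively zero; a minimal sketch of that idea with plain TF ops (the library's exact implementation may differ):

import tensorflow as tf

def mask_logits_sketch(logits, mask):
    # mask: 1 for valid actions, 0 for invalid ones, same shape as logits
    neg_inf = tf.fill(tf.shape(logits), -1e9)
    return tf.where(tf.cast(mask, tf.bool), logits, neg_inf)
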