def build_ac():
  """Build the actor-critic heads.

  NOTE: a closure; X, inputs, ac_space and nc come from the enclosing scope.
  """
  # make body
  with tf.variable_scope('body'):
    x = tfc_layers.fully_connected(X, nc.fc_ch_dim, scope='fc0')
    hs = None
    if nc.use_lstm:
      with tf.variable_scope('lstm_embed'):
        x, hs = tp_layers.lstm_embed_block(inputs_x=x,
                                           inputs_hs=inputs.S,
                                           inputs_mask=inputs.M,
                                           nc=nc)

  # make action head
  with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
    size = ac_space.shape[0]
    mean = tfc_layers.fully_connected(
        x, size,
        activation_fn=tf.tanh,
        normalizer_fn=None,
        scope='mean',
    )
    # rescale the tanh output from [-1, 1] to [ac_space.low, ac_space.high]
    mean = (ac_space.high + ac_space.low) * 0.5 + mean * (
        ac_space.high - ac_space.low) * 0.5
    # a state-independent logstd, shared across the batch
    logstd = tf.get_variable(name='logstd', shape=[1, size],
                             initializer=tf.zeros_initializer())
    # `mean * 0.0 + logstd` broadcasts logstd to the batch dimension
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    head = tp_layers.to_action_head(pdparam, DiagGaussianPdType)
    head = head._replace(sam=tf.clip_by_value(
        head.sam, ac_space.low, ac_space.high))

  # make value head
  self_vf = None
  outer_vf = None
  if nc.use_value_head:
    # value conditioned on the net's own (greedy) action
    with tf.variable_scope('vf'):
      self_vf = tfc_layers.fully_connected(
          tf.concat([X, head.argmax], axis=-1), nc.fc_ch_dim)
      self_vf = tfc_layers.fully_connected(self_vf, nc.n_v,
                                           activation_fn=None,
                                           normalizer_fn=None)
    # value conditioned on the externally fed action, sharing vf variables
    with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
      outer_vf = tfc_layers.fully_connected(
          tf.concat([X, inputs.A], axis=-1), nc.fc_ch_dim)
      outer_vf = tfc_layers.fully_connected(outer_vf, nc.n_v,
                                            activation_fn=None,
                                            normalizer_fn=None)
  return head, self_vf, outer_vf, hs, logstd

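# Why the mean rescaling above works: the tanh layer emits values in (-1, 1),
# and the affine map (high + low)/2 + mean * (high - low)/2 sends -1 to
# ac_space.low and +1 to ac_space.high, while `mean * 0.0 + logstd` merely
# broadcasts the shared [1, size] logstd to the batch. A minimal numpy sketch
# (illustration only; the numbers are made up and not part of the net):
def _demo_mean_rescaling():
  import numpy as np
  low, high = np.array([-2., 0.]), np.array([2., 1.])
  raw = np.tanh(np.array([[0.3, -1.2], [2.0, 0.1]]))    # in (-1, 1)
  mean = (high + low) * 0.5 + raw * (high - low) * 0.5  # in (low, high)
  logstd = np.zeros((1, 2))                             # shared across batch
  pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)  # [2, 4]
  assert np.all(mean > low) and np.all(mean < high)
  return pdparam
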
def mnet_v6d6_heads(inputs: MNetV6Inputs,
                    inputs_embed: MNetV6Embed,
                    embed_sc: MNetV6EmbedScope,
                    consts: MNetV6Consts,
                    coord_sys,
                    nc: MNetV6Config,
                    scope=None):
  # shorter names
  inputs_obs, inputs_act = inputs.X, getattr(inputs, 'A', None)
  embed = inputs_embed

  with tf.variable_scope(scope, default_name='mnet_v6d6_heads'):
    # use or create scalar_context
    if embed.vec_embed.ab_mask_embed is None:
      scalar_context = tfc_layers.fully_connected(
          tp_ops.to_float32(inputs_obs['MASK_AB']), 64)
    else:
      scalar_context = embed.vec_embed.ab_mask_embed
    # update scalar_context
    scalar_context = tf.concat([scalar_context, embed.zstat_embed], axis=-1)

    # make ability action head: level 1
    with tf.variable_scope('ability'):
      # create embeddings for the action heads
      if nc.embed_for_action_heads == 'int':
        emb_for_heads = embed.int_embed
      elif nc.embed_for_action_heads == 'lstm':
        emb_for_heads = embed.lstm_embed
      else:
        raise NotImplementedError(
            'Unknown nc.embed_for_action_heads {}'.format(
                nc.embed_for_action_heads))
      # NOTE: comparable to v5, use layer_norm
      o = _pre_discrete_action_res_block(emb_for_heads, nc.enc_dim,
                                         n_blk=nc.ab_n_blk,
                                         n_skip=nc.ab_n_skip)
      if nc.use_astar_glu:
        ab_head = tp_layers.discrete_action_head_v2(
            inputs=o,
            n_actions=nc.ab_dim,
            pdtype_cls=CategoricalPdType,
            context=scalar_context,
            mask=inputs_obs['MASK_AB'],  # fine to pass again for hard masking
            temperature=nc.temperature,
            scope='action_head')
      else:
        ab_head = tp_layers.discrete_action_head(
            inputs=o,
            n_actions=nc.ab_dim,
            enc_dim=nc.enc_dim,
            pdtype_cls=CategoricalPdType,
            mask=inputs_obs['MASK_AB'],
            embed_scope=None,
            temperature=nc.temperature,
            scope='action_head')

    # make noop action head: auto-reg level 2
    ab_taken = (inputs_act['A_AB'] if inputs_act is not None
                else ab_head.sam)
    mw = _action_mask_weights(inputs_ab=ab_taken,
                              inputs_arg_mask=consts.arg_mask,
                              weights_include_ab=True)
    structured_mw = tp_utils.pack_sequence_as_structure_like_gym_space(
        nc.ac_space, mw)
    ab_taken_embed = tp_layers.linear_embed(ab_taken,
                                            vocab_size=nc.ab_dim,
                                            enc_size=nc.enc_dim,
                                            scope=embed_sc.ab_embed_sc)
    if nc.use_astar_glu:
      # create regressive embeddings gated on scalar_context
      reg_embed = tp_layers.glu(emb_for_heads, scalar_context, 1024)
      reg_embed += tp_layers.glu(ab_taken_embed, scalar_context, 1024)
    else:
      reg_embed = tfc_layers.fully_connected(emb_for_heads, 1024)
      reg_embed += tfc_layers.fully_connected(ab_taken_embed, 1024)
    # smoothing discrete head for noop
    with tf.variable_scope('noop_num'):
      # NOTE: comparable to v5, use bottleneck
      noop_logits = _pre_discrete_action_fc_block(inputs=reg_embed,
                                                  n_actions=nc.noop_dim,
                                                  enc_dim=nc.enc_dim,
                                                  n_blk=2)
      noop_head = tp_layers.to_action_head(noop_logits, CategoricalPdType)

    # make shift action head: auto-reg level 3
    noop_taken = (inputs_act['A_NOOP_NUM'] if inputs_act is not None
                  else noop_head.sam)
    noop_taken_embed = tp_layers.linear_embed(
        noop_taken,
        vocab_size=nc.noop_dim,
        enc_size=nc.enc_dim,
        scope=embed_sc.noop_num_embed_sc)
    # reg_embed = tf.concat([reg_embed, noop_taken_embed], axis=-1)
    reg_embed += tfc_layers.fully_connected(noop_taken_embed, 1024)
    with tf.variable_scope('shift'):
      o = _pre_discrete_action_res_block(reg_embed, nc.enc_dim,
                                         n_blk=1, n_skip=2)
      sft_head = tp_layers.discrete_action_head(
          inputs=o,
          n_actions=nc.shift_dim,
          enc_dim=nc.enc_dim,
          pdtype_cls=CategoricalPdType,
          embed_scope=None,
          temperature=nc.temperature,
          scope='shift_head')

    # make selection action head: auto-reg level 4
    sft_taken = (inputs_act['A_SHIFT'] if inputs_act is not None
                 else sft_head.sam)
    # sft_taken_embed = tp_ops.to_float32(tf.expand_dims(sft_taken, axis=-1))
    # reg_embed = tf.concat([reg_embed, sft_taken_embed], axis=-1)
    sft_taken_embed = tp_layers.linear_embed(sft_taken,
                                             vocab_size=2,
                                             enc_size=1024,
                                             scope="sft_embed")
    reg_embed += sft_taken_embed

    # create func embed
    if nc.use_astar_func_embed:
      with tf.variable_scope('func_embed',
                             reuse=tf.AUTO_REUSE) as func_embed_sc:
        pass
      # selection func_embed per AStar
      select_func_embed = tf.nn.embedding_lookup(
          consts.select_type_func_mask, ab_taken)
      select_func_embed = tfc_layers.fully_connected(
          tf.cast(select_func_embed, tf.float32),
          nc.enc_dim,
          activation_fn=tf.nn.relu,
          scope=func_embed_sc)
      # target unit func_embed per AStar
      tar_u_func_embed = tf.nn.embedding_lookup(
          consts.tar_u_type_func_mask, ab_taken)
      tar_u_func_embed = tfc_layers.fully_connected(
          tf.cast(tar_u_func_embed, tf.float32),
          nc.enc_dim,
          activation_fn=tf.nn.relu,
          scope=func_embed_sc)

    with tf.variable_scope('select'):
      s_mask = fetch_op(inputs_obs['MASK_SELECTION'], ab_taken)
      s_keys = tfc_layers.fully_connected(embed.units_embed.units_embed, 32,
                                          activation_fn=None,
                                          scope='selection_raw_keys')
      # make ground-truth selection labels (if any)
      selection_labels = (inputs_act['A_SELECT']
                          if inputs_act is not None else None)
      # get the head and the updated s_embed
      if nc.use_astar_func_embed:
        s_head, reg_embed = tp_layers.sequential_selection_head_v2(
            inputs=reg_embed,
            inputs_select_mask=s_mask,
            input_keys=s_keys,
            input_selections=selection_labels,
            input_func_embed=select_func_embed,
            max_num=64,
            temperature=nc.temperature,
            scope='selection_head')
      else:
        s_head, reg_embed = tp_layers.sequential_selection_head(
            inputs=reg_embed,
            inputs_select_mask=s_mask,
            input_keys=s_keys,
            input_selections=selection_labels,
            max_num=64,
            temperature=nc.temperature,
            scope='selection_head')
      # reg_embed = tf.concat([reg_embed, s_embed], axis=-1)

    # make cmd_u action head: auto-reg level 5
    gathered_reg_embed = reg_embed
    gathered_units_embed = embed.units_embed.units_embed
    gathered_map_skip = embed.spa_embed.map_skip
    with tf.variable_scope("cmd_u"):
      # NOTE: comparable with v5
      ind = None
      if nc.gather_batch:
        mask = structured_mw['A_CMD_UNIT']
        ind = tf.cast(tf.where(mask), tf.int32)[:, 0]
        gathered_reg_embed = tf.gather(reg_embed, ind)
        inputs_ptr_mask = tf.gather_nd(
            inputs_obs['MASK_CMD_UNIT'],
            tf.stack([ind, tf.gather(ab_taken, ind)], axis=1))
        gathered_units_embed = tf.gather(embed.units_embed.units_embed, ind)
        if nc.use_astar_func_embed:
          tar_u_func_embed = tf.gather(tar_u_func_embed, ind)
      else:
        inputs_ptr_mask = fetch_op(inputs_obs['MASK_CMD_UNIT'], ab_taken)
      cmd_u_inputs = _pre_ptr_action_res_block(gathered_reg_embed,
                                               nc.enc_dim, n_blk=1, n_skip=2)
      if nc.use_astar_func_embed:
        cmd_u_head = tp_layers.ptr_action_head_v2(
            inputs_query=cmd_u_inputs,
            inputs_ptr_mask=inputs_ptr_mask,
            inputs_entity_embed=gathered_units_embed,
            inputs_func_embed=tar_u_func_embed,
            ptr_out_dim=nc.tar_unit_dim,
            pdtype_cls=CategoricalPdType,
            temperature=nc.temperature,
            scatter_ind=ind,
            scatter_bs=nc.batch_size,
            scope='cmd_u_head')
      else:
        cmd_u_head = tp_layers.ptr_action_head(
            inputs_query=cmd_u_inputs,
            inputs_ptr_mask=inputs_ptr_mask,
            inputs_entity_embed=gathered_units_embed,
            ptr_out_dim=nc.tar_unit_dim,
            num_dec_blocks=1,
            ff_dim=nc.enc_dim,
            enc_dim=nc.enc_dim,
            pdtype_cls=CategoricalPdType,
            temperature=nc.temperature,
            scatter_ind=ind,
            scatter_bs=nc.batch_size,
            scope='cmd_u_head')

    # cmd_pos: auto-reg level 5
    ch_dim = nc.spa_ch_dim
    with tf.variable_scope("pos"):
      # common pos embedding
      ind = None
      if nc.gather_batch:
        mask = structured_mw['A_CMD_POS']
        ind = tf.cast(tf.where(mask), tf.int32)[:, 0]
        gathered_reg_embed = tf.gather(reg_embed, ind)
        gathered_map_skip = [
            tf.gather(map_skip, ind)
            for map_skip in embed.spa_embed.map_skip
        ]
        loc_masks = tf.gather_nd(
            inputs_obs['MASK_CMD_POS'],
            tf.stack([ind, tf.gather(ab_taken, ind)], axis=1))
      else:
        loc_masks = fetch_op(inputs_obs['MASK_CMD_POS'], ab_taken)
      # pos embedding with shared variables
      with tf.variable_scope('cmd_pos'):
        # TODO: Astar-like pos head
        pos_inputs = _pre_loc_action_astar_like_block_v1(
            gathered_reg_embed,
            gathered_map_skip[-1],
            n_blk=nc.pos_n_blk,
            n_skip=nc.pos_n_skip)
        pos_head = tp_layers.loc_action_head(
            inputs=pos_inputs,
            mask=loc_masks,
            pdtype_cls=CategoricalPdType,
            temperature=nc.temperature,
            logits_mode=nc.pos_logits_mode,
            scatter_ind=ind,
            scatter_bs=nc.batch_size,
            scope='pos_head')

  return tp_utils.pack_sequence_as_structure_like_gym_space(
      nc.ac_space,
      [ab_head, noop_head, sft_head, s_head, cmd_u_head, pos_head]
  ), structured_mw

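# Every auto-regressive level above follows the same pattern: take the
# ground-truth action when inputs_act is given (teacher forcing at training
# time) or the head's own sample when acting, embed it, and fold it back into
# reg_embed additively before building the next head. A schematic sketch of
# that pattern; embed_fn and the head object are hypothetical stand-ins, not
# the tp_layers API:
def _autoreg_level_sketch(reg_embed, head, labels, embed_fn):
  # teacher forcing during training, self-sampling during rollout
  taken = labels if labels is not None else head.sam
  # fold the taken action back into the running auto-regressive embedding
  return reg_embed + embed_fn(taken), taken
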
def cont_nn(inputs: ContNNInputs,
            nc: ContNNConfig,
            scope=None) -> ContNNOutputs:
  """Create the whole net for simple MLPs."""
  with tf.variable_scope(scope, default_name='soccer') as sc:
    # NOTE: use name_scope, in case multiple parameter-sharing nets are built
    net_name_scope = tf.get_default_graph().get_name_scope()
    endpoints_collections = net_name_scope + '_endpoints'
    X = inputs.X
    if nc.n_player == 1:
      X = (X,)
      ac_spaces = (nc.ac_space,)
    else:
      ac_spaces = tuple(nc.ac_space.spaces)
    y = []
    heads = []
    for input_, ac_space in zip(X, ac_spaces):
      with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
        x = tfc_layers.fully_connected(input_, nc.spa_ch_dim,
                                       activation_fn=tf.nn.relu, scope="fc1")
        x = tfc_layers.fully_connected(x, nc.spa_ch_dim,
                                       activation_fn=tf.nn.relu, scope="fc2")
        x = tfc_layers.fully_connected(x, nc.spa_ch_dim,
                                       activation_fn=tf.nn.relu, scope="fc3")
        x = tfc_layers.fully_connected(x, nc.spa_ch_dim,
                                       activation_fn=tf.nn.relu, scope="fc4")
        y.append(x)
      # make action head
      with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
        pdtype = make_pdtype(ac_space)
        pdparams = tfc_layers.fully_connected(
            x,
            pdtype.param_shape()[0],  # not ac_space.shape[0]; see note below
            activation_fn=None,
            normalizer_fn=None,
            scope='pdparams')
        head = tp_layers.to_action_head(pdparams, DiagGaussianPdType)
        heads.append(head)
    y = tf.concat(y, axis=1)
    heads = tp_utils.pack_sequence_as_structure_like_gym_space(nc.ac_space,
                                                               heads)
    if nc.n_player == 1:
      heads = heads[0]

    # make value head
    vf = None
    if nc.use_value_head:
      with tf.variable_scope('vf'):
        vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
        vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
        vf = tfc_layers.fully_connected(vf, nc.n_v,
                                        activation_fn=None,
                                        normalizer_fn=None)

    # make loss
    loss = None
    if nc.use_loss_type == 'rl':
      # regularization loss
      total_reg_loss = tf.losses.get_regularization_losses(scope=sc.name)
      with tf.variable_scope('losses'):
        # ppo loss
        assert nc.n_player == 1
        neglogp = head.pd.neglogp(inputs.A)
        ppo_loss, value_loss = tp_losses.ppo_loss(
            neglogp=neglogp,
            oldneglogp=inputs.neglogp,
            vpred=vf,
            R=inputs.R,
            V=inputs.V,
            masks=None,
            reward_weights=None,
            adv_normalize=True,
            sync_statistics=nc.sync_statistics
        )
        # entropy loss
        entropy_loss = head.ent
        loss_endpoints = {}
        loss = ContNNLosses(
            total_reg_loss=total_reg_loss,
            pg_loss=ppo_loss,
            value_loss=value_loss,
            entropy_loss=entropy_loss,
            loss_endpoints=loss_endpoints
        )

    # collect vars, endpoints, etc.
    trainable_vars = _make_vars(sc)
    endpoints = OrderedDict()  # TODO
  return ContNNOutputs(
      self_fed_heads=heads,
      outer_fed_heads=heads,
      loss=loss,
      vars=trainable_vars,
      endpoints=endpoints,
      value_head=vf
  )

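# Note on pdtype.param_shape()[0] above: a diagonal Gaussian over an
# n-dimensional Box needs 2 * n parameters (n means plus n logstds), so the
# pdparams layer is twice as wide as the action itself; this is why the
# commented-out ac_space.shape[0] would be wrong. A quick check, assuming a
# gym-style Box space (illustrative only, not part of the net):
def _demo_gaussian_param_width():
  import numpy as np
  from gym import spaces
  ac_space = spaces.Box(low=-1., high=1., shape=(3,), dtype=np.float32)
  n = ac_space.shape[0]
  # mean (n) + logstd (n) -> the fully_connected layer must output 2 * n
  assert 2 * n == 6
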
def conv_lstm(inputs: ConvLstmInputs,
              nc: ConvLstmConfig,
              scope=None) -> ConvLstmOutputs:
  """Create the whole net for conv-lstm."""
  with tf.variable_scope(scope, default_name='pommerman') as sc:
    # NOTE: use name_scope, in case multiple parameter-sharing nets are built
    net_name_scope = tf.get_default_graph().get_name_scope()
    endpoints_collections = net_name_scope + '_endpoints'
    X = inputs.X
    if nc.n_player == 1:
      X = (X,)
      ac_spaces = (nc.ac_space,)
    else:
      ac_spaces = tuple(nc.ac_space.spaces)
    S = tf.split(inputs.S, nc.n_player, axis=1)

    # make body
    y = []
    hs_new = []
    heads = []
    if nc.use_lstm and nc.n_player > 1:
      # temporarily shrink the hidden-state config to a per-player share
      nc.hs_len //= nc.n_player
      nc.nlstm //= nc.n_player
    for input_, s, ac_space in zip(X, S, ac_spaces):
      with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
        x = tfc_layers.conv2d(input_[0], nc.spa_ch_dim, [3, 3], scope='conv0')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim, [5, 5], scope='conv1')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim * 2, [3, 3], scope='conv2')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim * 2, [5, 5], scope='conv3')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim * 4, [3, 3], scope='conv4')
        # gather the feature column at the agent's own (row, col) position
        pos = tf.to_int32(input_[1])
        ind = tf.concat(
            [tf.expand_dims(tf.range(nc.batch_size), 1), pos], axis=1)
        x = tf.gather_nd(x, ind)
        if nc.use_lstm:
          with tf.variable_scope('lstm_embed'):
            x, hs = tp_layers.lstm_embed_block(
                inputs_x=x, inputs_hs=s, inputs_mask=inputs.M, nc=nc)
          hs_new.append(hs)
        y.append(x)
      # make action head
      with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
        head_logits = tfc_layers.fully_connected(x, ac_space.n,
                                                 activation_fn=None,
                                                 normalizer_fn=None,
                                                 scope='logits')
        # an action mask, if provided, comes as the 3rd observation element
        if len(input_) > 2:
          head_logits = tp_ops.mask_logits(head_logits, input_[2])
        head = tp_layers.to_action_head(head_logits, CategoricalPdType)
        heads.append(head)
    if nc.use_lstm:
      hs_new = tf.concat(hs_new, axis=1)
      if nc.n_player > 1:
        # restore the full hidden-state config
        nc.hs_len *= nc.n_player
        nc.nlstm *= nc.n_player
    y = tf.concat(y, axis=1)
    heads = tp_utils.pack_sequence_as_structure_like_gym_space(
        nc.ac_space, heads)
    if nc.n_player == 1:
      heads = heads[0]

    # make value head
    vf = None
    if nc.use_value_head:
      assert nc.n_player == 2
      with tf.variable_scope('vf'):
        vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
        vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
        vf = tfc_layers.fully_connected(vf, nc.n_v,
                                        activation_fn=None,
                                        normalizer_fn=None)

    # make loss
    loss = None
    if nc.use_loss_type in ['rl', 'rl_ppo', 'rl_vtrace']:
      assert nc.n_player == 2
      with tf.variable_scope('losses'):
        # regularization loss
        total_reg_loss = tf.losses.get_regularization_losses(scope=sc.name)
        # entropy loss
        entropy_loss = nest.map_structure_up_to(
            ac_spaces, lambda head: tf.reduce_mean(head.ent), heads)
        # ppo loss
        neglogp = nest.map_structure_up_to(
            ac_spaces, lambda head, ac: head.pd.neglogp(ac), heads, inputs.A)
        loss_endpoints = {}
        for k, v in enumerate(entropy_loss):
          loss_endpoints['ent_' + str(k)] = v
        if nc.use_loss_type in ('rl', 'rl_ppo'):
          pg_loss, value_loss = tp_losses.ppo_loss(
              neglogp=neglogp,
              oldneglogp=inputs.neglogp,
              vpred=vf,
              R=inputs.R,
              V=inputs.V,
              masks=None,
              reward_weights=nc.reward_weights,
              adv_normalize=True,
              sync_statistics=nc.sync_statistics)
        elif nc.use_loss_type == 'rl_vtrace':
          def _batch_to_TB(tsr):
            # (nrollout * rollout_len,) flat batch -> time-major (T, B)
            return tf.transpose(
                tf.reshape(tsr, shape=(nc.nrollout, nc.rollout_len)))

          lam = tf.convert_to_tensor(nc.lam, tf.float32)
          vpred_list = [_batch_to_TB(v)
                        for v in tf.split(vf, nc.n_v, axis=1)]
          reward_list = [_batch_to_TB(r)
                         for r in tf.split(inputs.r, nc.n_v, axis=1)]
          discounts = _batch_to_TB(inputs.discount)
          # one td-lambda value loss per value head
          value_loss = []
          for values, rewards in zip(vpred_list, reward_list):
            value_loss.append(
                tp_losses.td_lambda(values, rewards, discounts, lam=lam))
          value_loss = tf.stack(value_loss)
          neglogp_list = [_batch_to_TB(nlp)
                          for nlp in nest.flatten(neglogp)]
          oldneglogp_list = [_batch_to_TB(onlp)
                             for onlp in nest.flatten(inputs.neglogp)]
          # shape the multi-head values/rewards into a single scalar stream
          shaped_values = tf.matmul(vf, nc.reward_weights, transpose_b=True)
          shaped_rewards = tf.matmul(inputs.r, nc.reward_weights,
                                     transpose_b=True)
          values = tf.transpose(
              tf.reshape(shaped_values, shape=(nc.nrollout, nc.rollout_len)))
          rewards = tf.transpose(
              tf.reshape(shaped_rewards, shape=(nc.nrollout, nc.rollout_len)))
          pg_loss = tf.reduce_sum([
              tp_losses.vtrace_loss(nlp, onlp, None, values, rewards,
                                    discounts, 1.0, 1.0)
              for onlp, nlp in zip(oldneglogp_list, neglogp_list)
          ])
          upgo_loss = tp_losses.upgo_loss(tf.stack(neglogp_list, axis=-1),
                                          tf.stack(oldneglogp_list, axis=-1),
                                          None,
                                          vpred_list[0],
                                          reward_list[0],
                                          discounts)
          loss_endpoints['upgo_loss'] = upgo_loss
        loss_endpoints['pg_loss'] = pg_loss
        if len(value_loss.shape) == 0:
          loss_endpoints['value_loss'] = value_loss
        else:
          for i in range(value_loss.shape[0]):
            loss_endpoints['value_loss_' + str(i)] = value_loss[i]
        loss = ConvLstmLosses(total_reg_loss=total_reg_loss,
                              pg_loss=pg_loss,
                              value_loss=value_loss,
                              entropy_loss=entropy_loss,
                              loss_endpoints=loss_endpoints)

    # collect vars, endpoints, etc.
    trainable_vars = _make_vars(sc)
    endpoints = OrderedDict()  # TODO
  return ConvLstmOutputs(self_fed_heads=heads,
                         outer_fed_heads=heads,
                         S=hs_new,
                         loss=loss,
                         vars=trainable_vars,
                         endpoints=endpoints,
                         value_head=vf)

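# _batch_to_TB above assumes the flat batch is laid out rollout-major (all
# steps of rollout 0, then all steps of rollout 1, ...); reshaping to
# (nrollout, rollout_len) and transposing yields the time-major (T, B) layout
# that td-lambda / vtrace style ops consume. A numpy sketch with made-up
# sizes (illustrative only):
def _demo_batch_to_TB():
  import numpy as np
  nrollout, rollout_len = 2, 3
  flat = np.arange(nrollout * rollout_len)    # [r0t0 r0t1 r0t2 r1t0 r1t1 r1t2]
  tb = flat.reshape(nrollout, rollout_len).T  # shape (T=3, B=2)
  assert tb[0, 1] == 3                        # t=0 of rollout 1
  return tb
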
def conv_lstm(inputs: ConvLstmInputs,
              nc: ConvLstmConfig,
              scope=None) -> ConvLstmOutputs:
  """Create the whole net for conv-lstm."""
  with tf.variable_scope(scope, default_name='pommerman') as sc:
    # NOTE: use name_scope, in case multiple parameter-sharing nets are built
    net_name_scope = tf.get_default_graph().get_name_scope()
    endpoints_collections = net_name_scope + '_endpoints'
    X = inputs.X
    if nc.n_player == 1:
      X = (X,)
      ac_spaces = (nc.ac_space,)
    else:
      ac_spaces = tuple(nc.ac_space.spaces)
    S = tf.split(inputs.S, nc.n_player, axis=1)

    # make body
    y = []
    hs_new = []
    heads = []
    for input_, s, ac_space in zip(X, S, ac_spaces):
      with tf.variable_scope('body', reuse=tf.AUTO_REUSE):
        x = tfc_layers.conv2d(input_[0], nc.spa_ch_dim, [3, 3], scope='conv0')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim, [5, 5], scope='conv1')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim * 2, [3, 3], scope='conv2')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim * 2, [5, 5], scope='conv3')
        x = tfc_layers.conv2d(x, nc.spa_ch_dim * 4, [3, 3], scope='conv4')
        # gather the feature column at the agent's own (row, col) position
        pos = tf.to_int32(input_[1])
        ind = tf.concat(
            [tf.expand_dims(tf.range(nc.batch_size), 1), pos], axis=1)
        x = tf.gather_nd(x, ind)
        if nc.use_lstm:
          with tf.variable_scope('lstm_embed'):
            x, hs = _lstm_embed_block(inputs_x=x,
                                      inputs_hs=s,
                                      inputs_mask=inputs.M,
                                      nc=nc)
          hs_new.append(hs)
        y.append(x)
      # make action head
      with tf.variable_scope('action', reuse=tf.AUTO_REUSE):
        head_logits = tfc_layers.fully_connected(x, ac_space.n,
                                                 activation_fn=None,
                                                 normalizer_fn=None,
                                                 scope='logits')
        # an action mask, if provided, comes as the 3rd observation element
        if len(input_) > 2:
          head_logits = tp_ops.mask_logits(head_logits, input_[2])
        head = tp_layers.to_action_head(head_logits, CategoricalPdType)
        heads.append(head)
    if nc.use_lstm:
      hs_new = tf.concat(hs_new, axis=1)
    y = tf.concat(y, axis=1)
    heads = tp_utils.pack_sequence_as_structure_like_gym_space(
        nc.ac_space, heads)
    if nc.n_player == 1:
      heads = heads[0]

    # make value head
    vf = None
    if nc.use_value_head:
      with tf.variable_scope('vf'):
        vf = tfc_layers.fully_connected(y, nc.spa_ch_dim * 4)
        vf = tfc_layers.fully_connected(vf, nc.spa_ch_dim * 2)
        vf = tfc_layers.fully_connected(vf, nc.n_v,
                                        activation_fn=None,
                                        normalizer_fn=None)

    # make loss
    loss = None
    if nc.use_loss_type == 'rl':
      # regularization loss
      total_reg_loss = tf.losses.get_regularization_losses(scope=sc.name)
      with tf.variable_scope('losses'):
        # ppo loss
        neglogp = nest.map_structure_up_to(
            ac_spaces, lambda head, ac: head.pd.neglogp(ac), heads, inputs.A)
        ppo_loss, value_loss = tp_losses.ppo_loss(
            neglogp=neglogp,
            oldneglogp=inputs.neglogp,
            vpred=vf,
            R=inputs.R,
            V=inputs.V,
            masks=None,
            reward_weights=None,
            adv_normalize=True,
            sync_statistics=nc.sync_statistics)
        # entropy loss
        entropy_loss = nest.map_structure_up_to(
            ac_spaces, lambda head: tf.reduce_mean(head.ent), heads)
        loss_endpoints = {}
        loss = ConvLstmLosses(total_reg_loss=total_reg_loss,
                              pg_loss=ppo_loss,
                              value_loss=value_loss,
                              entropy_loss=entropy_loss,
                              loss_endpoints=loss_endpoints)

    # collect vars, endpoints, etc.
    trainable_vars = _make_vars(sc)
    endpoints = OrderedDict()  # TODO
  return ConvLstmOutputs(self_fed_heads=heads,
                         outer_fed_heads=heads,
                         S=hs_new,
                         loss=loss,
                         vars=trainable_vars,
                         endpoints=endpoints,
                         value_head=vf)

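# The mask_logits call above is the usual invalid-action trick: disallowed
# actions receive a very negative logit so softmax assigns them ~zero
# probability. A minimal numpy sketch of the idea (not the tp_ops
# implementation):
def _demo_mask_logits():
  import numpy as np
  def mask_logits(logits, mask, neg=-1e9):
    # mask: 1 for legal actions, 0 for illegal ones
    return np.where(mask > 0, logits, neg)
  p = np.exp(mask_logits(np.zeros(4), np.array([1, 0, 1, 0])))
  p /= p.sum()
  assert p[1] < 1e-6 and p[3] < 1e-6  # illegal actions get ~zero probability
  return p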