def __init__(self, input_size, hidden_size, heads,
             queries_dropout=0., keys_dropout=0., values_dropout=0.,
             causal=True, num_pos=1, **kwargs):
    """Creates a Transformer encoder layer by applying a
    multi head self attention layer

    Arguments:

    input_size: int
        the number of units in the input tensor to this layer
        also the output size of the model
    hidden_size: int
        the number of units in the hidden variables used
        in each multi head attention layer
    heads: int
        the number of heads in each multi head attention layer
        a good default is 4 or 8
    queries_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    keys_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    values_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    causal: bool
        specifies whether the transformer should decode using
        a causal mask to preserve the autoregressive property
    num_pos: int
        the number of relative positions"""
    super(EncoderWithPositionLayer, self).__init__()

    # the core attention and processing variables
    self.block0 = Block(hidden_size, input_size * 3, **kwargs)
    self.pos_embedding = tf.keras.layers.Dense(input_size, **kwargs)
    self.attention = AttentionWithBias(queries_dropout=queries_dropout,
                                       keys_dropout=keys_dropout,
                                       values_dropout=values_dropout,
                                       causal=causal)
    self.block1 = Block(hidden_size, input_size, **kwargs)

    # these parameters need to be stored so that
    # tf.keras.models.save_model works
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.heads = heads
    self.queries_dropout = queries_dropout
    self.keys_dropout = keys_dropout
    self.values_dropout = values_dropout
    self.causal = causal
    self.num_pos = num_pos
    self.kwargs = kwargs
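# Illustrative usage sketch (added for clarity, not part of the original
# source): assuming this __init__ belongs to EncoderWithPositionLayer and
# that the class is importable from this module, the layer might be built
# with sizes such as the ones below; every numeric value is a hypothetical
# example chosen to match the docstring's guidance.
#
# example_encoder_layer = EncoderWithPositionLayer(
#     input_size=256,      # width of the input / output tensors
#     hidden_size=1024,    # hidden units inside each Block
#     heads=8,             # 4 or 8 is a good default per the docstring
#     queries_dropout=0.1,
#     keys_dropout=0.1,
#     values_dropout=0.1,
#     causal=False,        # encoders typically attend bidirectionally
#     num_pos=1)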
def __init__(self, hidden_size, heads,
             queries_dropout=0., keys_dropout=0.,
             temperature=1.0, use_gumbel_noise=True, **kwargs):
    """Creates a Transformer permutation layer by applying a
    multi head sequence to matrix layer, and then creates a hard
    permutation through the Plackett-Luce distribution

    Arguments:

    hidden_size: int
        the number of units in the hidden variables used
        in each multi head attention layer
    heads: int
        the number of heads in each multi head attention layer
        a good default is 4 or 8
    queries_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    keys_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    temperature: float
        a positive number to divide the permutation logits by
        prior to sampling
    use_gumbel_noise: bool, UNUSED
        whether to apply gumbel noise to the output
        of the PermutationLayer"""
    super(PermutationPlackettLayer, self).__init__()

    # the core attention and processing variables
    self.block0 = Block(hidden_size // 2, 1, **kwargs)

    # these parameters need to be stored so that
    # tf.keras.models.save_model works
    self.hidden_size = hidden_size
    self.heads = heads
    self.queries_dropout = queries_dropout
    self.keys_dropout = keys_dropout
    self.temperature = temperature
    self.use_gumbel_noise = use_gumbel_noise
    self.kwargs = kwargs
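# Illustrative usage sketch (added for clarity, not part of the original
# source): assuming this __init__ belongs to PermutationPlackettLayer, a
# hypothetical construction could look like the following; note that per
# the docstring use_gumbel_noise is stored but currently unused, and the
# numeric values are examples only.
#
# example_permutation_layer = PermutationPlackettLayer(
#     hidden_size=1024,      # hidden units inside the internal Block
#     heads=8,               # 4 or 8 is a good default per the docstring
#     queries_dropout=0.1,
#     keys_dropout=0.1,
#     temperature=1.0,       # divides the permutation logits
#     use_gumbel_noise=True)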
def __init__(self, hidden_size, output_size,
             causal=True, logits_per_slot=1, **kwargs):
    """Creates a pointer network using the first operation
    in the self attention mechanism

    Arguments:

    hidden_size: int
        the number of hidden units in the network blocks
        used by this layer
    output_size: int
        the number of output units used by the network blocks
        used by this layer
    causal: bool
        specifies whether the transformer should decode using
        a causal mask to preserve the autoregressive property
    logits_per_slot: int
        specifies the number of logits per element the
        pointer network attends to; default is 1"""
    super(Pointer, self).__init__()

    # the core processing variables
    self.block = Block(hidden_size,
                       output_size * (1 + logits_per_slot), **kwargs)

    # these parameters need to be stored so that
    # tf.keras.models.save_model works
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.causal = causal
    self.logits_per_slot = logits_per_slot
    self.kwargs = kwargs
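# Illustrative usage sketch (added for clarity, not part of the original
# source): assuming this __init__ belongs to Pointer, the layer might be
# constructed as follows; the wrapped Block emits
# output_size * (1 + logits_per_slot) units, so output_size should match
# the feature width expected downstream. Values are hypothetical.
#
# example_pointer = Pointer(
#     hidden_size=1024,    # hidden units in the wrapped Block
#     output_size=256,     # per-element feature width
#     causal=True,         # preserve the autoregressive property
#     logits_per_slot=1)   # one pointer logit per attended element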
def __init__(self, input_size, hidden_size, heads,
             queries_dropout=0., keys_dropout=0., values_dropout=0.,
             causal=True, **kwargs):
    """Relative positional attention as in
    https://arxiv.org/pdf/1901.02860.pdf

    Besides applying this layer to the Permutation Transformer's
    encoder, we could possibly apply it to the Transformer-INDIGO's
    encoder; for simplicity, we did not explore this option in our paper

    Arguments:

    input_size: int
        the number of units in the input tensor to this layer
        also the output size of the model
    hidden_size: int
        the number of units in the hidden variables used
        in each multi head attention layer
    heads: int
        the number of heads in each multi head attention layer
        a good default is 4 or 8
    queries_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    keys_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    values_dropout: float
        the ratio of units to drop during training to the
        number of units in each attention layer
    causal: bool
        specifies whether the transformer should decode using
        a causal mask to preserve the autoregressive property"""
    super(EncoderWithPositionLayer, self).__init__()

    # the core attention and processing variables
    # self.block0 = Block(hidden_size, input_size * 3, **kwargs)
    self.relative_length = 100
    self.relative_encoding = position_encoding_relative(
        self.relative_length, input_size)  # (range(-100, 100), input_size)
    self.block0 = Block(hidden_size, input_size, lastfc=False, **kwargs)
    self.attbias0 = tf.keras.layers.Dense(heads, activation=None, **kwargs)
    self.attbias1 = tf.keras.layers.Dense(heads, activation=None, **kwargs)
    self.q0 = tf.keras.layers.Dense(input_size, activation=None, **kwargs)
    self.wke0 = tf.keras.layers.Dense(input_size, activation=None, **kwargs)
    self.wkv0 = tf.keras.layers.Dense(input_size, activation=None, **kwargs)
    self.wkr0 = tf.keras.layers.Dense(input_size, activation=None, **kwargs)
    self.attention = AttentionWithBias(queries_dropout=queries_dropout,
                                       keys_dropout=keys_dropout,
                                       values_dropout=values_dropout,
                                       causal=causal)
    self.block1 = Block(hidden_size, input_size, **kwargs)

    # these parameters need to be stored so that
    # tf.keras.models.save_model works
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.heads = heads
    self.queries_dropout = queries_dropout
    self.keys_dropout = keys_dropout
    self.values_dropout = values_dropout
    self.causal = causal
    self.kwargs = kwargs
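# Illustrative usage sketch (added for clarity, not part of the original
# source): assuming this __init__ belongs to the relative-position variant
# of EncoderWithPositionLayer, it takes the same arguments as the learned
# position variant above minus num_pos, since the relative encoding table
# is built internally from self.relative_length. Values are hypothetical.
#
# example_relative_encoder_layer = EncoderWithPositionLayer(
#     input_size=256,      # width of the input / output tensors
#     hidden_size=1024,    # hidden units inside each Block
#     heads=8,             # 4 or 8 is a good default per the docstring
#     queries_dropout=0.1,
#     keys_dropout=0.1,
#     values_dropout=0.1,
#     causal=False)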