    def __init__(self,
                 input_size,
                 hidden_size,
                 heads,
                 queries_dropout=0.,
                 keys_dropout=0.,
                 values_dropout=0.,
                 causal=True,
                 num_pos=1,
                 **kwargs):
        """Creates a Transformer encoder layer by applying a
        multi head self attention layer

        Arguments:

        input_size: int
            the number of units in the input tensor to this layer
            also the output size of the model
        hidden_size: int
            the number of units in the hidden variables used
            in each multi head attention layer
        heads: int
            the number of heads in each multi head attention layer
            a good default is 4 or 8
        queries_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        keys_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        values_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        causal: bool
            specifies whether the transformer should decode using
            a causal mask to preserve the autoregressive property
        num_pos: int
            the number of relative positions"""
        super(EncoderWithPositionLayer, self).__init__()

        # the core attention and processing variables
        self.block0 = Block(hidden_size, input_size * 3, **kwargs)
        self.pos_embedding = tf.keras.layers.Dense(input_size, **kwargs)
        self.attention = AttentionWithBias(queries_dropout=queries_dropout,
                                           keys_dropout=keys_dropout,
                                           values_dropout=values_dropout,
                                           causal=causal)
        self.block1 = Block(hidden_size, input_size, **kwargs)

        # these parameters need to be stored so that
        # tf.keras.models.save_model works
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.heads = heads
        self.queries_dropout = queries_dropout
        self.keys_dropout = keys_dropout
        self.values_dropout = values_dropout
        self.causal = causal
        self.num_pos = num_pos
        self.kwargs = kwargs
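# A construction sketch for the layer defined above; the hyperparameter
# values here are illustrative assumptions, and EncoderWithPositionLayer
# (along with Block and AttentionWithBias) is assumed to be importable
# from this package rather than defined in this snippet.
encoder_layer = EncoderWithPositionLayer(input_size=256,
                                         hidden_size=1024,
                                         heads=8,
                                         queries_dropout=0.1,
                                         keys_dropout=0.1,
                                         values_dropout=0.1,
                                         causal=False,
                                         num_pos=3)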
    def __init__(self,
                 hidden_size,
                 heads,
                 queries_dropout=0.,
                 keys_dropout=0.,
                 temperature=1.0,
                 use_gumbel_noise=True,
                 **kwargs):
        """Creates a Transformer permutation layer by applying a multi
        head sequence to matrix layer; and then create hard permutation
        through Plackett-Luce distribution
        Arguments:
        hidden_size: int
            the number of units in the hidden variables used
            in each multi head attention layer
        heads: int
            the number of heads in each multi head attention layer
            a good default is 4 or 8
        queries_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        keys_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        temperature: float
            a positive number to divide the permutation logits by
            prior to sampling
        use_gumbel_noise: bool, UNUSED
            whether to apply Gumbel noise to the output of the PermutationLayer"""

        super(PermutationPlackettLayer, self).__init__()

        # the core attention and processing variables
        self.block0 = Block(hidden_size // 2, 1, **kwargs)

        # these parameters need to be stored so that
        # tf.keras.models.save_model works
        self.hidden_size = hidden_size
        self.heads = heads
        self.queries_dropout = queries_dropout
        self.keys_dropout = keys_dropout
        self.temperature = temperature
        self.use_gumbel_noise = use_gumbel_noise
        self.kwargs = kwargs
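# A standalone sketch (not necessarily the layer's actual call()) of how a
# hard permutation can be drawn from a Plackett-Luce distribution with the
# Gumbel trick: perturb the temperature-scaled logits with Gumbel noise and
# argsort; the resulting ranking follows the Plackett-Luce model defined by
# the logits, and a one-hot of the ranking gives a hard permutation matrix.
import tensorflow as tf

def sample_plackett_luce(logits, temperature=1.0):
    # logits: [batch, n] unnormalized scores, one per sequence position
    uniform = tf.random.uniform(tf.shape(logits), minval=1e-9, maxval=1.0)
    gumbel = -tf.math.log(-tf.math.log(uniform))
    noisy = logits / temperature + gumbel
    # ranks[b, j] is the index of the element placed at position j
    ranks = tf.argsort(noisy, axis=-1, direction='DESCENDING')
    # convert the ranking into a hard permutation matrix [batch, n, n]
    return tf.one_hot(ranks, depth=tf.shape(logits)[-1])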
    def __init__(self,
                 hidden_size,
                 output_size,
                 causal=True,
                 logits_per_slot=1,
                 **kwargs):
        """Creates a pointer network using the first operation
        in the self attention mechanism

        Arguments:

        hidden_size: int
            the number of hidden units in the network blocks
            used by this layer
        output_size: int
            the number of output units used by the network blocks
            used by this layer
        causal: bool
            specifies whether the transformer should decode using
            a causal mask to preserve the autoregressive property
        logits_per_slot: int
            specifies the number of logits per element the pointer
            network attends to; default is 1"""
        super(Pointer, self).__init__()

        # the core processing variables
        self.block = Block(hidden_size, output_size * (1 + logits_per_slot),
                           **kwargs)

        # these parameters need to be stored so that
        # tf.keras.models.save_model works
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.causal = causal
        self.logits_per_slot = logits_per_slot
        self.kwargs = kwargs
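# A rough standalone sketch of one plausible way the Block output above could
# be turned into pointer logits (an assumption, since the layer's call() is
# not shown in this snippet): split the last dimension into one query of size
# output_size and logits_per_slot keys per element, then score queries against
# keys with a scaled dot product.
import tensorflow as tf

def pointer_logits(features, output_size, logits_per_slot):
    # features: [batch, length, output_size * (1 + logits_per_slot)]
    q, k = tf.split(
        features, [output_size, output_size * logits_per_slot], axis=-1)
    # keys: [batch, length * logits_per_slot, output_size]
    k = tf.reshape(k, [tf.shape(k)[0], -1, output_size])
    scale = tf.math.sqrt(tf.cast(output_size, tf.float32))
    # scores: [batch, length, length * logits_per_slot]
    return tf.matmul(q, k, transpose_b=True) / scale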
    def __init__(self,
                 input_size,
                 hidden_size,
                 heads,
                 queries_dropout=0.,
                 keys_dropout=0.,
                 values_dropout=0.,
                 causal=True,
                 **kwargs):
        """
        Relative positional attention as in 
        https://arxiv.org/pdf/1901.02860.pdf
        
        Besides applying this layer to the Permutation Transformer's
        encoder, we could also apply it to Transformer-INDIGO's encoder;
        for simplicity, we did not explore this option in our paper

        Arguments:

        input_size: int
            the number of units in the input tensor to this layer
            also the output size of the model
        hidden_size: int
            the number of units in the hidden variables used
            in each multi head attention layer
        heads: int
            the number of heads in each multi head attention layer
            a good default is 4 or 8
        queries_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        keys_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        values_dropout: float
            the ratio of units to drop during training to the
            number of units in each attention layer
        causal: bool
            specifies whether the transformer should decode using
            a causal mask to preserve the autoregressive property
        super(EncoderWithPositionLayer, self).__init__()

        # the core attention and processing variables
        #self.block0 = Block(hidden_size, input_size * 3, **kwargs)
        self.relative_length = 100
        # relative position encodings, one input_size-dimensional vector for
        # each offset in range(-relative_length, relative_length)
        self.relative_encoding = position_encoding_relative(
            self.relative_length, input_size)
        self.block0 = Block(hidden_size, input_size, lastfc=False, **kwargs)
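        # two dense layers that each produce one bias term per attention head,
        # presumably the content and position bias terms (u and v) of
        # Transformer-XL's relative attention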
        self.attbias0 = tf.keras.layers.Dense(heads, activation=None, **kwargs)
        self.attbias1 = tf.keras.layers.Dense(heads, activation=None, **kwargs)
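        # linear projections, likely for the queries, content keys, values,
        # and relative-position keys (cf. W_{k,E} and W_{k,R} in Transformer-XL)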
        self.q0 = tf.keras.layers.Dense(input_size, activation=None, **kwargs)
        self.wke0 = tf.keras.layers.Dense(input_size,
                                          activation=None,
                                          **kwargs)
        self.wkv0 = tf.keras.layers.Dense(input_size,
                                          activation=None,
                                          **kwargs)
        self.wkr0 = tf.keras.layers.Dense(input_size,
                                          activation=None,
                                          **kwargs)

        self.attention = AttentionWithBias(queries_dropout=queries_dropout,
                                           keys_dropout=keys_dropout,
                                           values_dropout=values_dropout,
                                           causal=causal)
        self.block1 = Block(hidden_size, input_size, **kwargs)

        # these parameters need to be stored so that
        # tf.keras.models.save_model works
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.heads = heads
        self.queries_dropout = queries_dropout
        self.keys_dropout = keys_dropout
        self.values_dropout = values_dropout
        self.causal = causal
        self.kwargs = kwargs
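# position_encoding_relative is referenced above but not defined in this
# snippet; a minimal sketch under the assumption that it builds a standard
# sinusoidal table with one input_size-dimensional row per relative offset
# in range(-relative_length, relative_length), and that input_size is even.
import numpy as np
import tensorflow as tf

def position_encoding_relative(relative_length, input_size):
    # relative offsets -relative_length, ..., relative_length - 1
    positions = np.arange(-relative_length, relative_length)[:, None]
    # frequencies follow the usual 1 / 10000^(2i / input_size) schedule
    div = np.exp(np.arange(0, input_size, 2) * -(np.log(10000.0) / input_size))
    table = np.zeros([2 * relative_length, input_size], dtype=np.float32)
    table[:, 0::2] = np.sin(positions * div)
    table[:, 1::2] = np.cos(positions * div)
    return tf.constant(table)  # [2 * relative_length, input_size]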