Example #1
 def __init__(self, args):
     super().__init__()
     self.args = args
     cattimes = 3 if args.title else 2
     self.emb = nn.Embedding(args.ntoks, args.hsz)
     self.lstm = nn.LSTMCell(args.hsz * cattimes, args.hsz)
     self.out = nn.Linear(args.hsz * cattimes, args.tgttoks)
     self.le = list_encode(args)
     self.entout = nn.Linear(args.hsz, 1)
     self.switch = nn.Linear(args.hsz * cattimes, 1)
     self.attn = MultiHeadAttention(args.hsz,
                                    args.hsz,
                                    args.hsz,
                                    h=4,
                                    dropout_p=args.drop)
     self.mattn = MatrixAttn(args.hsz * cattimes, args.hsz)
     self.graph = (args.model in ['graph', 'gat', 'gtrans'])
     print(args.model)
     if self.graph:
         self.ge = graph_encode(args)
     if args.plan:
         self.splan = splanner(args)
     if args.title:
         self.tenc = lseq_encode(args, toks=args.ninput)
         self.attn2 = MultiHeadAttention(args.hsz,
                                         args.hsz,
                                         args.hsz,
                                         h=4,
                                         dropout_p=args.drop)
         self.mix = nn.Linear(args.hsz, 1)
Example #2
    def __init__(self, **kwargs):
        super(DecoderLayer, self).__init__(**kwargs)

        with self.name_scope():
            self.self_masked_attention = MultiHeadAttention()
            self.context_attention = MultiHeadAttention()
            self.feed_forward = FeedForward()
Example #3
def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    attention = MultiHeadAttention(d_model, num_heads, name="attention")({
        'query':
        inputs,
        'key':
        inputs,
        'value':
        inputs,
        'mask':
        padding_mask
    })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs +
                                                                 attention)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention +
                                                               outputs)

    return tf.keras.Model(inputs=[inputs, padding_mask],
                          outputs=outputs,
                          name=name)
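The factory above returns a complete tf.keras.Model, so it can be instantiated and called directly. A minimal usage sketch (the hyperparameter values and tensor shapes are illustrative assumptions, and the project's custom MultiHeadAttention layer must be importable):

sample_layer = encoder_layer(units=512, d_model=128, num_heads=4, dropout=0.1)
sample_inputs = tf.random.uniform((64, 50, 128))            # (batch, seq_len, d_model)
sample_mask = tf.zeros((64, 1, 1, 50))                      # padding mask: 0 = keep position
sample_output = sample_layer([sample_inputs, sample_mask])  # -> shape (64, 50, 128)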
Example #4
  def __init__(self, args):
    super().__init__()
    self.renc = nn.Embedding(args.rtoks, args.hsz)
    nn.init.xavier_normal_(self.renc.weight)
    #self.gat = StackedSelfAttentionEncoder(args.hsz,args.hsz,args.hsz,args.hsz,args.prop,args.heads,use_positional_encoding=False)

    self.gat = nn.ModuleList([MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop) for _ in range(args.prop)])
    # Note: the next line replaces the MultiHeadAttention stack built above;
    # Example #12 below keeps both variants and selects between them via args.model.
    self.gat = nn.ModuleList([Block(args) for _ in range(args.prop)])
    self.prop = args.prop
Example #5
 def __init__(self, args):
   super().__init__()
   self.attn = MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop)
   self.l1 = nn.Linear(args.hsz, args.hsz * 4)
   self.l2 = nn.Linear(args.hsz * 4, args.hsz)
   self.ln_1 = nn.LayerNorm(args.hsz)
   self.ln_2 = nn.LayerNorm(args.hsz)
   self.drop = nn.Dropout(args.drop)
   self.act = gelu
Example #6
 def __init__(self, d_h, dropout_p):
     super().__init__()
     self.attn = MultiHeadAttention(d_h, d_h, d_h, h=4, dropout_p=dropout_p)
     self.l1 = nn.Linear(d_h, d_h * 4)
     self.l2 = nn.Linear(d_h * 4, d_h)
     self.ln_1 = nn.LayerNorm(d_h)
     self.ln_2 = nn.LayerNorm(d_h)
     self.drop = nn.Dropout(dropout_p)
     self.act = gelu
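Examples #5 and #6 only show the constructor of this transformer-style block; the forward pass is not part of the snippet. A minimal sketch of how these sub-modules are commonly wired together (the attention call signature, mask handling, and residual/LayerNorm ordering are assumptions, not taken from the original code):

 def forward(self, q, k, mask=None):
     # Attention with q as queries and k as keys/values, then residual + LayerNorm.
     # The exact self.attn call signature depends on the MultiHeadAttention in use.
     q = self.ln_1(q + self.drop(self.attn(q, k, mask=mask)))
     # Position-wise feed-forward: expand 4x, GELU, project back, residual + LayerNorm.
     return self.ln_2(q + self.drop(self.l2(self.act(self.l1(q)))))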
Example #7
 def __init__(self, args):
     super().__init__()
     self.attn = MultiHeadAttention(args,
                                    h=4,
                                    dropout_prob=args.transformer_drop)
     self.linear1 = nn.Linear(args.hsz, args.hsz * 4)
     self.linear2 = nn.Linear(args.hsz * 4, args.hsz)
     self.layer_norm1 = nn.LayerNorm(args.hsz)
     self.layer_norm2 = nn.LayerNorm(args.hsz)
     self.dropout = nn.Dropout(args.drop)
     self.prelu = nn.PReLU(args.hsz * 4)
Example #8
    def __init__(self, args):
        super().__init__()
        self.args = args
        cat_times = 3 if args.title else 2
        self.embed = nn.Embedding(args.output_vocab_size, args.hsz)
        self.entity_encoder = EntityEncoder(args)
        self.graph_encoder = GraphEncoder(args)

        if args.title:
            self.title_encoder = LSTMEncoder(args, toks=args.input_vocab_size)
            self.attention_title = MultiHeadAttention(args, h=4, dropout_prob=args.drop)
            # attention_title: computes c_s (decoding-phase context vector for title)

        self.decoder = nn.LSTMCell(args.hsz * cat_times, args.hsz)
        self.attention_graph = MultiHeadAttention(args, h=4, dropout_prob=args.drop)
        # attention_graph: computes c_g (decoding-phase context vector for graph)

        self.mat_attention = SingleHeadAttention(args.hsz * cat_times, args.hsz, args.device)
        self.switch = nn.Linear(args.hsz * cat_times, 1)
        self.out = nn.Linear(args.hsz * cat_times, args.target_vocab_size)
Example #9
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(shape=(1, None, None),
                                     name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    attention1 = MultiHeadAttention(d_model, num_heads,
                                    name="attention_1")(inputs={
                                        'query': inputs,
                                        'key': inputs,
                                        'value': inputs,
                                        'mask': look_ahead_mask
                                    })
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1 +
                                                                  inputs)

    attention2 = MultiHeadAttention(d_model, num_heads,
                                    name="attention_2")(inputs={
                                        'query': attention1,
                                        'key': enc_outputs,
                                        'value': enc_outputs,
                                        'mask': padding_mask
                                    })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2 +
                                                                  attention1)

    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(outputs +
                                                               attention2)

    return tf.keras.Model(
        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
        outputs=outputs,
        name=name)
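The decoder_layer above expects a look_ahead_mask input but does not show how that mask is built. One common way to construct it, following the standard TensorFlow Transformer tutorial pattern (a sketch; the exact masking convention has to match the MultiHeadAttention implementation in use):

def create_look_ahead_mask(size):
    # 1s mark future positions a query must not attend to; this broadcasts
    # against attention logits of shape (batch, heads, seq_len, seq_len).
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)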
Example #10
    def __init__(self, d_model, num_heads, epsilon=1e-6, rate=0.1):
        super(AttentionBlock, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.epsilon = epsilon
        self.rate = rate

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=self.epsilon)
        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=self.epsilon)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

        self.ffn = point_wise_feed_forward_network(d_model, d_model)
Example #11
    def __init__(
        self,
        device: torch.device,
        num_vocabs: int,
        embedding_dim: int = 512,
        hidden_size: int = 512,
        num_layers: int = 2,
        num_head: int = 8,
        max_len: int = 120,
        dropout: float = 0.3,
        rnn_type: str = 'lstm',
        attn_mechanism: str = 'multi_head',
        smoothing: bool = False,
        sos_id: int = 1,
        eos_id: int = 2,
    ) -> None:
        super(Decoder, self).__init__()
        rnn_cell = self.supported_rnns[rnn_type]
        self.embedding = nn.Embedding(num_vocabs, embedding_dim)
        self.rnn = rnn_cell(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bias=True,
                            batch_first=True,
                            dropout=dropout,
                            bidirectional=False)
        self.num_vocabs = num_vocabs
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_head = num_head
        self.max_len = max_len
        self.embedding_dropout = nn.Dropout(p=dropout)
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.attn_mechanism = attn_mechanism
        self.device = device
        self.fc = nn.Linear(hidden_size << 1, num_vocabs)

        if self.attn_mechanism == 'location':
            self.attention = LocationAwareAttention(hidden_size, hidden_size,
                                                    smoothing)
        elif self.attn_mechanism == 'scaled_dot':
            self.attention = ScaledDotProductAttention(hidden_size)
        elif self.attn_mechanism == 'multi_head':
            self.attention = MultiHeadAttention(hidden_size, num_head)
Example #12
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.renc = nn.Embedding(args.rtoks, args.hsz)
        nn.init.xavier_normal_(self.renc.weight)

        if args.model == "gat":
            self.gat = nn.ModuleList([
                MultiHeadAttention(args.hsz,
                                   args.hsz,
                                   args.hsz,
                                   h=4,
                                   dropout_p=args.drop)
                for _ in range(args.prop)
            ])
        else:
            self.gat = nn.ModuleList([Block(args) for _ in range(args.prop)])
        self.prop = args.prop
        self.sparse = args.sparse
Example #13
def test_multihead_attention():
    with torch.no_grad():
        kq_dim = 4
        v_dim = 8
        num_heads = 16
        hidden_dim = 64

        batch_size = 3
        seq_len = 7
        attention_input = torch.rand((batch_size, seq_len, hidden_dim))

        mha = MultiHeadAttention(hidden_dim,
                                 key_and_query_dim=kq_dim,
                                 value_dim=v_dim,
                                 num_heads=num_heads)
        mha_output = mha.forward(q_hidden_inputs=attention_input,
                                 k_hidden_inputs=attention_input,
                                 v_hidden_inputs=attention_input,
                                 mask=None)
        assert mha_output.size() == attention_input.size()
Example #14
def test_multihead_attention_invalid_args():
    with pytest.raises(ValueError):
        MultiHeadAttention(16, key_and_query_dim=4, value_dim=4, num_heads=5)

    with pytest.raises(ValueError):
        SimpleMultiHeadAttention(16, key_query_value_dim=4, num_heads=5)
Example #15
    def __init__(self, hidden_dim: int, key_query_value_dim: int = 64, num_attention_heads=8, with_hard_concrete_gate=False):
        super(TransformerEncoderDecoderConnectionBlock, self).__init__()

        self.multihead_attention = MultiHeadAttention(hidden_dim,
                                                      key_and_query_dim=key_query_value_dim,
                                                      value_dim=key_query_value_dim,
                                                      num_heads=num_attention_heads,
                                                      with_hard_concrete_gate=with_hard_concrete_gate)
        self.norm = nn.LayerNorm(hidden_dim)
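Example #15 again stops at the constructor. A rough sketch of how such an encoder-decoder connection block could be applied, reusing the MultiHeadAttention call signature shown in Example #13 (the argument names and residual placement here are assumptions):

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        # Cross-attention: queries come from the decoder side,
        # keys and values from the encoder outputs.
        attended = self.multihead_attention(q_hidden_inputs=decoder_hidden,
                                            k_hidden_inputs=encoder_outputs,
                                            v_hidden_inputs=encoder_outputs,
                                            mask=mask)
        # Residual connection followed by LayerNorm.
        return self.norm(decoder_hidden + attended)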