def __init__(self, args):
    super().__init__()
    self.args = args
    # one extra context vector is concatenated when a title encoder is used
    cattimes = 3 if args.title else 2
    self.emb = nn.Embedding(args.ntoks, args.hsz)
    self.lstm = nn.LSTMCell(args.hsz * cattimes, args.hsz)
    self.out = nn.Linear(args.hsz * cattimes, args.tgttoks)
    self.le = list_encode(args)
    self.entout = nn.Linear(args.hsz, 1)
    self.switch = nn.Linear(args.hsz * cattimes, 1)
    self.attn = MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop)
    self.mattn = MatrixAttn(args.hsz * cattimes, args.hsz)
    self.graph = (args.model in ['graph', 'gat', 'gtrans'])
    print(args.model)
    if self.graph:
        self.ge = graph_encode(args)
    if args.plan:
        self.splan = splanner(args)
    if args.title:
        self.tenc = lseq_encode(args, toks=args.ninput)
        self.attn2 = MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop)
        self.mix = nn.Linear(args.hsz, 1)

def __init__(self, **kwargs):
    super(DecoderLayer, self).__init__(**kwargs)
    with self.name_scope():
        # masked self-attention, encoder-decoder cross-attention, feed-forward
        self.self_masked_attention = MultiHeadAttention()
        self.context_attention = MultiHeadAttention()
        self.feed_forward = FeedForward()

def encoder_layer(units, d_model, num_heads, dropout, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # self-attention sublayer: residual connection + post-layer-norm
    attention = MultiHeadAttention(d_model, num_heads, name="attention")({
        'query': inputs,
        'key': inputs,
        'value': inputs,
        'mask': padding_mask,
    })
    attention = tf.keras.layers.Dropout(rate=dropout)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention)

    # position-wise feed-forward sublayer
    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention + outputs)

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

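# Usage sketch for encoder_layer. It assumes the custom MultiHeadAttention
# layer used above is in scope; a zero mask means "attend everywhere" under the
# usual additive-mask convention. Only shapes are checked here.
sample_encoder_layer = encoder_layer(units=512, d_model=128, num_heads=4, dropout=0.1)
dummy_inputs = tf.random.uniform((2, 10, 128))  # (batch, seq_len, d_model)
dummy_mask = tf.zeros((2, 1, 1, 10))            # broadcasts over heads and query positions
print(sample_encoder_layer([dummy_inputs, dummy_mask]).shape)  # (2, 10, 128)
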
def __init__(self, args):
    super().__init__()
    self.renc = nn.Embedding(args.rtoks, args.hsz)
    nn.init.xavier_normal_(self.renc.weight)
    #self.gat = StackedSelfAttentionEncoder(args.hsz,args.hsz,args.hsz,args.hsz,args.prop,args.heads,use_positional_encoding=False)
    # pick the propagation stack by model type: plain multi-head attention
    # layers for "gat", full transformer Blocks otherwise
    if args.model == "gat":
        self.gat = nn.ModuleList([MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop)
                                  for _ in range(args.prop)])
    else:
        self.gat = nn.ModuleList([Block(args) for _ in range(args.prop)])
    self.prop = args.prop

def __init__(self, args):
    super().__init__()
    self.attn = MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop)
    self.l1 = nn.Linear(args.hsz, args.hsz * 4)
    self.l2 = nn.Linear(args.hsz * 4, args.hsz)
    self.ln_1 = nn.LayerNorm(args.hsz)
    self.ln_2 = nn.LayerNorm(args.hsz)
    self.drop = nn.Dropout(args.drop)
    self.act = gelu

def __init__(self, d_h, dropout_p):
    super().__init__()
    self.attn = MultiHeadAttention(d_h, d_h, d_h, h=4, dropout_p=dropout_p)
    self.l1 = nn.Linear(d_h, d_h * 4)
    self.l2 = nn.Linear(d_h * 4, d_h)
    self.ln_1 = nn.LayerNorm(d_h)
    self.ln_2 = nn.LayerNorm(d_h)
    self.drop = nn.Dropout(dropout_p)
    self.act = gelu

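# A minimal, self-contained sketch of the forward pass such a block typically
# pairs with. torch.nn.MultiheadAttention stands in for the custom
# MultiHeadAttention above, and the residual + post-norm wiring is an
# assumption, not the repo's confirmed ordering.
import torch
import torch.nn as nn

class TransformerBlockSketch(nn.Module):
    def __init__(self, d_h, dropout_p):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_h, num_heads=4, dropout=dropout_p, batch_first=True)
        self.l1 = nn.Linear(d_h, d_h * 4)
        self.l2 = nn.Linear(d_h * 4, d_h)
        self.ln_1 = nn.LayerNorm(d_h)
        self.ln_2 = nn.LayerNorm(d_h)
        self.drop = nn.Dropout(dropout_p)
        self.act = nn.GELU()

    def forward(self, x):
        # self-attention sublayer: residual connection, then layer norm
        a, _ = self.attn(x, x, x)
        x = self.ln_1(x + self.drop(a))
        # position-wise feed-forward sublayer with 4x expansion and GELU
        f = self.l2(self.drop(self.act(self.l1(x))))
        return self.ln_2(x + self.drop(f))

x = torch.rand(2, 7, 64)
print(TransformerBlockSketch(64, 0.1)(x).shape)  # torch.Size([2, 7, 64])
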
def __init__(self, args):
    super().__init__()
    self.attn = MultiHeadAttention(args, h=4, dropout_prob=args.transformer_drop)
    self.linear1 = nn.Linear(args.hsz, args.hsz * 4)
    self.linear2 = nn.Linear(args.hsz * 4, args.hsz)
    self.layer_norm1 = nn.LayerNorm(args.hsz)
    self.layer_norm2 = nn.LayerNorm(args.hsz)
    self.dropout = nn.Dropout(args.drop)
    self.prelu = nn.PReLU(args.hsz * 4)

def __init__(self, args):
    super().__init__()
    self.args = args
    cat_times = 3 if args.title else 2
    self.embed = nn.Embedding(args.output_vocab_size, args.hsz)
    self.entity_encoder = EntityEncoder(args)
    self.graph_encoder = GraphEncoder(args)
    if args.title:
        self.title_encoder = LSTMEncoder(args, toks=args.input_vocab_size)
        # attention_title: computes c_s (decoding-phase context vector for title)
        self.attention_title = MultiHeadAttention(args, h=4, dropout_prob=args.drop)
    self.decoder = nn.LSTMCell(args.hsz * cat_times, args.hsz)
    # attention_graph: computes c_g (decoding-phase context vector for graph)
    self.attention_graph = MultiHeadAttention(args, h=4, dropout_prob=args.drop)
    self.mat_attention = SingleHeadAttention(args.hsz * cat_times, args.hsz, args.device)
    self.switch = nn.Linear(args.hsz * cat_times, 1)
    self.out = nn.Linear(args.hsz * cat_times, args.target_vocab_size)

def test_multihead_attention():
    with torch.no_grad():
        kq_dim = 4
        v_dim = 8
        num_heads = 16
        hidden_dim = 64
        batch_size = 3
        seq_len = 7
        attention_input = torch.rand((batch_size, seq_len, hidden_dim))
        mha = MultiHeadAttention(hidden_dim,
                                 key_and_query_dim=kq_dim,
                                 value_dim=v_dim,
                                 num_heads=num_heads)
        mha_output = mha.forward(q_hidden_inputs=attention_input,
                                 k_hidden_inputs=attention_input,
                                 v_hidden_inputs=attention_input,
                                 mask=None)
        # self-attention must preserve the input shape
        assert mha_output.size() == attention_input.size()

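# Companion check under the same assumed API: with cross-attention, query and
# key/value sequence lengths may differ, and the output should match the query
# shape. This mirrors the test above and is a sketch, not an existing test.
def test_multihead_attention_cross_shapes():
    with torch.no_grad():
        q = torch.rand((3, 5, 64))
        kv = torch.rand((3, 9, 64))
        mha = MultiHeadAttention(64, key_and_query_dim=4, value_dim=8, num_heads=16)
        out = mha.forward(q_hidden_inputs=q, k_hidden_inputs=kv, v_hidden_inputs=kv, mask=None)
        assert out.size() == q.size()
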
class TransformerEncoderDecoderConnectionBlock(nn.Module):
    def __init__(self, hidden_dim: int, key_query_value_dim: int = 64,
                 num_attention_heads=8, with_hard_concrete_gate=False):
        super(TransformerEncoderDecoderConnectionBlock, self).__init__()
        self.multihead_attention = MultiHeadAttention(hidden_dim,
                                                      key_and_query_dim=key_query_value_dim,
                                                      value_dim=key_query_value_dim,
                                                      num_heads=num_attention_heads,
                                                      with_hard_concrete_gate=with_hard_concrete_gate)
        self.norm = nn.LayerNorm(hidden_dim)

    def forward(self, encoder_outputs, decoder_hidden, mask=None):
        # cross-attention: decoder states query the encoder outputs,
        # followed by a residual connection and layer norm
        attention_outputs = self.multihead_attention.forward(q_hidden_inputs=decoder_hidden,
                                                             k_hidden_inputs=encoder_outputs,
                                                             v_hidden_inputs=encoder_outputs,
                                                             mask=mask)
        return self.norm(decoder_hidden + attention_outputs)

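# Smoke-test sketch for the connection block; it relies on the custom
# MultiHeadAttention defined elsewhere in this codebase, so only shapes are
# asserted. Tensor sizes are made up for illustration.
block = TransformerEncoderDecoderConnectionBlock(hidden_dim=64)
enc = torch.rand(2, 11, 64)  # (batch, src_len, hidden)
dec = torch.rand(2, 7, 64)   # (batch, tgt_len, hidden)
out = block(enc, dec)        # decoder states attend over encoder outputs
assert out.shape == dec.shape
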
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
    look_ahead_mask = tf.keras.Input(shape=(1, None, None), name="look_ahead_mask")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # masked self-attention over the target sequence
    attention1 = MultiHeadAttention(d_model, num_heads, name="attention_1")(inputs={
        'query': inputs,
        'key': inputs,
        'value': inputs,
        'mask': look_ahead_mask,
    })
    attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention1 + inputs)

    # encoder-decoder cross-attention, masked against source padding
    attention2 = MultiHeadAttention(d_model, num_heads, name="attention_2")(inputs={
        'query': attention1,
        'key': enc_outputs,
        'value': enc_outputs,
        'mask': padding_mask,
    })
    attention2 = tf.keras.layers.Dropout(rate=dropout)(attention2)
    attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention2 + attention1)

    # position-wise feed-forward sublayer
    outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention2)
    outputs = tf.keras.layers.Dense(units=d_model)(outputs)
    outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(outputs + attention2)

    return tf.keras.Model(
        inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
        outputs=outputs,
        name=name)

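# Usage sketch wiring the functional-API decoder layer (same custom
# MultiHeadAttention assumption as above). Zero masks disable masking, so the
# call only exercises shapes.
sample_decoder_layer = decoder_layer(units=512, d_model=128, num_heads=4, dropout=0.1)
tgt = tf.random.uniform((2, 6, 128))
enc_out = tf.random.uniform((2, 10, 128))
look_ahead = tf.zeros((2, 1, 6, 6))  # (batch, 1, tgt_len, tgt_len)
padding = tf.zeros((2, 1, 1, 10))    # (batch, 1, 1, src_len)
print(sample_decoder_layer([tgt, enc_out, look_ahead, padding]).shape)  # (2, 6, 128)
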
def __init__(self, d_model, num_heads, epsilon=1e-6, rate=0.1):
    super(AttentionBlock, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    self.epsilon = epsilon
    self.rate = rate
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ln1 = tf.keras.layers.LayerNormalization(epsilon=self.epsilon)
    self.ln2 = tf.keras.layers.LayerNormalization(epsilon=self.epsilon)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.ffn = point_wise_feed_forward_network(d_model, d_model)

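# Hedged sketch of the call() that would pair with the __init__ above. It
# assumes the TensorFlow-tutorial convention for the custom MultiHeadAttention,
# i.e. mha(v, k, q, mask) returning (output, attention_weights); adjust the
# first line if the layer's actual signature differs.
def call(self, x, training=False, mask=None):
    attn_output, _ = self.mha(x, x, x, mask)                   # self-attention
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.ln1(x + attn_output)                           # residual + norm
    ffn_output = self.ffn(out1)                                # point-wise feed-forward
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.ln2(out1 + ffn_output)                         # residual + norm
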
def __init__(
        self,
        device: torch.device,
        num_vocabs: int,
        embedding_dim: int = 512,
        hidden_size: int = 512,
        num_layers: int = 2,
        num_head: int = 8,
        max_len: int = 120,
        dropout: float = 0.3,
        rnn_type: str = 'lstm',
        attn_mechanism: str = 'multi_head',
        smoothing: bool = False,
        sos_id: int = 1,
        eos_id: int = 2,
) -> None:
    super(Decoder, self).__init__()
    rnn_cell = self.supported_rnns[rnn_type]
    self.embedding = nn.Embedding(num_vocabs, embedding_dim)
    # positional args: input_size, hidden_size, num_layers, bias, batch_first, dropout
    self.rnn = rnn_cell(embedding_dim, hidden_size, num_layers, True, True, dropout, bidirectional=False)
    self.num_vocabs = num_vocabs
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_head = num_head
    self.max_len = max_len
    self.embedding_dropout = nn.Dropout(p=dropout)
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.attn_mechanism = attn_mechanism
    self.device = device
    # hidden_size << 1 == hidden_size * 2: the context vector is concatenated to the RNN output
    self.fc = nn.Linear(hidden_size << 1, num_vocabs)

    if self.attn_mechanism == 'location':
        self.attention = LocationAwareAttention(hidden_size, hidden_size, smoothing)
    elif self.attn_mechanism == 'scaled_dot':
        self.attention = ScaledDotProductAttention(hidden_size)
    elif self.attn_mechanism == 'multi_head':
        self.attention = MultiHeadAttention(hidden_size, num_head)

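# Hypothetical construction of the decoder above. supported_rnns is assumed to
# be a class-level mapping such as {'lstm': nn.LSTM, 'gru': nn.GRU}, and the
# attention classes referenced in __init__ must be importable; only the
# constructor is exercised here.
decoder = Decoder(
    device=torch.device('cpu'),
    num_vocabs=5000,
    rnn_type='lstm',
    attn_mechanism='multi_head',
)
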
def __init__(self, args):
    super().__init__()
    self.args = args
    self.renc = nn.Embedding(args.rtoks, args.hsz)
    nn.init.xavier_normal_(self.renc.weight)
    # "gat" uses plain multi-head attention layers; other models use transformer Blocks
    if args.model == "gat":
        self.gat = nn.ModuleList([
            MultiHeadAttention(args.hsz, args.hsz, args.hsz, h=4, dropout_p=args.drop)
            for _ in range(args.prop)
        ])
    else:
        self.gat = nn.ModuleList([Block(args) for _ in range(args.prop)])
    self.prop = args.prop
    self.sparse = args.sparse

def test_multihead_attention_invalid_args():
    # invalid head/dimension combinations must be rejected at construction time
    with pytest.raises(ValueError):
        MultiHeadAttention(16, key_and_query_dim=4, value_dim=4, num_heads=5)
    with pytest.raises(ValueError):
        SimpleMultiHeadAttention(16, key_query_value_dim=4, num_heads=5)