Example #1
    def __init__(self, layer1dim, layer2dim, optim):
        super().__init__(optim)
        self.layer1 = Linear(layer1dim)
        self.relu1 = Relu()
        self.bn1 = BatchNorm()
        self.layer2 = Linear(layer2dim)
        self.loss = CrossEntropy()
        self.loss_ = None
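
The constructor above wires a two-layer classifier out of a small custom framework (Linear, Relu, BatchNorm, CrossEntropy, with an optimizer handed to the base class). For reference, a minimal sketch of the same architecture in plain PyTorch; the concrete dimensions and the BatchNorm-before-ReLU ordering are illustrative assumptions, since only the constructor is shown above.

import torch
import torch.nn as nn

class TwoLayerNet(nn.Module):
    def __init__(self, in_dim=784, hidden_dim=256, num_classes=10):
        super().__init__()
        self.layer1 = nn.Linear(in_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, num_classes)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x, targets=None):
        h = self.relu1(self.bn1(self.layer1(x)))   # linear -> batch norm -> ReLU
        logits = self.layer2(h)
        if targets is None:
            return logits
        return logits, self.loss(logits, targets)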
Example #2
    def __init__(self,
                 num_head,
                 num_dim,
                 num_dim_k,
                 num_dim_v,
                 dropout_rate=0.1):
        """
        num_head: the number of head
        num_dim: the number of dimension of each query word and key
        num_dim_k: the number of dimension query and key will mapping to 
        num_dim_v: the number of dimension value will mapping to 
        """
        super(MultiHeadAttention, self).__init__()
        self.num_head = num_head
        self.num_dim = num_dim
        self.num_dim_k = num_dim_k
        self.num_dim_v = num_dim_v

        # projection parameters w_q, w_k, w_v, one slice per head
        self.w_q = nn.Parameter(torch.FloatTensor(num_head, num_dim,
                                                  num_dim_k))
        self.w_k = nn.Parameter(torch.FloatTensor(num_head, num_dim,
                                                  num_dim_k))
        self.w_v = nn.Parameter(torch.FloatTensor(num_head, num_dim,
                                                  num_dim_v))
        nn.init.xavier_normal_(self.w_q)
        nn.init.xavier_normal_(self.w_k)
        nn.init.xavier_normal_(self.w_v)

        self.attention = ScaledDotProductAttention(num_dim)
        self.project = Linear(num_head * num_dim_v, num_dim)

        self.dropout = nn.Dropout(dropout_rate)
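
The forward pass is not included in this example, but the parameter shapes above suggest the usual per-head projection via a single batched matmul. A small stand-alone sketch under that assumption (the batch size, sequence length and dimensions below are made up):

import torch

num_head, num_dim, num_dim_k = 8, 512, 64
batch, seq_len = 2, 10

w_q = torch.randn(num_head, num_dim, num_dim_k)   # same shape as self.w_q above
q = torch.randn(batch, seq_len, num_dim)          # a batch of query vectors

# Repeat the batch once per head, then project all heads with one bmm:
q_s = q.repeat(num_head, 1, 1).view(num_head, -1, num_dim)   # (num_head, batch*seq_len, num_dim)
q_proj = torch.bmm(q_s, w_q)                                  # (num_head, batch*seq_len, num_dim_k)
q_proj = q_proj.view(num_head, batch, seq_len, num_dim_k)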
Example #3
    def __init__(self,
                 n_src_vocab,
                 n_tgt_vocab,
                 n_max_seq,
                 n_layers=6,
                 n_head=8,
                 d_word_vec=512,
                 d_model=512,
                 d_inner_hid=1024,
                 d_k=64,
                 d_v=64,
                 dropout=0.1,
                 proj_share_weight=True,
                 embs_share_weight=True):

        super(Transformer, self).__init__()
        self.encoder = Encoder(n_src_vocab,
                               n_max_seq,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)
        self.decoder = Decoder(n_tgt_vocab,
                               n_max_seq,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, ' \
            'the dimensions of all module outputs shall be the same.'

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
                "To share the word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
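
A minimal usage sketch, assuming Encoder, Decoder and Linear come from the same module as this class; the vocabulary sizes and maximum sequence length are illustrative only. With both sharing flags enabled, the target projection and the two embedding tables end up pointing at a single Parameter object:

model = Transformer(n_src_vocab=32000,
                    n_tgt_vocab=32000,
                    n_max_seq=100,
                    proj_share_weight=True,
                    embs_share_weight=True)

assert model.tgt_word_proj.weight is model.decoder.tgt_word_emb.weight
assert model.encoder.src_word_emb.weight is model.decoder.tgt_word_emb.weight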
Example #4
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head*d_v, d_model)

        self.dropout = nn.Dropout(dropout)

        init.xavier_normal_(self.w_qs)
        init.xavier_normal_(self.w_ks)
        init.xavier_normal_(self.w_vs)
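
ScaledDotProductAttention itself does not appear in these examples. The computation it is conventionally expected to perform (Vaswani et al., 2017) is softmax(QK^T / sqrt(d_k)) V; a minimal stand-alone sketch, independent of the classes above:

import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    # q, k: (batch, len, d_k); v: (batch, len, d_v)
    d_k = q.size(-1)
    scores = torch.bmm(q, k.transpose(1, 2)) / d_k ** 0.5   # (batch, len_q, len_k)
    attn = F.softmax(scores, dim=-1)                         # attention weights
    return torch.bmm(attn, v), attn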