def __init__(self, config, src_vocab, target_vocab, s_v, t_v, u):
        super(Transformer, self).__init__()
        self.config = config

        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        d_model, d_ff = self.config.d_model, self.config.d_ff

        attn = MultiHeadedAttention(h, d_model)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)

        attncross = MultiHeadedAttention(h, d_model * 2)
        ffcross = PositionwiseFeedForward(d_model * 2, d_ff, dropout)
        positioncross = PositionalEncoding(d_model * 2, dropout)

        self.encoder = Encoder(
            EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff),
                         dropout), N)
        self.encoder_cross = EncoderCross(
            EncoderLayerCross((config.d_model) * 2, deepcopy(attncross),
                              deepcopy(ffcross), dropout), N)
        self.src_embed = nn.Sequential(
            Embeddings(config.d_model, src_vocab, s_v, u),
            deepcopy(position))  # Embeddings followed by PE
        # self.src_embed.weight.data.copy_(src_vocab.vectors)
        self.target_embed = nn.Sequential(
            Embeddings(config.d_model, target_vocab, t_v, u),
            deepcopy(position))
        # self.target_embed.weight.data.copy_(target_vocab.vectors)
        # Fully-Connected Layer
        self.fc = nn.Linear(self.config.d_model, self.config.output_size)
        self.sigmoid = nn.Sigmoid()
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        self.softmax = nn.Softmax(dim=-1)
Example #2
    def __init__(self, d_model, d_ff, nheads, drop_prob=0.2):
        super(DecoderLayer, self).__init__()
        self.dec_attn_layer = MultiHeadedAttention(d_model, nheads, drop_prob)
        self.enc_dec_attn_layer = MultiHeadedAttention(d_model, nheads, drop_prob)
        self.ffn = PositionWiseFeedForwardNetwork(d_model, d_ff, drop_prob)
        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)
        self.layer_norm3 = LayerNorm(d_model)
        self.dropout = nn.Dropout(drop_prob)
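
Only the constructor is shown above; the decoder layer's forward pass is not. For orientation, the sketch below illustrates the usual post-norm decoder-layer pattern (masked self-attention, encoder-decoder attention, then the feed-forward network, each followed by dropout, a residual connection, and layer normalization) using only stock torch.nn modules. It is an illustration under that assumption, not the original class; DecoderLayerSketch and its argument names are hypothetical.

import torch
import torch.nn as nn


class DecoderLayerSketch(nn.Module):
    """Post-norm decoder layer: self-attn -> cross-attn -> FFN, each with residual + LayerNorm."""

    def __init__(self, d_model, d_ff, nheads, drop_prob=0.2):
        super().__init__()
        # batch_first=True requires PyTorch >= 1.9
        self.self_attn = nn.MultiheadAttention(d_model, nheads, dropout=drop_prob, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model, nheads, dropout=drop_prob, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, x, memory, tgt_mask=None):
        # Masked self-attention over the decoder inputs.
        out, _ = self.self_attn(x, x, x, attn_mask=tgt_mask)
        x = self.norm1(x + self.dropout(out))
        # Attention over the encoder outputs (memory).
        out, _ = self.cross_attn(x, memory, memory)
        x = self.norm2(x + self.dropout(out))
        # Position-wise feed-forward network.
        return self.norm3(x + self.dropout(self.ffn(x)))


layer = DecoderLayerSketch(d_model=64, d_ff=256, nheads=4)
print(layer(torch.randn(2, 7, 64), torch.randn(2, 9, 64)).shape)  # torch.Size([2, 7, 64])
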
    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: number of attention heads in the multi-head attention
        :param feed_forward_hidden: hidden size of the position-wise feed-forward layer, usually 4*hidden
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden,
                                                    d_ff=feed_forward_hidden,
                                                    dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
    def load_model(self, path):
        V = len(self.vocab.char2id)
        d_model = 256
        d_ff = 1024
        h = 4
        n_encoders = 4

        self_attn = MultiHeadedAttention(h=h,
                                         d_model=d_model,
                                         d_k=d_model // h,
                                         d_v=d_model // h,
                                         dropout=0.1)
        feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=d_ff)
        position = PositionalEncoding(d_model, dropout=0.1)
        embedding = nn.Sequential(Embeddings(d_model=d_model, vocab=V),
                                  position)

        encoder = Encoder(self_attn=self_attn,
                          feed_forward=feed_forward,
                          size=d_model,
                          dropout=0.1)
        generator = Generator3(d_model=d_model, vocab_size=V)
        model = Bert(encoder=encoder,
                     embedding=embedding,
                     generator=generator,
                     n_layers=n_encoders)
        checkpoint = torch.load(path, map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        return model
Example #5
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    c = copy.deepcopy

    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    model = EncoderDecoder(
        encoder=Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        decoder=Decoder(
            DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        src_embed=nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        tgt_embed=nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        generator=Generator(d_model, tgt_vocab))

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model
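
make_model follows the Annotated Transformer recipe: build one attention, feed-forward, and positional-encoding module, deep-copy them into each layer, and initialize every matrix-shaped parameter with Xavier (Glorot) uniform. For readers who do not have those helper classes in scope, the sketch below shows roughly equivalent wiring with only built-in torch.nn modules; make_builtin_model is a hypothetical name, and note that nn.Transformer does not add positional encodings for you, so this sketch omits them.

import torch
import torch.nn as nn


def make_builtin_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Roughly equivalent wiring using stock torch.nn modules (positional encoding omitted)."""
    model = nn.ModuleDict({
        'src_embed': nn.Embedding(src_vocab, d_model),
        'tgt_embed': nn.Embedding(tgt_vocab, d_model),
        'transformer': nn.Transformer(d_model=d_model, nhead=h,
                                      num_encoder_layers=N, num_decoder_layers=N,
                                      dim_feedforward=d_ff, dropout=dropout,
                                      batch_first=True),
        'generator': nn.Linear(d_model, tgt_vocab),
    })
    # Same Glorot / fan_avg initialization as in the snippet above.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


m = make_builtin_model(src_vocab=1000, tgt_vocab=1000, N=2, d_model=128, d_ff=512, h=4)
src = m['src_embed'](torch.randint(0, 1000, (2, 10)))
tgt = m['tgt_embed'](torch.randint(0, 1000, (2, 7)))
logits = m['generator'](m['transformer'](src, tgt))  # (2, 7, 1000)
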
Example #6
    def __init__(self, config, src_vocab):
        super(Transformer, self).__init__()
        self.config = config
        self.src_vocab = src_vocab

        # Hyperparameters
        # h is the number of attention heads, N is the number of layers, dropout is the dropout rate
        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        # Word-embedding dimension and feed-forward (fully connected) dimension
        d_model, d_ff = self.config.d_model, self.config.d_ff

        # Multi-head attention layer
        attn = MultiHeadedAttention(h, d_model)
        # Position-wise feed-forward layer
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        # Positional encoding
        position = PositionalEncoding(d_model, dropout)

        self.encoder = Encoder(
            EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff),
                         dropout), N)
        self.src_embed = nn.Sequential(
            Embedding(self.config.d_model, self.src_vocab),
            deepcopy(position))  # embedding with position encoding

        self.fc = nn.Linear(self.config.d_model, self.config.output_size)
        self.softmax = nn.Softmax(dim=-1)
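
Only the constructor of this encoder-only classifier is shown; a forward pass typically embeds the tokens, adds positional encodings, runs the encoder, pools over positions, and applies the fully-connected layer. The sketch below is a self-contained illustration of that pattern using the built-in torch.nn encoder; EncoderClassifierSketch and its hyperparameter defaults are hypothetical, and the positional encoding is omitted for brevity.

import torch
import torch.nn as nn


class EncoderClassifierSketch(nn.Module):
    def __init__(self, vocab_size, d_model=128, d_ff=512, h=4, N=2, dropout=0.1, output_size=3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        layer = nn.TransformerEncoderLayer(d_model, nhead=h, dim_feedforward=d_ff,
                                           dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=N)
        self.fc = nn.Linear(d_model, output_size)

    def forward(self, tokens):
        x = self.encoder(self.embed(tokens))  # (batch, seq_len, d_model)
        pooled = x.mean(dim=1)                # mean-pool over positions
        return self.fc(pooled)                # logits; softmax can be applied outside


model = EncoderClassifierSketch(vocab_size=5000)
print(model(torch.randint(0, 5000, (8, 20))).shape)  # torch.Size([8, 3])
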
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """
    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: number of attention heads in the multi-head attention
        :param feed_forward_hidden: hidden size of the position-wise feed-forward layer, usually 4*hidden
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden,
                                                    d_ff=feed_forward_hidden,
                                                    dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(
            x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)
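
SublayerConnection is not defined in this snippet. In the BERT-pytorch code this block appears to come from, it wraps a sublayer with layer normalization, dropout, and a residual connection in the pre-norm order x + dropout(sublayer(norm(x))). A minimal sketch under that assumption:

import torch.nn as nn


class SublayerConnectionSketch(nn.Module):
    """Residual connection around a sublayer, with layer normalization applied first (pre-norm)."""

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # Normalize, run the sublayer (attention or feed-forward), apply dropout, add the residual.
        return x + self.dropout(sublayer(self.norm(x)))
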
Example #8
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
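
This is the same construction helper as in Example #5, written with positional rather than keyword arguments. A small usage sketch, assuming the Annotated-Transformer helper classes it relies on (Encoder, Decoder, Embeddings, and so on) are importable in the current scope; the toy vocabulary sizes are arbitrary:

tiny_model = make_model(src_vocab=11, tgt_vocab=11, N=2, d_model=64, d_ff=256, h=4)
print(sum(p.numel() for p in tiny_model.parameters()))  # parameter count of the toy model
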
Example #9
    def __init__(self, config, src_vocab):
        super(Transformer, self).__init__()
        self.config = config

        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        d_model, d_ff = self.config.d_model, self.config.d_ff

        attn = MultiHeadedAttention(h, d_model)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.encoder = Encoder(
            EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff),
                         dropout), N)
        self.src_embed = nn.Sequential(Embeddings(config.d_model, src_vocab))

        self.fc = nn.Linear(self.config.d_model, self.config.output_size)

        self.softmax = nn.Softmax(dim=-1)
Example #10
    def __init__(self, config):
        super(Transformer, self).__init__()
        self.config = config

        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        d_model, d_ff = self.config.d_model, self.config.d_ff

        attn = MultiHeadedAttention(h, d_model)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        position = PositionalEncoding(d_model, dropout)

        self.encoder = Encoder(EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
        # self.src_embed = nn.Sequential(Embeddings(config.d_model, src_vocab),
        #                                deepcopy(position))  # Embeddings followed by PE

        # Fully-Connected Layer
        self.fc = nn.Linear(
            self.config.d_model,
            self.config.output_size
        )
Example #11
    def __init__(self, attention_heads, dropout):

        layer_norm_residual_fn = lambda module, input_size, output_size: \
            LayerNormResidual(module, input_size, output_size, dropout)

        self.attention_fn = lambda query_size, mem_size, output_size: \
            layer_norm_residual_fn(MultiHeadedAttention(
                query_size, mem_size, mem_size, output_size, attention_heads),
                query_size, output_size)

        self.feed_forward_fn = lambda input_size, output_size: \
            layer_norm_residual_fn(
                nn.Sequential(
                    nn.Linear(input_size, output_size * 4),
                    nn.ReLU(),
                    nn.Linear(output_size * 4, output_size)
                ),
                input_size, output_size)

        super(TransformerLayer, self).__init__()
Example #12
    def __init__(self, config, pre_train_weight, embedding_size):
        super(Transformer, self).__init__()
        self.config = config
        self.pre_train_weight = pre_train_weight
        self.embedding_size = embedding_size
        
        # Hyperparameters
        # h is the number of attention heads, N is the number of layers, dropout is the dropout rate
        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        # Word-embedding dimension and feed-forward (fully connected) dimension
        d_model, d_ff = self.config.d_model, self.config.d_ff

        # Multi-head attention layer
        attn = MultiHeadedAttention(h, d_model)
        # Position-wise feed-forward layer
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)
        # Positional encoding
        position = PositionalEncoding(d_model, dropout)

        self.encoder = Encoder(
            EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff),
                         dropout), N)
        self.src_embed = nn.Sequential(
            Embedding(self.config.d_model, self.pre_train_weight,
                      self.embedding_size),
            deepcopy(position))  # embedding with position encoding
Example #13
    def __init__(self, config, src_vocab):
        super(Transformer, self).__init__()
        self.config = config

        h, N, dropout = self.config.h, self.config.N, self.config.dropout
        d_model, d_ff = self.config.d_model, self.config.d_ff
        self.src_vocab = src_vocab

        attn = MultiHeadedAttention(h, d_model)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.encoder_layer = EncoderLayer(config.d_model, deepcopy(attn),
                                          deepcopy(ff), dropout)
        self.encoder = Encoder(self.encoder_layer, N)

        self.src_word_emb = nn.Embedding(src_vocab,
                                         config.d_model,
                                         padding_idx=0)

        # self.pos_bias = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
        # self.pos_bias = nn.Embedding.from_pretrained(get_sinusoid_encoding_table_dim(src_vocab, config.d_model, padding_idx=0),freeze=True)
        # self.pos_bias = nn.Embedding.from_pretrained(get_sinusoid_encoding_table_vocab(src_vocab, config.d_model, padding_idx=0),freeze=True)

        # self.pos_bias = nn.Embedding(1, config.d_model, padding_idx=0)
        # self.pos_bias = nn.Embedding(src_vocab, 1, padding_idx=0)
        # self.position_enc = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(src_vocab,
                                        config.d_model,
                                        padding_idx=0),
            freeze=False)

        # position_enc = torch.randn(1000, config.d_model)
        # position_enc = position_enc.unsqueeze(0)
        # self.register_buffer('position_enc', position_enc)

        self.drop = nn.Dropout(p=dropout)
        self.fc = nn.Linear(self.config.d_model, self.config.output_size)

        self.softmax = nn.Softmax(dim=-1)
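
get_sinusoid_encoding_table is not defined in this snippet. A common implementation of the fixed sinusoidal table from Vaswani et al. (sine on even dimensions, cosine on odd, with the padding row zeroed) is sketched below; the helper in the original repository may differ in details, and note that here the table's first dimension is the vocabulary size rather than a maximum position.

import numpy as np
import torch


def sinusoid_encoding_table_sketch(n_position, d_model, padding_idx=None):
    """table[pos, 2i] = sin(pos / 10000^(2i/d_model)), table[pos, 2i+1] = cos(pos / 10000^(2i/d_model))."""
    position = np.arange(n_position)[:, None]                       # (n_position, 1)
    div = np.power(10000, 2 * (np.arange(d_model) // 2) / d_model)  # (d_model,)
    table = position / div
    table[:, 0::2] = np.sin(table[:, 0::2])  # even dimensions
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd dimensions
    if padding_idx is not None:
        table[padding_idx] = 0.0             # keep the pad row at zero
    return torch.FloatTensor(table)


# e.g. nn.Embedding.from_pretrained(sinusoid_encoding_table_sketch(src_vocab, d_model, padding_idx=0), freeze=False)
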
Example #14
def make_transformer_model(src_vocab, tgt_vocab, config):
    "Make a transformer model base on config."
    c = copy.deepcopy
    attn = MultiHeadedAttention(config['h'], config['d_model'])
    ff = PositionwiseFeedForward(config['d_model'], config['d_ff'], config['dropout'])
    position = PositionalEncoding(config['d_model'], config['dropout'])
    # word_embed = nn.Sequential(Embeddings(config['d_model'], src_vocab), c(position))
    embed, position = Embeddings(config['d_model'], src_vocab), c(position)
    model = EncoderDecoder(
        Encoder(EncoderLayer(config['d_model'], c(attn), c(ff), config['dropout']), config['num_layer']),
        Decoder(DecoderLayer(config['d_model'], c(attn), c(attn),
                             c(ff), config['dropout']), config['num_layer'], config['d_model'], tgt_vocab, config['pointer_gen']),
        embed, position, embed, position,
        config['tie_weights']
        )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
Example #15
    def __init__(self, layer, N, d_model, vocab, pointer_gen, ner_last):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)

        if pointer_gen:
            print('pointer_gen')
            self.bptr = nn.Parameter(torch.FloatTensor(1, 1))
            self.Wh = nn.Linear(d_model, 1)

            self.Wx = nn.Linear(d_model, 1)
            self.pointer_gen = True
        else:
            self.pointer_gen = False
        self.sm = nn.Softmax(dim=-1)
        self.ner_last = ner_last
        if self.ner_last:  # if last layer ner -> 2 * d_model (ner concat with x)
            self.proj = nn.Linear(2 * d_model, vocab)
            self.norm = LayerNorm(2 * layer.size)
            self.Ws = nn.Linear(2 * d_model, 1)
            self.ner_attn = MultiHeadedAttention(1, d_model, 0.3)
        else:
            self.proj = nn.Linear(d_model, vocab)
            self.norm = LayerNorm(layer.size)
            self.Ws = nn.Linear(d_model, 1)
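
The bptr, Wh, Wx, and Ws parameters above implement the copy/generate gate of a pointer-generator decoder (See et al., 2017): a scalar p_gen is computed from the attention context, the decoder state, and the decoder input, and the final distribution mixes the vocabulary distribution with the attention (copy) distribution over source tokens. The forward pass is not shown; the sketch below illustrates only the mixing step, with hypothetical tensor names:

import torch


def mix_pointer_generator(p_vocab, attn_dist, src_ids, p_gen):
    """p_final = p_gen * P_vocab + (1 - p_gen) * attention mass scattered onto source token ids."""
    # p_vocab:   (batch, vocab_size)  softmax over the output vocabulary
    # attn_dist: (batch, src_len)     attention over source positions
    # src_ids:   (batch, src_len)     source token ids used to place copy probabilities
    # p_gen:     (batch, 1)           sigmoid gate in [0, 1]
    p_final = p_gen * p_vocab
    return p_final.scatter_add(1, src_ids, (1.0 - p_gen) * attn_dist)


batch, vocab, src_len = 2, 50, 7
p_vocab = torch.softmax(torch.randn(batch, vocab), dim=-1)
attn = torch.softmax(torch.randn(batch, src_len), dim=-1)
src_ids = torch.randint(0, vocab, (batch, src_len))
p_gen = torch.sigmoid(torch.randn(batch, 1))
mixed = mix_pointer_generator(p_vocab, attn, src_ids, p_gen)
print(mixed.sum(dim=-1))  # each row sums to ~1
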
def create_batch(batch_size, n_batches):
    for _ in range(n_batches):
        chars = torch.from_numpy(
            np.random.randint(2, 28, size=(batch_size, 10))).long()
        batch = Batch(chars, None, pad_token)
        yield batch


V = 26 + 1 + 1
d_model = 256
h = 8

self_attn = MultiHeadedAttention(h=h,
                                 d_model=d_model,
                                 d_k=d_model // h,
                                 d_v=d_model // h,
                                 dropout=0.)
feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=1024)
embedding = Embeddings(d_model=d_model, vocab=V)

encoder = Encoder(self_attn=self_attn,
                  feed_forward=feed_forward,
                  size=d_model,
                  dropout=0.)
generator = Generator(d_model=d_model, vocab_size=V)
model = Bert(encoder=encoder,
             embedding=embedding,
             generator=generator,
             n_layers=4)
    def __init__(self, opt):
        """Basic model building blocks."""
        nn.Module.__init__(self)
        self.opt = opt

        if self.opt.arc_combine_method == 'attention':
            #self.attention = MultiHeadedAttention(h=self.opt.attn_heads, d_model=hidden)
            print(
                f'\n\nAttn_Input \n{self.opt.inputSize + self.opt.keySize}\n\n'
            )
            self.attention = MultiHeadedAttention(h=1,
                                                  d_model=self.opt.inputSize +
                                                  self.opt.keySize)
        else:
            self.attention = None

        if self.opt.grapheme_combination != 'None':
            self.is_graphemic = True

            if self.opt.grapheme_encoding:
                self.grapheme_encoder = GraphemeEncoder(self.opt)
                self.grapheme_attention = LuongAttention(
                    attn_type=self.opt.grapheme_combination,
                    num_features=self.opt.grapheme_hidden_size * 2,
                    initialisation=self.opt.init_grapheme)
                self.has_grapheme_encoding = True
            else:
                self.grapheme_attention = LuongAttention(
                    attn_type=self.opt.grapheme_combination,
                    num_features=self.opt.grapheme_features,
                    initialisation=self.opt.init_grapheme)
                self.has_grapheme_encoding = False
        else:
            self.is_graphemic = False

        num_directions = 2 if self.opt.bidirectional else 1

        if self.opt.encoder_type == 'ATTENTION':
            self.model_intermediate = Encoder(
                self.opt.inputSize,
                self.opt.hiddenSize,
                self.opt.hiddenSize,
                self.opt.init_word,
                self.opt.nLSTMLayers,
                use_bias=True,
                birdirectional=self.opt.bidirectional,
                attention=self.attention,
                attention_order=self.opt.attention_order,
                attention_key=self.opt.attention_key,
                dropout=self.opt.intermediate_dropout)
        else:
            self.model_intermediate = LSTM(
                LSTMCell,
                self.opt.inputSize,
                self.opt.hiddenSize,
                self.opt.nLSTMLayers,
                use_bias=True,
                bidirectional=self.opt.bidirectional,
                attention=self.attention)

        self.model_output = DNN_output(num_directions * self.opt.hiddenSize,
                                       self.opt.linearSize,
                                       1,
                                       self.opt.nFCLayers,
                                       self.opt.init_word,
                                       use_bias=True,
                                       logit=True)
Example #18
    def __init__(self, d_model, vocab_size, nheads):
        super(PointerGeneratorWithCoverage, self).__init__()
        self.fc = nn.Linear(2 * d_model, vocab_size)
        self.p_gen_fc = nn.Linear(2 * d_model, 1)
        self.single_head_attn1 = MultiHeadedAttention(d_model, 1)
        self.single_head_attn2 = AttentionWithCoverage(d_model, 400)
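
The coverage variant additionally tracks how much attention each source position has already received and penalizes attending to the same positions again. The AttentionWithCoverage module is not shown; the sketch below only illustrates the usual coverage bookkeeping and coverage loss from See et al. (2017), with hypothetical names:

import torch


def coverage_step(attn_t, coverage):
    """One decoder step: coverage loss = sum_i min(attn_t[i], coverage[i]); then accumulate attention."""
    cov_loss = torch.sum(torch.minimum(attn_t, coverage), dim=-1)  # (batch,)
    coverage = coverage + attn_t                                   # coverage for the next step
    return cov_loss, coverage


batch, src_len = 2, 7
coverage = torch.zeros(batch, src_len)
attn_t = torch.softmax(torch.randn(batch, src_len), dim=-1)
loss_t, coverage = coverage_step(attn_t, coverage)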