Example #1
    def __init__(self, config):
        super(Net, self).__init__()
        self.d_l, self.d_a, self.d_v = config["input_dims"]
        self.dh_l, self.dh_a, self.dh_v = config["h_dims"]

        self.final_dims = config["final_dims"]

        self.h_dim = config["h_dim"]

        self.wordLSTM = nn.LSTM(self.d_l, self.dh_l, bidirectional=False)
        self.covarepLSTM = nn.LSTM(self.d_a, self.dh_a, bidirectional=False)
        self.facetLSTM = nn.LSTM(self.d_v, self.dh_v, bidirectional=False)

        self.coattention = Coattention(self.dh_l, self.dh_a)
        self.coattention1 = Coattention(self.dh_l, self.dh_v)
        self.coattention2 = Coattention(self.dh_v, self.dh_a)

        self.aggregateLSTM = nn.LSTM(self.dh_a * 2 + self.dh_v * 2 + self.dh_l,
                                     self.h_dim,
                                     bidirectional=False,
                                     batch_first=True)

        self.dropout1 = nn.Dropout(config["dropout1"])
        self.dropout2 = nn.Dropout(config["dropout2"])

        self.wordLSTMlinear = nn.Linear(self.dh_l, self.dh_l)
        self.wordLSTMfinal = nn.Linear(self.dh_l, 1)

        self.attention = Attention(self.dh_l)
        self.outputlinear = nn.Linear(self.h_dim * 2, self.final_dims)
        self.finallinear = nn.Linear(self.final_dims, 1)
Example #2
def test_attention(config):
    attention = Attention(config, config.attn_type, 1024, 1024)
    h_s = torch.randn(7, 36, 1024)  # encoder (source) states
    h_t = torch.randn(7, 5, 1024)  # decoder (target) states
    m_s = torch.randn(7, 36).random_(0, 2)  # binary source mask
    context, scores = attention(h_t, h_s, m_s)
    print(context.size(), scores.size())
Example #3
    def __init__(self, config, embedding):

        super(RNNDecoder, self).__init__()

        # embedding
        self.embedding = embedding
        self.embedding_size = embedding.embedding_dim

        # dropout
        self.dropout = nn.Dropout(config.dropout)

        self.rnn = nn.GRU(input_size=self.embedding_size,
                          hidden_size=config.hidden_size,
                          num_layers=config.dec_num_layers,
                          dropout=config.dropout)

        init_gru_orth(self.rnn)

        self.enc_attn = Attention(config.hidden_size)

        self.linear = nn.Linear(config.hidden_size, config.vocab_size)
        init_linear_wt(self.linear)

        if config.tied:
            self.linear.weight = self.embedding.weight
Example #4
def test_attention(config):
    attention = Attention(config.attn_type, 256, 128)
    h_s = torch.randn(5, 6, 256)  # encoder (source) states
    h_t = torch.randn(5, 5, 128)  # decoder (target) states
    src_mask = torch.randn(5, 6).random_(0, 2)  # binary source mask
    context, scores = attention(h_t, h_s, src_mask)

    print('context.size()', context.size())
    print('scores.size()', scores.size())
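The Attention module exercised by this test comes from the example's own repository, so its internals are not shown here. For orientation, below is a minimal sketch (not the repository's implementation) of a masked, Luong-style "general" attention that produces tensors of the same shapes; the class name GeneralAttention and its single projection layer are illustrative assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F

class GeneralAttention(nn.Module):
    """Hypothetical stand-in: decoder states attend over encoder states under a source mask."""
    def __init__(self, tgt_dim, src_dim):
        super().__init__()
        self.proj = nn.Linear(tgt_dim, src_dim, bias=False)

    def forward(self, h_t, h_s, src_mask):
        # h_t: (batch, tgt_len, tgt_dim), h_s: (batch, src_len, src_dim), src_mask: (batch, src_len)
        scores = torch.bmm(self.proj(h_t), h_s.transpose(1, 2))  # (batch, tgt_len, src_len)
        scores = scores.masked_fill(src_mask.unsqueeze(1) == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, h_s)  # (batch, tgt_len, src_dim)
        return context, weights

attention = GeneralAttention(128, 256)
h_s = torch.randn(5, 6, 256)
h_t = torch.randn(5, 5, 128)
src_mask = torch.ones(5, 6)
src_mask[:, 4:] = 0  # mask out the last two source positions
context, scores = attention(h_t, h_s, src_mask)
print('context.size()', context.size())  # torch.Size([5, 5, 256])
print('scores.size()', scores.size())    # torch.Size([5, 5, 6])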
Example #5
def build_baseline(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    fusion = FCNet([num_hid, num_hid * 2], dropout=0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, fusion, num_hid,
                     dataset.num_ans_candidates)
Example #6
    def __init__(self, input_scheme, args):
        super(ATTRNNAgent, self).__init__()
        self.args = args

        fixed_inputs = []
        var_inputs = []
        idx = 0
        len_fixed = 0
        split = []
        for part in input_scheme:
            if type(part) == int:
                # part: len
                fixed_inputs.append((idx, part))
                idx += part
                len_fixed += part
                split.append(part)
            else:
                # part: len * n
                var_inputs.append((idx, part[0], part[1]))
                idx += part[0] * part[1]
                split.append(part[0] * part[1])

        attns = []
        vfc1s = []
        vfc2s = []
        n_var = len(var_inputs)
        len_attn = 0
        ffc1 = nn.Linear(len_fixed, args.attn_hidden_dim)
        for i in range(n_var):
            vfc1s.append(nn.Linear(var_inputs[i][1], args.attn_hidden_dim))
            attns.append(
                Attention(args.attn_hidden_dim, args.attn_hidden_dim,
                          args.attn_hidden_dim, args.attn_n_heads))
            # print(var_inputs[i][1])
            vfc2s.append(
                nn.Linear(args.attn_hidden_dim * args.attn_n_heads,
                          args.attn_hidden_dim))
            len_attn += args.attn_hidden_dim

        ffc2 = nn.Linear(args.attn_hidden_dim, args.attn_hidden_dim)
        len_attn += args.attn_hidden_dim

        self.split = split
        self.input_scheme = input_scheme
        self.attns = nn.ModuleList(attns)
        self.vfc1s = nn.ModuleList(vfc1s)
        self.vfc2s = nn.ModuleList(vfc2s)
        self.ffc1 = ffc1
        self.ffc2 = ffc2

        self.fc1 = nn.Linear(len_attn, args.rnn_hidden_dim)
        if args.use_rnn:
            self.rnn = nn.GRUCell(args.rnn_hidden_dim, args.rnn_hidden_dim)
        else:
            self.rnn = None
        # print(args.n_actions)
        self.fc2 = nn.Linear(args.rnn_hidden_dim, args.n_actions)
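The loop at the top of this constructor only does bookkeeping: an integer entry in input_scheme describes a fixed-length block, while a (length, count) entry describes count variable entities of length features each. A small standalone sketch of that parsing, using a made-up scheme [10, (4, 3)] purely for illustration:

# Hypothetical input scheme: 10 fixed features, then 3 entities of 4 features each.
input_scheme = [10, (4, 3)]

fixed_inputs, var_inputs, split = [], [], []
idx = len_fixed = 0
for part in input_scheme:
    if isinstance(part, int):
        fixed_inputs.append((idx, part))  # (start index, length)
        idx += part
        len_fixed += part
        split.append(part)
    else:
        var_inputs.append((idx, part[0], part[1]))  # (start index, length, count)
        idx += part[0] * part[1]
        split.append(part[0] * part[1])

print(fixed_inputs)  # [(0, 10)]
print(var_inputs)    # [(10, 4, 3)]
print(split)         # [10, 12]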
Example #7
def build_baseline(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    a_mask = AnswerMask(num_hid, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, a_mask)
Example #8
    def __init__(self, embedding, training):
        super(Decoder, self).__init__()
        self._config = DecoderConfig()
        self._training = training

        self._key_size = EncoderConfig().hidden_size
        self._value_size = self._key_size
        self._context_size = AttentionConfig().context_size

        self._embedding = embedding

        self._forward_step = {
            'bahdanau': self._bahdanau_step,
            'luong': self._luong_step
        }[self._config.attention_mechanism]

        self._attention = Attention(query_size=self._config.hidden_size,
                                    key_size=self._key_size,
                                    value_size=self._value_size)

        rnn_input_size = {
            'bahdanau': self._embedding.embedding_size + self._context_size,
            'luong': self._embedding.embedding_size
        }[self._config.attention_mechanism]

        mlp_input_size = {
            'bahdanau':
            self._embedding.embedding_size + self._config.hidden_size +
            self._context_size,
            'luong':
            self._config.hidden_size + self._context_size
        }[self._config.attention_mechanism]

        self._rnn = Decoder.RNN_FACTORY[self._config.rnn_type](
            input_size=rnn_input_size,
            hidden_size=self._config.hidden_size,
            num_layers=self._config.num_layers,
            bidirectional=False,
            bias=self._config.rnn_bias,
            dropout=(self._config.rnn_dropout_probability if self._training
                     and self._config.rnn_dropout_enabled else 0))

        self._mlp = nn.Sequential(
            nn.Linear(mlp_input_size, self._config.hidden_size), nn.Tanh(),
            nn.Linear(self._config.hidden_size,
                      self._embedding.vocabulary.size))

        self._teacher_forcing = TeacherForcing()
Example #9
    def __init__(self, params):
        super(RNNAutoEncoder, self).__init__()

        self.params = params

        self.embedding = nn.Embedding(self.params["vocab_size"],
                                      self.params["embedding_size"])
        nn.init.kaiming_uniform_(self.embedding.weight, mode='fan_in')
        self.dropout_emb = nn.Dropout(0.3)
        self.embedding_full = nn.Sequential(self.embedding, self.dropout_emb)

        self.encoder = Encoder(self.params, self.embedding_full)

        for name, p in self.encoder.named_parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
            else:
                p.data.uniform_(-0.1, 0.1)
            if "input_to_logvar.bias" in name:
                p.data.uniform_(-10.0, -5.0)

        self.dropout_decoder = nn.Dropout(self.params["dropout"])

        self.decoders = nn.ModuleList()
        for i in range(self.params["nb_decoders"]):
            decoder = nn.LSTM(input_size=self.params["embedding_size"],
                              hidden_size=self.params["decoder_hidden_size"],
                              num_layers=self.params["num_decoder_layers"],
                              batch_first=True,
                              bidirectional=False,
                              dropout=self.params["dropout"])
            for p in decoder.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)
                else:
                    p.data.uniform_(-0.1, 0.1)
            self.decoders.append(decoder)

        self.attention = Attention(self.params["decoder_hidden_size"])
        self.normalizer = nn.Linear(self.params["decoder_hidden_size"],
                                    self.params["vocab_size"])
        self.normalizer.weight.data.uniform_(-0.1, 0.1)
        self.train()
        self.step = 0
        self.to_track = -1
        self.offset = 0
Example #10
    def __init__(self, config):
        super(WhereDecoder, self).__init__()

        self.cfg = config

        #################################################################
        # Dimensions
        #################################################################
        # If the encoder is bidirectional, double the size of the hidden state
        factor = 2 if config.bidirectional else 1
        src_dim = factor * config.n_src_hidden
        tgt_dim = factor * config.n_tgt_hidden
        emb_dim = config.n_embed
        bgf_dim = config.n_conv_hidden
        fgf_dim = config.output_cls_size

        #################################################################
        # Attention
        #################################################################
        if self.cfg.what_attn and self.cfg.where_attn > 0:
            in_dim = fgf_dim
            out_dim = src_dim
            if self.cfg.attn_emb:
                out_dim += emb_dim
            if self.cfg.where_attn == 2:
                in_dim += out_dim
            self.attention = Attention(config.attn_type, out_dim, in_dim)
            # print('in_dim',  in_dim)
            # print('out_dim', out_dim)

            if self.cfg.where_attn_2d:
                in_dim_2d = out_dim + tgt_dim
                if self.cfg.use_bn:
                    self.spatial_attn = nn.Sequential(
                        nn.Conv2d(in_dim_2d,
                                  tgt_dim // 2,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1),
                        nn.BatchNorm2d(tgt_dim // 2),
                        nn.ReLU(True),
                        nn.Conv2d(tgt_dim // 2,
                                  1,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1),
                    )
                else:
                    self.spatial_attn = nn.Sequential(
                        nn.Conv2d(in_dim_2d,
                                  tgt_dim // 2,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1),
                        # nn.BatchNorm2d(tgt_dim//2),
                        nn.ReLU(True),
                        nn.Conv2d(tgt_dim // 2,
                                  1,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1),
                    )

        #################################################################
        # Location decoder
        #################################################################
        if self.cfg.what_attn:
            input_dim = tgt_dim + fgf_dim + src_dim
            if self.cfg.attn_emb:
                input_dim += emb_dim
        else:
            input_dim = tgt_dim + fgf_dim

        if self.cfg.use_bg_to_locate:
            input_dim += bgf_dim
        # print('input_dim',  input_dim)

        if config.use_bn:
            self.decoder = nn.Sequential(
                nn.Conv2d(input_dim,
                          tgt_dim,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.BatchNorm2d(tgt_dim),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim,
                          tgt_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.BatchNorm2d(tgt_dim // 2),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim // 2,
                          tgt_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.BatchNorm2d(tgt_dim // 2),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim // 2, 18, kernel_size=3, stride=1,
                          padding=1),
            )
        else:
            self.decoder = nn.Sequential(
                nn.Conv2d(input_dim,
                          tgt_dim,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                # nn.BatchNorm2d(tgt_dim),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim,
                          tgt_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                # nn.BatchNorm2d(tgt_dim//2),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim // 2,
                          tgt_dim // 2,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                # nn.BatchNorm2d(tgt_dim//2),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim // 2, 18, kernel_size=3, stride=1,
                          padding=1),
            )
Example #11
    def __init__(self, config):
        super(WhatDecoder, self).__init__()

        self.cfg = config

        #################################################################
        # Dimensions
        #################################################################
        # If the encoder is bidirectional, double the size of the hidden state
        factor = 2 if config.bidirectional else 1
        src_dim = factor * config.n_src_hidden
        tgt_dim = factor * config.n_tgt_hidden

        emb_dim = config.n_embed
        bgf_dim = config.n_conv_hidden
        fgf_dim = config.output_cls_size

        #################################################################
        # Conv RNN
        #################################################################
        input_dim = bgf_dim
        if config.use_fg_to_pred == 1:
            # use prev_fg_onehot (the previous foreground object label) as
            # input for the current object prediction
            input_dim += fgf_dim
        rnn_cell = config.rnn_cell.lower()
        if rnn_cell == 'gru':
            self.rnn = ConvGRU(input_dim,
                               tgt_dim,
                               config.n_rnn_layers,
                               3,
                               bias=True,
                               dropout=config.rnn_dropout_p)
        elif rnn_cell == 'lstm':
            self.rnn = ConvLSTM(input_dim,
                                tgt_dim,
                                config.n_rnn_layers,
                                3,
                                bias=True,
                                dropout=config.rnn_dropout_p)
        else:
            raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))

        #################################################################
        # Spatial attention
        #################################################################
        if self.cfg.attn_2d:
            if self.cfg.use_bn:
                self.spatial_attn = nn.Sequential(
                    nn.Conv2d(tgt_dim,
                              tgt_dim // 2,
                              kernel_size=3,
                              stride=1,
                              padding=1),
                    nn.BatchNorm2d(tgt_dim // 2),
                    nn.ReLU(True),
                    nn.Conv2d(tgt_dim // 2,
                              1,
                              kernel_size=3,
                              stride=1,
                              padding=1),
                )
            else:
                self.spatial_attn = nn.Sequential(
                    nn.Conv2d(tgt_dim,
                              tgt_dim // 2,
                              kernel_size=3,
                              stride=1,
                              padding=1),
                    # nn.BatchNorm2d(tgt_dim//2),
                    nn.ReLU(True),
                    nn.Conv2d(tgt_dim // 2,
                              1,
                              kernel_size=3,
                              stride=1,
                              padding=1),
                )

        #################################################################
        # Attention
        #################################################################
        if self.cfg.what_attn:
            in_dim = tgt_dim
            out_dim = src_dim
            if self.cfg.attn_emb:
                out_dim += emb_dim
            # if self.cfg.use_bg_to_pred:
            #     in_dim += bgf_dim
            if self.cfg.use_fg_to_pred == 2:
                in_dim += fgf_dim
            self.attention = Attention(config.attn_type, out_dim, in_dim)

        #################################################################
        # Segment pooling
        #################################################################
        if self.cfg.hidden_pooling_mode == 0:
            self.seg_pool = nn.AvgPool1d(3)
        else:
            self.seg_pool = nn.MaxPool1d(3)

        #################################################################
        # Object decoder
        #################################################################
        input_dim = tgt_dim
        if self.cfg.what_attn:
            input_dim += src_dim
            if self.cfg.attn_emb:
                input_dim += emb_dim
        if self.cfg.use_bg_to_pred:
            input_dim += bgf_dim
        if self.cfg.use_fg_to_pred == 2:
            input_dim += fgf_dim

        hidden_dim = tgt_dim
        self.decoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=True),
            # nn.BatchNorm1d(hidden_dim),
            nn.ReLU(True),
            nn.Linear(hidden_dim, fgf_dim))
Example #12
    def __init__(self,
                 params,
                 vocab_size,
                 embed_size,
                 hidden_size,
                 *,
                 enc_attn=True,
                 dec_attn=True,
                 enc_attn_cover=True,
                 pointer=True,
                 tied_embedding=None,
                 out_embed_size=None,
                 in_drop: float = 0,
                 rnn_drop: float = 0,
                 out_drop: float = 0,
                 enc_hidden_size=None,
                 enc_attn_temporal=None,
                 attn_func_name=None):
        super(DecoderRNN, self).__init__()
        self.params = params
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size  # decoder hidden size; if not specified, twice the encoder hidden size
        self.combined_size = self.hidden_size

        self.attn_func_name = attn_func_name
        self.enc_attn = enc_attn
        self.enc_attn_temporal = enc_attn_temporal
        self.dec_attn = dec_attn
        self.enc_attn_cover = enc_attn_cover
        self.pointer = pointer

        self.out_embed_size = out_embed_size
        #  the output word embeddings are tied to the input ones
        if tied_embedding is not None and self.out_embed_size and embed_size != self.out_embed_size:
            print(
                "Warning: Output embedding size %d is overriden by its tied embedding size %d."
                % (self.out_embed_size, embed_size))
            self.out_embed_size = embed_size  # 使用输入的embed_size来直接作为out_embed_size

        self.in_drop = nn.Dropout(in_drop) if in_drop > 0 else None
        self.gru = nn.GRU(embed_size, self.hidden_size, dropout=rnn_drop)

        if enc_attn:
            if not enc_hidden_size:
                enc_hidden_size = self.hidden_size

            # if attn_func_name.lower() == "tanh":
            #     self.enc_attn_func =
            # elif attn_func_name.lower() == "bilinear":
            #     self.enc_attn_func = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
            #     # self.enc_bilinear = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
            # else:
            #     print("Attention function should be tanh or bilinear!")
            #     exit(0)

            # hidden_size : decoder hidden size
            # enc_hidden_size : encoder total size
            print("Using {} attention.".format(params.attn_func_name))
            self.attn = Attention(method=params.attn_func_name,
                                  batch_size=params.batch_size,
                                  decoder_hidden_size=hidden_size,
                                  encoder_total_size=enc_hidden_size,
                                  encoder_hidden_size=params.hidden_size)

            self.combined_size += enc_hidden_size  # decoder hidden + encoder hidden
            if enc_attn_cover:
                self.cover_weight = nn.Parameter(torch.rand(1))

        if dec_attn:
            self.dec_bilinear = nn.Bilinear(self.hidden_size, self.hidden_size,
                                            1)
            self.combined_size += self.hidden_size  # decoder hidden + decoder hidden

        self.out_drop = nn.Dropout(out_drop) if out_drop > 0 else None

        if pointer:
            # compute the copy probability
            self.ptr = nn.Linear(self.combined_size, 1)

        if tied_embedding is not None and embed_size != self.combined_size:
            # use pre_out layer if combined size is different from embedding size
            self.out_embed_size = embed_size

        if self.out_embed_size:  # use pre_out layer
            self.pre_out = nn.Linear(self.combined_size, self.out_embed_size)
            size_before_output = self.out_embed_size
        else:  # don't use pre_out layer
            size_before_output = self.combined_size

        self.out = nn.Linear(size_before_output, vocab_size)
        if tied_embedding is not None:
            self.out.weight = tied_embedding.weight
Example #13
class DecoderRNN(nn.Module):
    def __init__(self,
                 params,
                 vocab_size,
                 embed_size,
                 hidden_size,
                 *,
                 enc_attn=True,
                 dec_attn=True,
                 enc_attn_cover=True,
                 pointer=True,
                 tied_embedding=None,
                 out_embed_size=None,
                 in_drop: float = 0,
                 rnn_drop: float = 0,
                 out_drop: float = 0,
                 enc_hidden_size=None,
                 enc_attn_temporal=None,
                 attn_func_name=None):
        super(DecoderRNN, self).__init__()
        self.params = params
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size  # decoder hidden size; if not specified, twice the encoder hidden size
        self.combined_size = self.hidden_size

        self.attn_func_name = attn_func_name
        self.enc_attn = enc_attn
        self.enc_attn_temporal = enc_attn_temporal
        self.dec_attn = dec_attn
        self.enc_attn_cover = enc_attn_cover
        self.pointer = pointer

        self.out_embed_size = out_embed_size
        #  the output word embeddings are tied to the input ones
        if tied_embedding is not None and self.out_embed_size and embed_size != self.out_embed_size:
            print(
                "Warning: Output embedding size %d is overriden by its tied embedding size %d."
                % (self.out_embed_size, embed_size))
            self.out_embed_size = embed_size  # 使用输入的embed_size来直接作为out_embed_size

        self.in_drop = nn.Dropout(in_drop) if in_drop > 0 else None
        self.gru = nn.GRU(embed_size, self.hidden_size, dropout=rnn_drop)

        if enc_attn:
            if not enc_hidden_size:
                enc_hidden_size = self.hidden_size

            # if attn_func_name.lower() == "tanh":
            #     self.enc_attn_func =
            # elif attn_func_name.lower() == "bilinear":
            #     self.enc_attn_func = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
            #     # self.enc_bilinear = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
            # else:
            #     print("Attention function should be tanh or bilinear!")
            #     exit(0)

            # hidden_size : decoder hidden size
            # enc_hidden_size : encoder total size
            print("Using {} attention.".format(params.attn_func_name))
            self.attn = Attention(method=params.attn_func_name,
                                  batch_size=params.batch_size,
                                  decoder_hidden_size=hidden_size,
                                  encoder_total_size=enc_hidden_size,
                                  encoder_hidden_size=params.hidden_size)

            self.combined_size += enc_hidden_size  # decoder hidden + encoder hidden
            if enc_attn_cover:
                self.cover_weight = nn.Parameter(torch.rand(1))

        if dec_attn:
            self.dec_bilinear = nn.Bilinear(self.hidden_size, self.hidden_size,
                                            1)
            self.combined_size += self.hidden_size  # decoder hidden + decoder hidden

        self.out_drop = nn.Dropout(out_drop) if out_drop > 0 else None

        if pointer:
            # compute the copy probability
            self.ptr = nn.Linear(self.combined_size, 1)

        if tied_embedding is not None and embed_size != self.combined_size:
            # use pre_out layer if combined size is different from embedding size
            self.out_embed_size = embed_size

        if self.out_embed_size:  # use pre_out layer
            self.pre_out = nn.Linear(self.combined_size, self.out_embed_size)
            size_before_output = self.out_embed_size
        else:  # don't use pre_out layer
            size_before_output = self.combined_size

        self.out = nn.Linear(size_before_output, vocab_size)
        if tied_embedding is not None:
            self.out.weight = tied_embedding.weight

    def forward(self,
                embedded,
                hidden,
                encoder_states=None,
                decoder_states=None,
                coverage_vector=None,
                *,
                encoder_word_idx=None,
                ext_vocab_size: int = None,
                log_prob: bool = True,
                mask=None):
        """
        :param embedded: (batch size, embed size), tokens are fed in one at a time
        :param hidden: (1, batch size, decoder hidden size), the output of the encoder
        :param encoder_states: (src seq len, batch size, hidden size), for attention mechanism
        :param decoder_states: (past dec steps, batch size, hidden size), for attention mechanism
        :param encoder_word_idx: (src seq len, batch size), for pointer network
        :param ext_vocab_size: the dynamic vocab size, determined by the max num of OOV words contained
                               in any src seq in this batch, for pointer network
        :param log_prob: return log probability instead of probability
        :return: tuple of four things:
                 1. word prob or log word prob, (batch size, dynamic vocab size);
                 2. RNN hidden state after this step, (1, batch size, decoder hidden size);
                 3. attention weights over encoder states, (batch size, src seq len);
                 4. prob of copying by pointing as opposed to generating, (batch size, 1)

        Perform single-step decoding.
        """
        batch_size = embedded.size(0)  # (batch size, embed size): one word of each sentence in the batch
        # buffer used to combine the hidden state, context vector, etc.
        combined = torch.zeros(batch_size, self.combined_size, device=DEVICE)

        if self.params.debug:
            print("combined size:{}".format(combined.size()))

        if self.in_drop:
            # dropout on the embedded input
            embedded = self.in_drop(embedded)

        output, hidden = self.gru(
            embedded.unsqueeze(0), hidden
        )  # input is (1, batch size, embed size); unsqueeze and squeeze are necessary
        if self.params.debug:
            print("Decoder output:")
            print("     Output:{}".format(
                output.size()))  # (1, batch size, hidden size)
            print("     Hidden:{}".format(
                hidden.size()))  # (1, batch size, hidden size)

        # hidden:(1, batch, hidden)
        combined[:, :self.hidden_size] = output.squeeze(
            0)  # (batch, hidden) as RNN expects a 3D tensor (step=1)

        # offset for the next segment written into the combined vector
        offset = self.hidden_size
        enc_attn, prob_ptr = None, None  # for visualization

        # enc_attn: produce the context vector; the generation probability can be
        # computed from the context vector alone, without copying, so this is a separate flag
        # pointer: use the copy mechanism, which requires the context vector
        if self.enc_attn or self.pointer:
            # energy and attention: (num encoder states, batch size, 1)
            # encoder_states is the encoder output (src_len, batch, hidden)
            num_enc_steps = encoder_states.size(0)  # number of encoder states to attend over
            enc_total_size = encoder_states.size(2)  # encoder feature dimension
            if self.params.debug:
                print("num_enc_steps:{}".format(num_enc_steps))
                print("enc_total_size:{}".format(enc_total_size))

            enc_attn = self.attn.forward(hidden, encoder_states, mask)

            # # use the current hidden state to compute the attention weights
            # # expand hidden
            # # enc_energy: (src_len, batch, 1)
            # enc_energy = self.enc_attn_func(hidden.expand(num_enc_steps, batch_size, -1).contiguous(),
            #                                 encoder_states)
            # if self.params.debug:
            #     print("enc_energy:{}".format(enc_energy))
            #     print("enc_energy size:{}".format(enc_energy.size()))
            #
            # # # use coverage
            # # # as in the paper, the attention here should take previously computed attn into account
            # # if self.enc_attn_cover and self.enc_attn_temporal and coverage_vector is not None:
            # #     if self.params.debug:
            # #         print("cover_weight:{}".format(self.cover_weight))
            # #     enc_energy += self.cover_weight * torch.log(coverage_vector.transpose(0, 1).unsqueeze(2) + eps)
            # # transpose => (batch size, num encoder states, 1)
            # enc_attn = F.softmax(enc_energy, dim=0).transpose(0, 1)

            if self.params.debug:
                print("enc_attn:{}".format(enc_attn))
                print("enc_attn size:{}".format(
                    enc_attn.size()))  # (batch, src_len, 1)

            # get context
            if self.enc_attn:
                # context: (batch size, encoder hidden size, 1)
                enc_context = torch.bmm(encoder_states.permute(1, 2, 0),
                                        enc_attn)
                if self.params.debug:
                    print("enc_context size:{}".format(
                        enc_context.size()))  # (batch, hidden, 1)
                combined[:, offset:offset +
                         enc_total_size] = enc_context.squeeze(2)
                offset += enc_total_size
            enc_attn = enc_attn.squeeze(2)

        if self.dec_attn:
            if decoder_states is not None and len(decoder_states) > 0:
                dec_energy = self.dec_bilinear(
                    hidden.expand_as(decoder_states).contiguous(),
                    decoder_states)
                dec_attn = F.softmax(dec_energy, dim=0).transpose(0, 1)
                dec_context = torch.bmm(decoder_states.permute(1, 2, 0),
                                        dec_attn)
                combined[:, offset:offset +
                         self.hidden_size] = dec_context.squeeze(2)
            offset += self.hidden_size

        if self.out_drop:
            combined = self.out_drop(combined)

        # generator
        if self.out_embed_size:
            # project to an intermediate output size before mapping to the vocabulary
            out_embed = self.pre_out(combined)
        else:
            out_embed = combined

        # project to the vocabulary dimension
        logits = self.out(out_embed)  # (batch size, vocab size)

        # pointer
        if self.pointer:
            output = torch.zeros(batch_size, ext_vocab_size, device=DEVICE)

            # distribute probabilities between generator and pointer
            prob_ptr = torch.sigmoid(self.ptr(combined))  # (batch size, 1)
            prob_gen = 1 - prob_ptr
            # add generator probabilities to output
            gen_output = F.softmax(
                logits,
                dim=1)  # can't use log_softmax due to adding probabilities
            output[:, :self.vocab_size] = prob_gen * gen_output
            # add pointer probabilities to output
            ptr_output = enc_attn  # (batch, src_len)
            # encoder_word_idx: encoder input tensor (src_len, batch)
            # encoder_word_idx.transpose(0, 1) ==> (batch, src_len)
            # The first part of `output` already holds the generation probabilities over
            # the original vocab; copy probabilities are then scatter-added, so in-vocab
            # tokens get copy mass added to their existing probability, while OOV tokens
            # receive only the copy probability.
            output.scatter_add_(1, encoder_word_idx.transpose(0, 1),
                                prob_ptr * ptr_output)
            if self.params.debug:
                print("output size:{}".format(output.size()))
            if log_prob:
                output = torch.log(output + eps)
        else:
            if log_prob:
                output = F.log_softmax(logits, dim=1)
            else:
                output = F.softmax(logits, dim=1)

        # output (batch, ext_vocab_size)
        # hidden (1, batch, hidden)
        # enc_attn (batch size, src_len)
        # prob_ptr (batch size, 1)
        return output, hidden, enc_attn, prob_ptr
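The pointer branch of forward() mixes a generation distribution over the fixed vocabulary with a copy distribution over the source positions via scatter_add_. Below is a minimal, self-contained sketch of just that mixing step; all sizes and the eps value are made up for illustration, and only torch is assumed.

import torch
import torch.nn.functional as F

batch_size, vocab_size, ext_vocab_size, src_len = 2, 10, 13, 6
eps = 1e-31  # illustrative epsilon for numerical stability

logits = torch.randn(batch_size, vocab_size)                   # generator scores
enc_attn = F.softmax(torch.randn(batch_size, src_len), dim=1)  # copy distribution over source positions
prob_ptr = torch.sigmoid(torch.randn(batch_size, 1))           # probability of copying vs. generating
encoder_word_idx = torch.randint(0, ext_vocab_size, (src_len, batch_size))  # source token ids (src_len, batch)

output = torch.zeros(batch_size, ext_vocab_size)
# generation mass goes to the fixed-vocabulary slots
output[:, :vocab_size] = (1 - prob_ptr) * F.softmax(logits, dim=1)
# copy mass is added at the extended-vocab ids of the source tokens
output.scatter_add_(1, encoder_word_idx.transpose(0, 1), prob_ptr * enc_attn)
log_output = torch.log(output + eps)  # log-probabilities over the extended vocabulary
print(log_output.size())              # torch.Size([2, 13])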
Example #14
    def __init__(self, config):
        super(WhereDecoder, self).__init__()
        self.cfg = config
        if self.cfg.use_separable_convolution:
            conv2d = separable_conv2d
        else:
            conv2d = nn.Conv2d
        #################################################################
        # Dimensions
        #################################################################
        # If the encoder is bidirectional, double the size of the hidden state
        factor = 2 if config.bidirectional else 1
        src_dim = factor * config.n_src_hidden
        tgt_dim = factor * config.n_tgt_hidden
        emb_dim = config.n_embed
        bgf_dim = config.n_conv_hidden
        fgf_dim = config.output_vocab_size

        #################################################################
        # Attention
        #################################################################
        if self.cfg.what_attn and self.cfg.where_attn > 0:
            in_dim = fgf_dim
            out_dim = src_dim
            if self.cfg.attn_emb:
                out_dim += emb_dim
            if self.cfg.where_attn == 2:
                in_dim += out_dim
            self.attention = Attention(config.attn_type, out_dim, in_dim)

            if self.cfg.where_attn_2d:
                in_dim_2d = out_dim + tgt_dim
                attn2d_layers = []
                if self.cfg.use_normalization:
                    attn2d_layers.append(
                        conv2d(in_dim_2d,
                               tgt_dim // 2,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False))
                    attn2d_layers.append(
                        nn.LayerNorm([
                            tgt_dim // 2, self.cfg.grid_size[0],
                            self.cfg.grid_size[1]
                        ]))
                else:
                    attn2d_layers.append(
                        conv2d(in_dim_2d,
                               tgt_dim // 2,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=True))
                attn2d_layers.append(nn.LeakyReLU(0.2, inplace=True))
                attn2d_layers.append(
                    conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1,
                           padding=1))
                self.spatial_attn = nn.Sequential(*attn2d_layers)

        #################################################################
        # Location decoder
        #################################################################
        input_dim = tgt_dim + fgf_dim
        if self.cfg.what_attn:
            input_dim += src_dim
            if self.cfg.attn_emb:
                input_dim += emb_dim

        if self.cfg.use_bg_to_locate:
            input_dim += bgf_dim

        output_dim = 1 + self.cfg.num_scales + self.cfg.num_ratios + self.cfg.n_patch_features

        if config.use_normalization:
            self.decoder = nn.Sequential(
                conv2d(input_dim,
                       tgt_dim,
                       kernel_size=3,
                       stride=1,
                       padding=1,
                       bias=False),
                nn.LayerNorm(
                    [tgt_dim, self.cfg.grid_size[0], self.cfg.grid_size[1]]),
                nn.LeakyReLU(0.2, inplace=True),
                conv2d(tgt_dim,
                       tgt_dim // 2,
                       kernel_size=3,
                       stride=1,
                       padding=1,
                       bias=False),
                nn.LayerNorm([
                    tgt_dim // 2, self.cfg.grid_size[0], self.cfg.grid_size[1]
                ]),
                nn.LeakyReLU(0.2, inplace=True),
                conv2d(tgt_dim // 2,
                       tgt_dim // 2,
                       kernel_size=3,
                       stride=1,
                       padding=1,
                       bias=False),
                nn.LayerNorm([
                    tgt_dim // 2, self.cfg.grid_size[0], self.cfg.grid_size[1]
                ]),
                nn.LeakyReLU(0.2, inplace=True),
                conv2d(tgt_dim // 2,
                       output_dim,
                       kernel_size=3,
                       stride=1,
                       padding=1),
            )
        else:
            self.decoder = nn.Sequential(
                conv2d(input_dim, tgt_dim, kernel_size=3, stride=1, padding=1),
                nn.LeakyReLU(0.2, inplace=True),
                conv2d(tgt_dim,
                       tgt_dim // 2,
                       kernel_size=3,
                       stride=1,
                       padding=1),
                nn.LeakyReLU(0.2, inplace=True),
                conv2d(tgt_dim // 2,
                       tgt_dim // 2,
                       kernel_size=3,
                       stride=1,
                       padding=1),
                nn.LeakyReLU(0.2, inplace=True),
                conv2d(tgt_dim // 2,
                       output_dim,
                       kernel_size=3,
                       stride=1,
                       padding=1),
            )

        self.init_weights()
Example #15
    def __init__(self, config):
        super(WhatDecoder, self).__init__()

        self.cfg = config
        # whether to use separable conv2d
        if self.cfg.use_separable_convolution:
            conv2d = separable_conv2d
        else:
            conv2d = nn.Conv2d
        #################################################################
        # Dimensions for the recurrent model
        #################################################################
        # If the encoder is bidirectional, double the size of the hidden state
        factor = 2 if config.bidirectional else 1
        src_dim = factor * config.n_src_hidden
        tgt_dim = factor * config.n_tgt_hidden

        emb_dim = config.n_embed
        bgf_dim = config.n_conv_hidden
        fgf_dim = config.output_vocab_size

        #################################################################
        # Conv RNN
        #################################################################
        input_dim = bgf_dim
        if config.use_fg_to_pred == 1:
            # use prev_fg_onehot as input, seems not working
            input_dim += fgf_dim
        rnn_cell = config.rnn_cell.lower()
        if rnn_cell == 'gru':
            self.rnn = ConvGRU(input_dim,
                               tgt_dim,
                               config.n_rnn_layers,
                               3,
                               bias=True,
                               dropout=config.rnn_dropout_p)
        elif rnn_cell == 'lstm':
            self.rnn = ConvLSTM(input_dim,
                                tgt_dim,
                                config.n_rnn_layers,
                                3,
                                bias=True,
                                dropout=config.rnn_dropout_p)
        else:
            raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))

        #################################################################
        # Spatial attention
        #################################################################
        if self.cfg.what_attn_2d:
            attn2d_layers = []
            if self.cfg.use_normalization:
                attn2d_layers.append(
                    conv2d(tgt_dim,
                           tgt_dim // 2,
                           kernel_size=3,
                           stride=1,
                           padding=1,
                           bias=False))
                attn2d_layers.append(
                    nn.LayerNorm([
                        tgt_dim // 2, self.cfg.grid_size[0],
                        self.cfg.grid_size[1]
                    ]))
            else:
                attn2d_layers.append(
                    conv2d(tgt_dim,
                           tgt_dim // 2,
                           kernel_size=3,
                           stride=1,
                           padding=1,
                           bias=True))
            attn2d_layers.append(nn.LeakyReLU(0.2, inplace=True))
            attn2d_layers.append(
                conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1))
            self.spatial_attn = nn.Sequential(*attn2d_layers)

        #################################################################
        # Attention
        #################################################################
        if self.cfg.what_attn:
            in_dim = tgt_dim
            out_dim = src_dim
            if self.cfg.attn_emb:
                # whether to include the language embedding vector as output
                out_dim += emb_dim
            # if self.cfg.use_bg_to_pred:
            #     in_dim += bgf_dim
            if self.cfg.use_fg_to_pred == 2:
                in_dim += fgf_dim
            self.attention = Attention(config.attn_type, out_dim, in_dim)

        #################################################################
        # Object decoder
        #################################################################
        input_dim = tgt_dim
        if self.cfg.what_attn:
            input_dim += src_dim
            if self.cfg.attn_emb:
                input_dim += emb_dim
        if self.cfg.use_bg_to_pred:
            input_dim += bgf_dim
        if self.cfg.use_fg_to_pred == 2:
            input_dim += fgf_dim

        hidden_dim = tgt_dim
        self.decoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=True),
            nn.LeakyReLU(0.2, inplace=True), nn.Linear(hidden_dim, fgf_dim))

        self.init_weights()