def __init__(self, config):
    super(Net, self).__init__()
    [self.d_l, self.d_a, self.d_v] = config['input_dims']
    [self.dh_l, self.dh_a, self.dh_v] = config["h_dims"]
    self.final_dims = config["final_dims"]
    self.h_dim = config["h_dim"]
    self.wordLSTM = nn.LSTM(self.d_l, self.dh_l, bidirectional=False)
    self.covarepLSTM = nn.LSTM(self.d_a, self.dh_a, bidirectional=False)
    self.facetLSTM = nn.LSTM(self.d_v, self.dh_v, bidirectional=False)
    self.coattention = Coattention(self.dh_l, self.dh_a)
    self.coattention1 = Coattention(self.dh_l, self.dh_v)
    self.coattention2 = Coattention(self.dh_v, self.dh_a)
    self.aggregateLSTM = nn.LSTM(self.dh_a * 2 + self.dh_v * 2 + self.dh_l,
                                 self.h_dim, bidirectional=False, batch_first=True)
    self.dropout1 = nn.Dropout(config["dropout1"])
    self.dropout2 = nn.Dropout(config["dropout2"])
    self.wordLSTMlinear = nn.Linear(self.dh_l, self.dh_l)
    self.wordLSTMfinal = nn.Linear(self.dh_l, 1)
    self.attention = Attention(self.dh_l)
    self.outputlinear = nn.Linear(self.h_dim * 2, self.final_dims)
    self.finallinear = nn.Linear(self.final_dims, 1)
def test_attention(config):
    attention = Attention(config, config.attn_type, 1024, 1024)
    h_s = torch.randn(7, 36, 1024)
    h_t = torch.randn(7, 5, 1024)
    m_s = torch.randn(7, 36).random_(0, 2)
    context, scores = attention(h_t, h_s, m_s)
    print(context.size(), scores.size())
def __init__(self, config, embedding):
    super(RNNDecoder, self).__init__()
    # embedding
    self.embedding = embedding
    self.embedding_size = embedding.embedding_dim
    # dropout
    self.dropout = nn.Dropout(config.dropout)
    self.rnn = nn.GRU(input_size=self.embedding_size,
                      hidden_size=config.hidden_size,
                      num_layers=config.dec_num_layers,
                      dropout=config.dropout)
    init_gru_orth(self.rnn)
    self.enc_attn = Attention(config.hidden_size)
    self.linear = nn.Linear(config.hidden_size, config.vocab_size)
    init_linear_wt(self.linear)
    if config.tied:
        self.linear.weight = self.embedding.weight
def test_attention(config):
    attention = Attention(config.attn_type, 256, 128)
    h_s = torch.randn(5, 6, 256)
    h_t = torch.randn(5, 5, 128)
    src_mask = torch.randn(5, 6).random_(0, 2)
    context, scores = attention(h_t, h_s, src_mask)
    print('context.size()', context.size())
    print('scores.size()', scores.size())
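# A minimal sketch of the kind of module the two test_attention functions above exercise. This is
# an assumption, not the repository's actual Attention class: it implements simple "general"
# (bilinear) attention with the same call signature, Attention(attn_type, src_dim, tgt_dim),
# invoked as attention(h_t, h_s, mask) and returning (context, scores).
import torch
import torch.nn as nn
import torch.nn.functional as F


class GeneralAttentionSketch(nn.Module):
    def __init__(self, attn_type, src_dim, tgt_dim):
        super(GeneralAttentionSketch, self).__init__()
        self.attn_type = attn_type
        # project the target (query) states into the source space for a bilinear score
        self.linear_in = nn.Linear(tgt_dim, src_dim, bias=False)

    def forward(self, h_t, h_s, src_mask=None):
        # h_t: (batch, tgt_len, tgt_dim), h_s: (batch, src_len, src_dim)
        query = self.linear_in(h_t)                      # (batch, tgt_len, src_dim)
        scores = torch.bmm(query, h_s.transpose(1, 2))   # (batch, tgt_len, src_len)
        if src_mask is not None:
            # mask padded source positions so the softmax assigns them zero weight
            scores = scores.masked_fill(src_mask.unsqueeze(1) == 0, float('-inf'))
        scores = F.softmax(scores, dim=-1)
        context = torch.bmm(scores, h_s)                 # (batch, tgt_len, src_dim)
        return context, scores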
def build_baseline(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    fusion = FCNet([num_hid, num_hid * 2], dropout=0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, fusion, num_hid,
                     dataset.num_ans_candidates)
def __init__(self, input_scheme, args):
    super(ATTRNNAgent, self).__init__()
    self.args = args
    fixed_inputs = []
    var_inputs = []
    idx = 0
    len_fixed = 0
    split = []
    for part in input_scheme:
        if type(part) == int:
            # part: len
            fixed_inputs.append((idx, part))
            idx += part
            len_fixed += part
            split.append(part)
        else:
            # part: len * n
            var_inputs.append((idx, part[0], part[1]))
            idx += part[0] * part[1]
            split.append(part[0] * part[1])
    attns = []
    vfc1s = []
    vfc2s = []
    n_var = len(var_inputs)
    len_attn = 0
    ffc1 = nn.Linear(len_fixed, args.attn_hidden_dim)
    for i in range(n_var):
        vfc1s.append(nn.Linear(var_inputs[i][1], args.attn_hidden_dim))
        attns.append(Attention(args.attn_hidden_dim, args.attn_hidden_dim,
                               args.attn_hidden_dim, args.attn_n_heads))
        # print(var_inputs[i][1])
        vfc2s.append(nn.Linear(args.attn_hidden_dim * args.attn_n_heads,
                               args.attn_hidden_dim))
        len_attn += args.attn_hidden_dim
    ffc2 = nn.Linear(args.attn_hidden_dim, args.attn_hidden_dim)
    len_attn += args.attn_hidden_dim
    self.split = split
    self.input_scheme = input_scheme
    self.attns = nn.ModuleList(attns)
    self.vfc1s = nn.ModuleList(vfc1s)
    self.vfc2s = nn.ModuleList(vfc2s)
    self.ffc1 = ffc1
    self.ffc2 = ffc2
    self.fc1 = nn.Linear(len_attn, args.rnn_hidden_dim)
    if args.use_rnn:
        self.rnn = nn.GRUCell(args.rnn_hidden_dim, args.rnn_hidden_dim)
    else:
        self.rnn = None
    # print(args.n_actions)
    self.fc2 = nn.Linear(args.rnn_hidden_dim, args.n_actions)
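# Hypothetical illustration (not from the repository) of how the input_scheme parsing above splits a
# flat observation: an int entry is a fixed-length segment, while a (length, n) tuple entry is a
# variable segment of n entities with `length` features each, attended over separately.
example_scheme = [8, (5, 3)]  # assumed example: 8 fixed features plus 3 entities of 5 features each
idx, len_fixed = 0, 0
fixed_inputs, var_inputs, split = [], [], []
for part in example_scheme:
    if type(part) == int:
        fixed_inputs.append((idx, part))
        idx += part
        len_fixed += part
        split.append(part)
    else:
        var_inputs.append((idx, part[0], part[1]))
        idx += part[0] * part[1]
        split.append(part[0] * part[1])
print(split, len_fixed, var_inputs)  # [8, 15] 8 [(8, 5, 3)]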
def build_baseline(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    a_mask = AnswerMask(num_hid, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, a_mask)
def __init__(self, embedding, training):
    super(Decoder, self).__init__()
    self._config = DecoderConfig()
    self._training = training
    self._key_size = EncoderConfig().hidden_size
    self._value_size = self._key_size
    self._context_size = AttentionConfig().context_size
    self._embedding = embedding
    self._forward_step = {
        'bahdanau': self._bahdanau_step,
        'luong': self._luong_step
    }[self._config.attention_mechanism]
    self._attention = Attention(query_size=self._config.hidden_size,
                                key_size=self._key_size,
                                value_size=self._value_size)
    rnn_input_size = {
        'bahdanau': self._embedding.embedding_size + self._context_size,
        'luong': self._embedding.embedding_size
    }[self._config.attention_mechanism]
    mlp_input_size = {
        'bahdanau': self._embedding.embedding_size + self._config.hidden_size + self._context_size,
        'luong': self._config.hidden_size + self._context_size
    }[self._config.attention_mechanism]
    self._rnn = Decoder.RNN_FACTORY[self._config.rnn_type](
        input_size=rnn_input_size,
        hidden_size=self._config.hidden_size,
        num_layers=self._config.num_layers,
        bidirectional=False,
        bias=self._config.rnn_bias,
        dropout=(self._config.rnn_dropout_probability
                 if self._training and self._config.rnn_dropout_enabled else 0))
    self._mlp = nn.Sequential(
        nn.Linear(mlp_input_size, self._config.hidden_size),
        nn.Tanh(),
        nn.Linear(self._config.hidden_size, self._embedding.vocabulary.size))
    self._teacher_forcing = TeacherForcing()
def __init__(self, params):
    super(RNNAutoEncoder, self).__init__()
    self.params = params
    self.embedding = nn.Embedding(self.params["vocab_size"],
                                  self.params["embedding_size"])
    nn.init.kaiming_uniform_(self.embedding.weight, mode='fan_in')
    self.dropout_emb = nn.Dropout(0.3)
    self.embedding_full = nn.Sequential(self.embedding, self.dropout_emb)
    self.encoder = Encoder(self.params, self.embedding_full)
    for name, p in self.encoder.named_parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        else:
            p.data.uniform_(-0.1, 0.1)
        if "input_to_logvar.bias" in name:
            p.data.uniform_(-10.0, -5.0)
    self.dropout_decoder = nn.Dropout(self.params["dropout"])
    self.decoders = nn.ModuleList()
    for i in range(self.params["nb_decoders"]):
        decoder = nn.LSTM(input_size=self.params["embedding_size"],
                          hidden_size=self.params["decoder_hidden_size"],
                          num_layers=self.params["num_decoder_layers"],
                          batch_first=True,
                          bidirectional=False,
                          dropout=self.params["dropout"])
        for p in decoder.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
            else:
                p.data.uniform_(-0.1, 0.1)
        self.decoders.append(decoder)
    self.attention = Attention(self.params["decoder_hidden_size"])
    self.normalizer = nn.Linear(self.params["decoder_hidden_size"],
                                self.params["vocab_size"])
    self.normalizer.weight.data.uniform_(-0.1, 0.1)
    self.train()
    self.step = 0
    self.to_track = -1
    self.offset = 0
def __init__(self, config):
    super(WhereDecoder, self).__init__()
    self.cfg = config

    #################################################################
    # Dimensions
    #################################################################
    # If the encoder is bidirectional, double the size of the hidden state
    factor = 2 if config.bidirectional else 1
    src_dim = factor * config.n_src_hidden
    tgt_dim = factor * config.n_tgt_hidden
    emb_dim = config.n_embed
    bgf_dim = config.n_conv_hidden
    fgf_dim = config.output_cls_size

    #################################################################
    # Attention
    #################################################################
    if self.cfg.what_attn and self.cfg.where_attn > 0:
        in_dim = fgf_dim
        out_dim = src_dim
        if self.cfg.attn_emb:
            out_dim += emb_dim
        if self.cfg.where_attn == 2:
            in_dim += out_dim
        self.attention = Attention(config.attn_type, out_dim, in_dim)
        # print('in_dim', in_dim)
        # print('out_dim', out_dim)
        if self.cfg.where_attn_2d:
            in_dim_2d = out_dim + tgt_dim
            if self.cfg.use_bn:
                self.spatial_attn = nn.Sequential(
                    nn.Conv2d(in_dim_2d, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
                    nn.BatchNorm2d(tgt_dim // 2),
                    nn.ReLU(True),
                    nn.Conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1),
                )
            else:
                self.spatial_attn = nn.Sequential(
                    nn.Conv2d(in_dim_2d, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
                    # nn.BatchNorm2d(tgt_dim // 2),
                    nn.ReLU(True),
                    nn.Conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1),
                )

    #################################################################
    # Location decoder
    #################################################################
    if self.cfg.what_attn:
        input_dim = tgt_dim + fgf_dim + src_dim
        if self.cfg.attn_emb:
            input_dim += emb_dim
    else:
        input_dim = tgt_dim + fgf_dim
    if self.cfg.use_bg_to_locate:
        input_dim += bgf_dim
    # print('input_dim', input_dim)

    if config.use_bn:
        self.decoder = nn.Sequential(
            nn.Conv2d(input_dim, tgt_dim, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(tgt_dim),
            nn.ReLU(True),
            nn.Conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(tgt_dim // 2),
            nn.ReLU(True),
            nn.Conv2d(tgt_dim // 2, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(tgt_dim // 2),
            nn.ReLU(True),
            nn.Conv2d(tgt_dim // 2, 18, kernel_size=3, stride=1, padding=1),
        )
    else:
        self.decoder = nn.Sequential(
            nn.Conv2d(input_dim, tgt_dim, kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(tgt_dim),
            nn.ReLU(True),
            nn.Conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(tgt_dim // 2),
            nn.ReLU(True),
            nn.Conv2d(tgt_dim // 2, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(tgt_dim // 2),
            nn.ReLU(True),
            nn.Conv2d(tgt_dim // 2, 18, kernel_size=3, stride=1, padding=1),
        )
def __init__(self, config):
    super(WhatDecoder, self).__init__()
    self.cfg = config

    #################################################################
    # Dimensions
    #################################################################
    # If the encoder is bidirectional, double the size of the hidden state
    factor = 2 if config.bidirectional else 1
    src_dim = factor * config.n_src_hidden
    tgt_dim = factor * config.n_tgt_hidden
    emb_dim = config.n_embed
    bgf_dim = config.n_conv_hidden
    fgf_dim = config.output_cls_size

    #################################################################
    # Conv RNN
    #################################################################
    input_dim = bgf_dim
    if config.use_fg_to_pred == 1:
        # use prev_fg_onehot (previous foreground object label) as input for current object prediction
        input_dim += fgf_dim

    rnn_cell = config.rnn_cell.lower()
    if rnn_cell == 'gru':
        self.rnn = ConvGRU(input_dim, tgt_dim, config.n_rnn_layers, 3,
                           bias=True, dropout=config.rnn_dropout_p)
    elif rnn_cell == 'lstm':
        self.rnn = ConvLSTM(input_dim, tgt_dim, config.n_rnn_layers, 3,
                            bias=True, dropout=config.rnn_dropout_p)
    else:
        raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))

    #################################################################
    # Spatial attention
    #################################################################
    if self.cfg.attn_2d:
        if self.cfg.use_bn:
            self.spatial_attn = nn.Sequential(
                nn.Conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(tgt_dim // 2),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1),
            )
        else:
            self.spatial_attn = nn.Sequential(
                nn.Conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
                # nn.BatchNorm2d(tgt_dim // 2),
                nn.ReLU(True),
                nn.Conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1),
            )

    #################################################################
    # Attention
    #################################################################
    if self.cfg.what_attn:
        in_dim = tgt_dim
        out_dim = src_dim
        if self.cfg.attn_emb:
            out_dim += emb_dim
        # if self.cfg.use_bg_to_pred:
        #     in_dim += bgf_dim
        if self.cfg.use_fg_to_pred == 2:
            in_dim += fgf_dim
        self.attention = Attention(config.attn_type, out_dim, in_dim)

    #################################################################
    # Segment pooling
    #################################################################
    if self.cfg.hidden_pooling_mode == 0:
        self.seg_pool = nn.AvgPool1d(3)
    else:
        self.seg_pool = nn.MaxPool1d(3)

    #################################################################
    # Object decoder
    #################################################################
    input_dim = tgt_dim
    if self.cfg.what_attn:
        input_dim += src_dim
        if self.cfg.attn_emb:
            input_dim += emb_dim
    if self.cfg.use_bg_to_pred:
        input_dim += bgf_dim
    if self.cfg.use_fg_to_pred == 2:
        input_dim += fgf_dim
    hidden_dim = tgt_dim
    self.decoder = nn.Sequential(
        nn.Linear(input_dim, hidden_dim, bias=True),
        # nn.BatchNorm1d(hidden_dim),
        nn.ReLU(True),
        nn.Linear(hidden_dim, fgf_dim))
class DecoderRNN(nn.Module):
    def __init__(self, params, vocab_size, embed_size, hidden_size, *,
                 enc_attn=True, dec_attn=True, enc_attn_cover=True, pointer=True,
                 tied_embedding=None, out_embed_size=None,
                 in_drop: float = 0, rnn_drop: float = 0, out_drop: float = 0,
                 enc_hidden_size=None, enc_attn_temporal=None, attn_func_name=None):
        super(DecoderRNN, self).__init__()
        self.params = params
        self.vocab_size = vocab_size
        # decoder hidden size; if not specified, it is twice the encoder hidden size
        self.hidden_size = hidden_size
        self.combined_size = self.hidden_size
        self.attn_func_name = attn_func_name
        self.enc_attn = enc_attn
        self.enc_attn_temporal = enc_attn_temporal
        self.dec_attn = dec_attn
        self.enc_attn_cover = enc_attn_cover
        self.pointer = pointer
        self.out_embed_size = out_embed_size
        # the output word embeddings are tied to the input ones
        if tied_embedding is not None and self.out_embed_size and embed_size != self.out_embed_size:
            print("Warning: Output embedding size %d is overridden by its tied embedding size %d."
                  % (self.out_embed_size, embed_size))
            # use the input embed_size directly as out_embed_size
            self.out_embed_size = embed_size

        self.in_drop = nn.Dropout(in_drop) if in_drop > 0 else None
        self.gru = nn.GRU(embed_size, self.hidden_size, dropout=rnn_drop)

        if enc_attn:
            if not enc_hidden_size:
                enc_hidden_size = self.hidden_size
            # if attn_func_name.lower() == "tanh":
            #     self.enc_attn_func =
            # elif attn_func_name.lower() == "bilinear":
            #     self.enc_attn_func = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
            #     # self.enc_bilinear = nn.Bilinear(self.hidden_size, enc_hidden_size, 1)
            # else:
            #     print("Attention function should be tanh or bilinear!")
            #     exit(0)
            # hidden_size : decoder hidden size
            # enc_hidden_size : encoder total size
            print("Using {} attention.".format(params.attn_func_name))
            self.attn = Attention(method=params.attn_func_name,
                                  batch_size=params.batch_size,
                                  decoder_hidden_size=hidden_size,
                                  encoder_total_size=enc_hidden_size,
                                  encoder_hidden_size=params.hidden_size)
            self.combined_size += enc_hidden_size  # decoder hidden + encoder hidden
            if enc_attn_cover:
                self.cover_weight = nn.Parameter(torch.rand(1))

        if dec_attn:
            self.dec_bilinear = nn.Bilinear(self.hidden_size, self.hidden_size, 1)
            self.combined_size += self.hidden_size  # decoder hidden + decoder hidden

        self.out_drop = nn.Dropout(out_drop) if out_drop > 0 else None

        if pointer:
            # computes the copy probability
            self.ptr = nn.Linear(self.combined_size, 1)

        if tied_embedding is not None and embed_size != self.combined_size:
            # use a pre_out layer if the combined size differs from the embedding size
            self.out_embed_size = embed_size

        if self.out_embed_size:  # use pre_out layer
            self.pre_out = nn.Linear(self.combined_size, self.out_embed_size)
            size_before_output = self.out_embed_size
        else:  # don't use pre_out layer
            size_before_output = self.combined_size

        self.out = nn.Linear(size_before_output, vocab_size)
        if tied_embedding is not None:
            self.out.weight = tied_embedding.weight

    def forward(self, embedded, hidden, encoder_states=None, decoder_states=None,
                coverage_vector=None, *, encoder_word_idx=None,
                ext_vocab_size: int = None, log_prob: bool = True, mask=None):
        """
        Perform single-step decoding.

        :param embedded: (batch size, embed size); words are fed in one at a time
        :param hidden: (1, batch size, decoder hidden size), the encoder output
        :param encoder_states: (src seq len, batch size, hidden size), for attention mechanism
        :param decoder_states: (past dec steps, batch size, hidden size), for attention mechanism
        :param encoder_word_idx: (src seq len, batch size), for pointer network
        :param ext_vocab_size: the dynamic vocab size, determined by the max num of OOV words
                               contained in any src seq in this batch, for pointer network
        :param log_prob: return log probability instead of probability
        :return: tuple of four things:
                 1. word prob or log word prob, (batch size, dynamic vocab size);
                 2. RNN hidden state after this step, (1, batch size, decoder hidden size);
                 3. attention weights over encoder states, (batch size, src seq len);
                 4. prob of copying by pointing as opposed to generating, (batch size, 1)
        """
        batch_size = embedded.size(0)  # (batch size, embed size): the first word of each sentence
        # buffer that combines the hidden state, context vectors, etc.
        combined = torch.zeros(batch_size, self.combined_size, device=DEVICE)
        if self.params.debug:
            print("combined size:{}".format(combined.size()))

        if self.in_drop:  # embedding dropout
            embedded = self.in_drop(embedded)

        # (1, batch size, embed size); unsqueeze and squeeze are necessary
        output, hidden = self.gru(embedded.unsqueeze(0), hidden)
        if self.params.debug:
            print("Decoder output:")
            print("  Output:{}".format(output.size()))  # (1, batch size, embed size)
            print("  Hidden:{}".format(hidden.size()))  # (1, batch size, embed size)

        # hidden: (1, batch, hidden)
        # (batch, hidden), as the RNN expects a 3D tensor (step=1)
        combined[:, :self.hidden_size] = output.squeeze(0)
        # offset for the next slice of the combined vector
        offset = self.hidden_size
        enc_attn, prob_ptr = None, None  # for visualization

        # enc_attn: yields the context vector; the generation probability can be computed from the
        #           context vector alone without copying, so it does not require the pointer
        # pointer: the copy mechanism; copying always needs the context vector
        if self.enc_attn or self.pointer:
            # energy and attention: (num encoder states, batch size, 1)
            # encoder_states is the encoder output (src_len, batch, hidden)
            num_enc_steps = encoder_states.size(0)   # number of states to attend over
            enc_total_size = encoder_states.size(2)  # their dimensionality
            if self.params.debug:
                print("num_enc_steps:{}".format(num_enc_steps))
                print("enc_total_size:{}".format(enc_total_size))

            enc_attn = self.attn.forward(hidden, encoder_states, mask)
            # # compute attention weights from the current hidden state
            # # expand hidden
            # # enc_energy: (src_len, batch, 1)
            # enc_energy = self.enc_attn_func(hidden.expand(num_enc_steps, batch_size, -1).contiguous(),
            #                                 encoder_states)
            # if self.params.debug:
            #     print("enc_energy:{}".format(enc_energy))
            #     print("enc_energy size:{}".format(enc_energy.size()))
            # # use coverage
            # # following the paper, the attention here should also account for previously computed attn
            # # if self.enc_attn_cover and self.enc_attn_temporal and coverage_vector is not None:
            # #     if self.params.debug:
            # #         print("cover_weight:{}".format(self.cover_weight))
            # #     enc_energy += self.cover_weight * torch.log(coverage_vector.transpose(0, 1).unsqueeze(2) + eps)
            # # transpose => (batch size, num encoder states, 1)
            # enc_attn = F.softmax(enc_energy, dim=0).transpose(0, 1)
            if self.params.debug:
                print("enc_attn:{}".format(enc_attn))
                print("enc_attn size:{}".format(enc_attn.size()))  # (batch, src_len, 1)

            # get the context vector
            if self.enc_attn:
                # context: (batch size, encoder hidden size, 1)
                enc_context = torch.bmm(encoder_states.permute(1, 2, 0), enc_attn)
                if self.params.debug:
                    print("enc_context size:{}".format(enc_context.size()))  # (batch, hidden, 1)
                combined[:, offset:offset + enc_total_size] = enc_context.squeeze(2)
                offset += enc_total_size
            enc_attn = enc_attn.squeeze(2)

        if self.dec_attn:
            if decoder_states is not None and len(decoder_states) > 0:
                dec_energy = self.dec_bilinear(hidden.expand_as(decoder_states).contiguous(),
                                               decoder_states)
                dec_attn = F.softmax(dec_energy, dim=0).transpose(0, 1)
                dec_context = torch.bmm(decoder_states.permute(1, 2, 0), dec_attn)
                combined[:, offset:offset + self.hidden_size] = dec_context.squeeze(2)
            offset += self.hidden_size

        if self.out_drop:
            combined = self.out_drop(combined)

        # generator
        if self.out_embed_size:
            # project to an intermediate output dimension before mapping to the vocabulary
            out_embed = self.pre_out(combined)
        else:
            out_embed = combined
        # project to the vocabulary dimension
        logits = self.out(out_embed)  # (batch size, vocab size)

        # pointer
        if self.pointer:
            output = torch.zeros(batch_size, ext_vocab_size, device=DEVICE)
            # distribute probabilities between generator and pointer
            prob_ptr = torch.sigmoid(self.ptr(combined))  # (batch size, 1)
            prob_gen = 1 - prob_ptr
            # add generator probabilities to output
            gen_output = F.softmax(logits, dim=1)  # can't use log_softmax due to adding probabilities
            output[:, :self.vocab_size] = prob_gen * gen_output
            # add pointer probabilities to output
            ptr_output = enc_attn  # (batch, src_len)
            # encoder_word_idx: encoder input tensor (src_len, batch)
            # encoder_word_idx.transpose(0, 1) => (batch, src_len)
            # The first vocab_size entries of output already hold the generation probabilities;
            # scatter_add_ adds each source position's copy probability onto its (possibly
            # extended-vocab) word index, so in-vocab words accumulate both terms and OOV words
            # receive only the copy mass.
            output.scatter_add_(1, encoder_word_idx.transpose(0, 1), prob_ptr * ptr_output)
            if self.params.debug:
                print("output size:{}".format(output.size()))
            if log_prob:
                output = torch.log(output + eps)
        else:
            if log_prob:
                output = F.log_softmax(logits, dim=1)
            else:
                output = F.softmax(logits, dim=1)

        # output   (batch, ext_vocab_size)
        # hidden   (1, batch, hidden)
        # enc_attn (batch size, src_len)
        # prob_ptr (batch size, 1)
        return output, hidden, enc_attn, prob_ptr
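# A standalone, minimal sketch (hypothetical shapes and values, not part of the model above) of the
# pointer-generator mixing step at the end of DecoderRNN.forward: the generation distribution fills
# the first vocab_size slots of an extended-vocabulary output, and scatter_add_ adds each source
# position's copy probability onto the index of the word at that position.
import torch
import torch.nn.functional as F

batch_size, src_len, vocab_size, ext_vocab_size = 2, 4, 6, 8
logits = torch.randn(batch_size, vocab_size)                           # generator logits
enc_attn = F.softmax(torch.randn(batch_size, src_len), dim=1)          # attention over source positions
prob_ptr = torch.sigmoid(torch.randn(batch_size, 1))                   # probability of copying
encoder_word_idx = torch.randint(0, ext_vocab_size, (batch_size, src_len))  # (batch, src_len)

output = torch.zeros(batch_size, ext_vocab_size)
output[:, :vocab_size] = (1 - prob_ptr) * F.softmax(logits, dim=1)     # generation mass
output.scatter_add_(1, encoder_word_idx, prob_ptr * enc_attn)          # copy mass
print(output.sum(dim=1))  # each row sums to 1: a valid distribution over the extended vocabulary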
def __init__(self, config):
    super(WhereDecoder, self).__init__()
    self.cfg = config

    if self.cfg.use_separable_convolution:
        conv2d = separable_conv2d
    else:
        conv2d = nn.Conv2d

    #################################################################
    # Dimensions
    #################################################################
    # If the encoder is bidirectional, double the size of the hidden state
    factor = 2 if config.bidirectional else 1
    src_dim = factor * config.n_src_hidden
    tgt_dim = factor * config.n_tgt_hidden
    emb_dim = config.n_embed
    bgf_dim = config.n_conv_hidden
    fgf_dim = config.output_vocab_size

    #################################################################
    # Attention
    #################################################################
    if self.cfg.what_attn and self.cfg.where_attn > 0:
        in_dim = fgf_dim
        out_dim = src_dim
        if self.cfg.attn_emb:
            out_dim += emb_dim
        if self.cfg.where_attn == 2:
            in_dim += out_dim
        self.attention = Attention(config.attn_type, out_dim, in_dim)
        if self.cfg.where_attn_2d:
            in_dim_2d = out_dim + tgt_dim
            attn2d_layers = []
            if self.cfg.use_normalization:
                attn2d_layers.append(
                    conv2d(in_dim_2d, tgt_dim // 2, kernel_size=3, stride=1, padding=1, bias=False))
                attn2d_layers.append(
                    nn.LayerNorm([tgt_dim // 2, self.cfg.grid_size[0], self.cfg.grid_size[1]]))
            else:
                attn2d_layers.append(
                    conv2d(in_dim_2d, tgt_dim // 2, kernel_size=3, stride=1, padding=1, bias=True))
            attn2d_layers.append(nn.LeakyReLU(0.2, inplace=True))
            attn2d_layers.append(
                conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1))
            self.spatial_attn = nn.Sequential(*attn2d_layers)

    #################################################################
    # Location decoder
    #################################################################
    input_dim = tgt_dim + fgf_dim
    if self.cfg.what_attn:
        input_dim += src_dim
        if self.cfg.attn_emb:
            input_dim += emb_dim
    if self.cfg.use_bg_to_locate:
        input_dim += bgf_dim
    output_dim = 1 + self.cfg.num_scales + self.cfg.num_ratios + self.cfg.n_patch_features

    if config.use_normalization:
        self.decoder = nn.Sequential(
            conv2d(input_dim, tgt_dim, kernel_size=3, stride=1, padding=1, bias=False),
            nn.LayerNorm([tgt_dim, self.cfg.grid_size[0], self.cfg.grid_size[1]]),
            nn.LeakyReLU(0.2, inplace=True),
            conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1, bias=False),
            nn.LayerNorm([tgt_dim // 2, self.cfg.grid_size[0], self.cfg.grid_size[1]]),
            nn.LeakyReLU(0.2, inplace=True),
            conv2d(tgt_dim // 2, tgt_dim // 2, kernel_size=3, stride=1, padding=1, bias=False),
            nn.LayerNorm([tgt_dim // 2, self.cfg.grid_size[0], self.cfg.grid_size[1]]),
            nn.LeakyReLU(0.2, inplace=True),
            conv2d(tgt_dim // 2, output_dim, kernel_size=3, stride=1, padding=1),
        )
    else:
        self.decoder = nn.Sequential(
            conv2d(input_dim, tgt_dim, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            conv2d(tgt_dim // 2, tgt_dim // 2, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            conv2d(tgt_dim // 2, output_dim, kernel_size=3, stride=1, padding=1),
        )

    self.init_weights()
def __init__(self, config):
    super(WhatDecoder, self).__init__()
    self.cfg = config

    # whether to use separable conv2d
    if self.cfg.use_separable_convolution:
        conv2d = separable_conv2d
    else:
        conv2d = nn.Conv2d

    #################################################################
    # Dimensions for the recurrent model
    #################################################################
    # If the encoder is bidirectional, double the size of the hidden state
    factor = 2 if config.bidirectional else 1
    src_dim = factor * config.n_src_hidden
    tgt_dim = factor * config.n_tgt_hidden
    emb_dim = config.n_embed
    bgf_dim = config.n_conv_hidden
    fgf_dim = config.output_vocab_size

    #################################################################
    # Conv RNN
    #################################################################
    input_dim = bgf_dim
    if config.use_fg_to_pred == 1:
        # use prev_fg_onehot as input, seems not working
        input_dim += fgf_dim

    rnn_cell = config.rnn_cell.lower()
    if rnn_cell == 'gru':
        self.rnn = ConvGRU(input_dim, tgt_dim, config.n_rnn_layers, 3,
                           bias=True, dropout=config.rnn_dropout_p)
    elif rnn_cell == 'lstm':
        self.rnn = ConvLSTM(input_dim, tgt_dim, config.n_rnn_layers, 3,
                            bias=True, dropout=config.rnn_dropout_p)
    else:
        raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))

    #################################################################
    # Spatial attention
    #################################################################
    if self.cfg.what_attn_2d:
        attn2d_layers = []
        if self.cfg.use_normalization:
            attn2d_layers.append(
                conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1, bias=False))
            attn2d_layers.append(
                nn.LayerNorm([tgt_dim // 2, self.cfg.grid_size[0], self.cfg.grid_size[1]]))
        else:
            attn2d_layers.append(
                conv2d(tgt_dim, tgt_dim // 2, kernel_size=3, stride=1, padding=1, bias=True))
        attn2d_layers.append(nn.LeakyReLU(0.2, inplace=True))
        attn2d_layers.append(
            conv2d(tgt_dim // 2, 1, kernel_size=3, stride=1, padding=1))
        self.spatial_attn = nn.Sequential(*attn2d_layers)

    #################################################################
    # Attention
    #################################################################
    if self.cfg.what_attn:
        in_dim = tgt_dim
        out_dim = src_dim
        if self.cfg.attn_emb:
            # whether to include the language embedding vector as output
            out_dim += emb_dim
        # if self.cfg.use_bg_to_pred:
        #     in_dim += bgf_dim
        if self.cfg.use_fg_to_pred == 2:
            in_dim += fgf_dim
        self.attention = Attention(config.attn_type, out_dim, in_dim)

    #################################################################
    # Object decoder
    #################################################################
    input_dim = tgt_dim
    if self.cfg.what_attn:
        input_dim += src_dim
        if self.cfg.attn_emb:
            input_dim += emb_dim
    if self.cfg.use_bg_to_pred:
        input_dim += bgf_dim
    if self.cfg.use_fg_to_pred == 2:
        input_dim += fgf_dim
    hidden_dim = tgt_dim
    self.decoder = nn.Sequential(
        nn.Linear(input_dim, hidden_dim, bias=True),
        nn.LeakyReLU(0.2, inplace=True),
        nn.Linear(hidden_dim, fgf_dim))

    self.init_weights()