def forward(self, x, encoder_padding_mask): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. Returns: encoded output of shape `(batch, src_len, embed_dim)` """ residual = x x = self.maybe_layer_norm(0, x, before=True) x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(0, x, after=True) residual = x x = self.maybe_layer_norm(1, x, before=True) x = F.relu(self.fc1(x)) x = F.dropout(x, p=self.relu_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(1, x, after=True) return x
def forward(self, input, format='index'):
    if format == 'onehot':
        out = F.dropout(self.Linear(input), self.d, training=self.training)
    elif format == 'index':
        out = F.dropout(self.word_embed(input), self.d, training=self.training)
    else:
        raise ValueError("format must be 'onehot' or 'index', got %r" % format)
    return out
def forward(self, x):
    x = F.relu(self.linear1(x))
    x = F.dropout(x, 0.8, training=self.training)  # disable dropout in eval mode
    x = F.relu(self.linear2(x))
    x = F.dropout(x, 0.8, training=self.training)
    x = F.log_softmax(self.linear3(x), dim=-1)
    return x
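# A minimal, self-contained check (not from the original code) of why the
# `training=self.training` argument matters above: F.dropout defaults to
# training=True, so a module that omits it keeps zeroing activations even
# after model.eval(). The tiny modules below are hypothetical and only
# illustrate the flag's effect.
import torch
import torch.nn as nn
import torch.nn.functional as F


class LeakyEval(nn.Module):
    def forward(self, x):
        return F.dropout(x, p=0.5)  # bug: ignores self.training


class RespectsEval(nn.Module):
    def forward(self, x):
        return F.dropout(x, p=0.5, training=self.training)


if __name__ == '__main__':
    x = torch.ones(4, 8)
    leaky, ok = LeakyEval().eval(), RespectsEval().eval()
    print((leaky(x) == 0).any().item())   # True: dropout still active after .eval()
    print((ok(x) == x).all().item())      # True: identity in eval mode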
def forward(self, h_out, fake_region, conv_feat, conv_feat_embed): # View into three dimensions att_size = conv_feat.numel() // conv_feat.size(0) // self.rnn_size conv_feat = conv_feat.view(-1, att_size, self.rnn_size) conv_feat_embed = conv_feat_embed.view(-1, att_size, self.att_hid_size) # view neighbor from bach_size * neighbor_num x rnn_size to bach_size x rnn_size * neighbor_num fake_region = self.fr_linear(fake_region) fake_region_embed = self.fr_embed(fake_region) h_out_linear = self.ho_linear(h_out) h_out_embed = self.ho_embed(h_out_linear) txt_replicate = h_out_embed.unsqueeze(1).expand(h_out_embed.size(0), att_size + 1, h_out_embed.size(1)) img_all = torch.cat([fake_region.view(-1,1,self.input_encoding_size), conv_feat], 1) img_all_embed = torch.cat([fake_region_embed.view(-1,1,self.input_encoding_size), conv_feat_embed], 1) hA = F.tanh(img_all_embed + txt_replicate) hA = F.dropout(hA,self.drop_prob_lm, self.training) hAflat = self.alpha_net(hA.view(-1, self.att_hid_size)) PI = F.softmax(hAflat.view(-1, att_size + 1)) visAtt = torch.bmm(PI.unsqueeze(1), img_all) visAttdim = visAtt.squeeze(1) atten_out = visAttdim + h_out_linear h = F.tanh(self.att2h(atten_out)) h = F.dropout(h, self.drop_prob_lm, self.training) return h
def _forward_unpadded(self, x, x_mask): """Faster encoding that ignores any padding.""" # Transpose batch and sequence dims x = x.transpose(0, 1) # Encode all layers outputs = [x] for i in range(self.num_layers): rnn_input = outputs[-1] # Apply dropout to hidden input if self.dropout_rate > 0: rnn_input = F.dropout(rnn_input, p=self.dropout_rate, training=self.training) # Forward rnn_output = self.rnns[i](rnn_input)[0] outputs.append(rnn_output) # Concat hidden layers if self.concat_layers: output = torch.cat(outputs[1:], 2) else: output = outputs[-1] # Transpose back output = output.transpose(0, 1) # Dropout on output layer if self.dropout_output and self.dropout_rate > 0: output = F.dropout(output, p=self.dropout_rate, training=self.training) return output
def forward(self, x):
    # pass self.training as the `training` flag, not as the dropout probability
    y = F.dropout(F.relu(self.linears[0](x)), training=self.training)
    for layer in self.linears[1:-1]:
        y = F.relu(layer(y))
        y = F.dropout(y, training=self.training)
    y = F.log_softmax(self.linears[-1](y), dim=-1)
    return y
def forward(self, src_tokens): bsz, seqlen = src_tokens.size() num_layers = len(self.layers) # embed tokens x = self.embed_tokens(src_tokens) x = F.dropout(x, p=self.dropout_in, training=self.training) embed_dim = x.size(2) # B x T x C -> T x B x C x = x.transpose(0, 1) final_hiddens, final_cells = [], [] outs = [x[j] for j in range(seqlen)] for i, rnn in enumerate(self.layers): hidden = Variable(x.data.new(bsz, embed_dim).zero_()) cell = Variable(x.data.new(bsz, embed_dim).zero_()) for j in range(seqlen): # recurrent cell hidden, cell = rnn(outs[j], (hidden, cell)) # store the most recent hidden state in outs, either to be used # as the input for the next layer, or as the final output outs[j] = F.dropout(hidden, p=self.dropout_out, training=self.training) # save the final hidden and cell states for every layer final_hiddens.append(hidden) final_cells.append(cell) # collect outputs across time steps x = torch.cat(outs, dim=0).view(seqlen, bsz, embed_dim) final_hiddens = torch.cat(final_hiddens, dim=0).view(num_layers, bsz, embed_dim) final_cells = torch.cat(final_cells, dim=0).view(num_layers, bsz, embed_dim) return x, final_hiddens, final_cells
def forward(self, inp, hidden=None, schedule=None, **kwargs): """ Parameters: ----------- inp: torch.Tensor (seq_len x batch_size) Returns: -------- outs: torch.Tensor (seq_len * batch_size x vocab) hidden: see output of RNN, GRU, LSTM in torch.nn weights: None or list of weights (batch_size x seq_len), It will only be not None if attention is provided. """ inp = word_dropout( inp, self.target_code, p=self.word_dropout, reserved_codes=self.reserved_codes, training=self.training) emb = self.embeddings(inp) if self.has_dropout: emb = F.dropout(emb, p=self.dropout, training=self.training) outs, hidden = self.rnn(emb, hidden or self.init_hidden_for(emb)) if self.has_dropout: outs = F.dropout(outs, p=self.dropout, training=self.training) weights = None if self.add_attn: outs, weights = self.attn(outs, emb) seq_len, batch, hid_dim = outs.size() outs = outs.view(seq_len * batch, hid_dim) if self.add_deepout: outs = self.deepout(outs) outs = F.log_softmax(self.project(outs)) return outs, hidden, weights
def forward(self, x):
    x = x.view(-1, 28 * 28)
    x = F.relu(self.fc1(x))
    x = F.dropout(x, p=0.8, training=self.training)
    x = F.relu(self.fc2(x))
    x = F.dropout(x, p=0.8, training=self.training)
    x = self.fc3(x)
    return x
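# Sketch of a module definition that the forward above presupposes: three
# linear layers over a flattened 28x28 input. The hidden sizes (and keeping
# the unusually aggressive p=0.8) are assumptions for illustration, not taken
# from the original model.
import torch
import torch.nn as nn
import torch.nn.functional as F


class MLP(nn.Module):
    def __init__(self, hidden=256, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.8, training=self.training)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, p=0.8, training=self.training)
        return self.fc3(x)


if __name__ == '__main__':
    logits = MLP()(torch.randn(32, 1, 28, 28))
    print(logits.shape)  # torch.Size([32, 10])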
def forward(self, xt, img_fc, state): hs = [] cs = [] for L in range(self.num_layers): # c,h from previous timesteps prev_h = state[0][L] prev_c = state[1][L] # the input to this layer if L == 0: x = xt i2h = self.w2h(x) + self.v2h(img_fc) else: x = hs[-1] x = F.dropout(x, self.drop_prob_lm, self.training) i2h = self.i2h[L-1](x) all_input_sums = i2h+self.h2h[L](prev_h) sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size) sigmoid_chunk = F.sigmoid(sigmoid_chunk) # decode the gates in_gate = sigmoid_chunk.narrow(1, 0, self.rnn_size) forget_gate = sigmoid_chunk.narrow(1, self.rnn_size, self.rnn_size) out_gate = sigmoid_chunk.narrow(1, self.rnn_size * 2, self.rnn_size) # decode the write inputs if not self.use_maxout: in_transform = F.tanh(all_input_sums.narrow(1, 3 * self.rnn_size, self.rnn_size)) else: in_transform = all_input_sums.narrow(1, 3 * self.rnn_size, 2 * self.rnn_size) in_transform = torch.max(\ in_transform.narrow(1, 0, self.rnn_size), in_transform.narrow(1, self.rnn_size, self.rnn_size)) # perform the LSTM update next_c = forget_gate * prev_c + in_gate * in_transform # gated cells form the output tanh_nex_c = F.tanh(next_c) next_h = out_gate * tanh_nex_c if L == self.num_layers-1: if L == 0: i2h = self.r_w2h(x) + self.r_v2h(img_fc) else: i2h = self.r_i2h(x) n5 = i2h+self.r_h2h(prev_h) fake_region = F.sigmoid(n5) * tanh_nex_c cs.append(next_c) hs.append(next_h) # set up the decoder top_h = hs[-1] top_h = F.dropout(top_h, self.drop_prob_lm, self.training) fake_region = F.dropout(fake_region, self.drop_prob_lm, self.training) state = (torch.cat([_.unsqueeze(0) for _ in hs], 0), torch.cat([_.unsqueeze(0) for _ in cs], 0)) return top_h, fake_region, state
def forward(self, inputs):
    # inputs: (batch_size, "sentence" length) = bs, n
    n = inputs.size(1)
    embeds = self.embeddings(inputs)            # bs, n, 300
    embeds = embeds.view(-1, n * 300)           # bs, n*300
    out = F.tanh(self.h(embeds))                # bs, hidden_size
    out = self.u(F.dropout(out, p=dropout_rate, training=self.training))  # bs, |V|
    embeds = F.dropout(embeds, p=dropout_rate, training=self.training)
    out += self.w(embeds)                       # bs, |V|
    # out = F.softmax(out, dim=1)
    return out
def hidden_to_idx(self, hidden, is_training=False):
    """Convert hidden state vectors into indices into the dictionary."""
    # dropout at each step
    e = F.dropout(self.h2e(hidden), p=self.dropout, training=is_training)
    scores = F.dropout(self.e2o(e), p=self.dropout, training=is_training)
    # skip zero (null_idx) when selecting a score
    _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2)
    # add one back to index since we removed first option
    return idx.add_(1), scores
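# Small, self-contained illustration (not from the original project) of the
# "skip the null index" trick used above: drop column 0 with narrow(), take
# the argmax over the remaining vocabulary, then add 1 to map back to the
# original indexing.
import torch

scores = torch.tensor([[[9.0, 1.0, 5.0, 2.0]]])   # (batch=1, seq=1, vocab=4); index 0 is the null token
_max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2)
idx = idx.add_(1)
print(idx)  # tensor([[2]]) -- best non-null token, never the null index 0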
def forward(self, input):
    x = F.leaky_relu(self.fc1(input), 0.2)
    x = F.dropout(x, 0.3, training=self.training)
    x = F.leaky_relu(self.fc2(x), 0.2)
    x = F.dropout(x, 0.3, training=self.training)
    x = F.leaky_relu(self.fc3(x), 0.2)
    x = F.dropout(x, 0.3, training=self.training)
    x = F.sigmoid(self.fc4(x))
    return x
def forward(self, x):
    x = F.relu(self.conv1(x))       # 28x28x1  -> 26x26x32
    x = F.relu(self.conv2(x))       # 26x26x32 -> 24x24x64
    x = F.max_pool2d(x, 2)          # 24x24x64 -> 12x12x64
    x = F.dropout(x, p=0.25, training=self.training)
    x = x.view(-1, 12 * 12 * 64)    # flatten 12x12x64 = 9216
    x = F.relu(self.fc1(x))         # fc 9216 -> 128
    x = F.dropout(x, p=0.5, training=self.training)
    x = self.fc2(x)                 # fc 128 -> 10
    return F.log_softmax(x, dim=1)  # log-probabilities over the 10 classes
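# A sketch of the layer definitions that the shape comments above imply
# (3x3 convs without padding: 28 -> 26 -> 24, then a 2x2 max-pool to 12).
# This __init__ is reconstructed from those comments as an assumption, not
# the original module.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SmallConvNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)    # 28x28x1  -> 26x26x32
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)   # 26x26x32 -> 24x24x64
        self.fc1 = nn.Linear(12 * 12 * 64, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = F.dropout(x, p=0.25, training=self.training)
        x = x.view(-1, 12 * 12 * 64)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        return F.log_softmax(self.fc2(x), dim=1)


if __name__ == '__main__':
    print(SmallConvNet()(torch.randn(2, 1, 28, 28)).shape)  # torch.Size([2, 10])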
def _forward_padded(self, x, x_mask): """Slower (significantly), but more precise, encoding that handles padding.""" # Compute sorted sequence lengths lengths = x_mask.data.eq(0).long().sum(1).squeeze() _, idx_sort = torch.sort(lengths, dim=0, descending=True) _, idx_unsort = torch.sort(idx_sort, dim=0) lengths = list(lengths[idx_sort]) idx_sort = Variable(idx_sort) idx_unsort = Variable(idx_unsort) # Sort x x = x.index_select(0, idx_sort) # Transpose batch and sequence dims x = x.transpose(0, 1) # Pack it up rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths) # Encode all layers outputs = [rnn_input] for i in range(self.num_layers): rnn_input = outputs[-1] # Apply dropout to input if self.dropout_rate > 0: dropout_input = F.dropout(rnn_input.data, p=self.dropout_rate, training=self.training) rnn_input = nn.utils.rnn.PackedSequence(dropout_input, rnn_input.batch_sizes) outputs.append(self.rnns[i](rnn_input)[0]) # Unpack everything for i, o in enumerate(outputs[1:], 1): outputs[i] = nn.utils.rnn.pad_packed_sequence(o)[0] # Concat hidden layers or take final if self.concat_layers: output = torch.cat(outputs[1:], 2) else: output = outputs[-1] # Transpose and unsort output = output.transpose(0, 1) output = output.index_select(0, idx_unsort) # Dropout on output layer if self.dropout_output and self.dropout_rate > 0: output = F.dropout(output, p=self.dropout_rate, training=self.training) return output
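# Self-contained sketch (assumed sizes and names, not the original module) of
# the pack -> per-layer dropout on PackedSequence.data -> unpack pattern used
# in _forward_padded above.
import torch
import torch.nn as nn
import torch.nn.functional as F

rnn = nn.LSTM(input_size=8, hidden_size=16, batch_first=False)
x = torch.randn(5, 3, 8)                 # (seq_len, batch, feat)
lengths = [5, 4, 2]                      # already sorted descending, as in the code above

packed = nn.utils.rnn.pack_padded_sequence(x, lengths)
dropped = F.dropout(packed.data, p=0.3, training=True)             # dropout on the flat data tensor
packed = nn.utils.rnn.PackedSequence(dropped, packed.batch_sizes)  # re-wrap with the same batch_sizes
out_packed, _ = rnn(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(out_packed)    # back to (seq_len, batch, hidden)
print(out.shape, out_lengths.tolist())   # torch.Size([5, 3, 16]) [5, 4, 2]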
def forward(self, input, hidden):
    # input is (sentence length, batch size) = n, bs
    # hidden is ((n_layers, bs, hidden_size), (n_layers, bs, hidden_size))
    embeds = self.embedding(input)  # n, bs, 300
    # batch goes along the second dimension
    out = F.dropout(embeds, p=dropout_rate, training=self.training)
    out, hidden = self.lstm(out, hidden)
    out = F.dropout(out, p=dropout_rate, training=self.training)
    # apply the linear layer (the softmax was originally applied here)
    out = self.linear(out)  # n, bs, |V|
    # out = self.softmax(out)  # This was originally the output. (SG: I see this is LogSoftmax)
    return out, hidden
def _forward(self, input_tokens, positions, encoder_out): # split and transpose encoder outputs encoder_a, encoder_b = self._split_encoder_out(encoder_out) # embed tokens and positions x = self.embed_tokens(input_tokens) + self.embed_positions(positions) x = F.dropout(x, p=self.dropout, training=self.training) target_embedding = x # project to size of convolution x = self.fc1(x) # B x T x C -> T x B x C x = self._transpose_unless_incremental_eval(x) # temporal convolutions avg_attn_scores = None num_attn_layers = len(self.attention) for proj, conv, attention in zip(self.projections, self.convolutions, self.attention): residual = x if proj is None else proj(x) x = F.dropout(x, p=self.dropout, training=self.training) x = conv(x) x = conv.remove_future_timesteps(x) x = F.glu(x) # attention if attention is not None: x = self._transpose_unless_incremental_eval(x) x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b)) attn_scores = attn_scores / num_attn_layers if avg_attn_scores is None: avg_attn_scores = attn_scores else: avg_attn_scores.add_(attn_scores) x = self._transpose_unless_incremental_eval(x) # residual x = (x + residual) * math.sqrt(0.5) # T x B x C -> B x T x C x = self._transpose_unless_incremental_eval(x) # project back to size of vocabulary x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = self.fc3(x) return x, avg_attn_scores
def forward(self, xs, hidden, encoder_output, attn_mask=None): xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training) xes = self.attention(xes, hidden, encoder_output, attn_mask) output, new_hidden = self.rnn(xes, hidden) # TODO: add post-attention? # output = self.attention(output, new_hidden, encoder_output, attn_mask) e = F.dropout(self.o2e(output), p=self.dropout, training=self.training) scores = F.dropout(self.e2s(e), p=self.dropout, training=self.training) # select top scoring index, excluding the padding symbol (at idx zero) _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2) preds = idx.add_(1) return preds, scores, new_hidden
def forward(self, x1, x2):
    x1 = F.dropout(F.relu(self.layer1_1(x1.view(-1, 784))), self.drop, training=self.training)
    x2 = F.dropout(F.relu(self.layer1_2(x2.view(-1, 784))), self.drop, training=self.training)
    x = F.dropout(F.relu(self.layer2(torch.cat((x1, x2), 1))), self.drop, training=self.training)
    x = F.dropout(F.relu(self.layer3(x)), self.drop, training=self.training)
    x = F.dropout(F.relu(self.layer4(x)), self.drop, training=self.training)
    out1 = F.relu(self.layer5_1(x))
    out1 = F.sigmoid(self.layer6_1(out1))
    out2 = F.relu(self.layer5_2(x))
    out2 = F.sigmoid(self.layer6_2(out2))
    return out1, out2
def forward(self, x, encoder_out, encoder_padding_mask, incremental_state): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. Returns: encoded output of shape `(batch, src_len, embed_dim)` """ residual = x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) x, _ = self.self_attn( query=x, key=x, value=x, mask_future_timesteps=True, incremental_state=incremental_state, need_weights=False, ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) attn = None if self.encoder_attn is not None: residual = x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=(not self.training and self.need_attn), ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) residual = x x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) x = F.relu(self.fc1(x)) x = F.dropout(x, p=self.relu_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) return x, attn
def forward(self, text_sequences, text_positions=None, lengths=None, speaker_embed=None): assert self.n_speakers == 1 or speaker_embed is not None # embed text_sequences x = self.embed_tokens(text_sequences.long()) x = F.dropout(x, p=self.dropout, training=self.training) # expand speaker embedding for all time steps speaker_embed_btc = None input_embedding = x # B x T x C -> B x C x T x = x.transpose(1, 2) # 1D conv blocks for f in self.convolutions: x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x) # Back to B x T x C keys = x.transpose(1, 2) # scale gradients (this only affects backward, not forward) # add output to input embedding for attention values = (keys + input_embedding) * math.sqrt(0.5) return keys, values
def forward(self, emb, hidden):
    ques_feat, hidden = self.ques_rnn(emb, hidden)
    concat_feat = F.dropout(ques_feat[-1], self.d, training=self.training)
    encoder_feat = F.tanh(self.fc1(concat_feat))
    return encoder_feat, hidden
def forward(self, x):
    x = self.conv1(x)
    x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
    x = self.block1(x)
    x = self.group1(x)
    x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
    x = self.block2(x)
    x = self.group2(x)
    x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
    x = self.block3(x)
    x = self.group3(x)
    x = self.block4(x)
    x = self.group4(x)
    x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
    x = x.view(x.size(0), -1)
    fc = self.fc(x)
    x = F.dropout(fc, training=self.training)
    output = list()
    for name, fun in self.fc_dict.items():  # iteritems() is Python 2 only
        out = fun(x)
        output.append(out)
    return output, fc
def forward(self, X, posterior_mean=False):
    """
    Function call to generate the output; every time we call it, the dynamic
    graph is created. Forward can differ between training and test:
        - In dropout we do not zero neurons at test time.
        - In Variational Inference we do not randomly sample from the posterior.
    We create the forward pass by performing operations between the input X
    (Nsam_batch, Ndim) and the parameters of the model that we should have
    initialized in the __init__.
    """
    # We need to sample from the posterior!
    self.sample_posterior(posterior_mean)

    o1 = self.linear1(X)
    # o1 = torch.mm(X, self.W1) + self.b1
    # print("x shape: ", X.shape, "W1 shape: ", self.W1.shape, "b1 shape: ", self.b1.shape)
    # print("o1 shape: ", o1.shape)
    # print("W2 shape: ", self.W2.shape, "b2 shape: ", self.b2.shape)

    # Apply non-linearity
    o1 = self.cf_a.activation_func(o1)
    o1 = F.dropout(o1, p=self.cf_a.dop, training=self.training)
    o2 = torch.mm(o1, self.W2) + self.b2
    # print("o2 shape: ", o2.shape)
    return o2
def forward(self, xs): bsz = len(xs) # embed input tokens xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training) x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True) zeros = self.zeros(xs) if zeros.size(1) != bsz: zeros.resize_(self.layers * self.dirs, bsz, self.hsz).fill_(0) h0 = Variable(zeros, requires_grad=False) if type(self.rnn) == nn.LSTM: encoder_output_packed, hidden = self.rnn(xes_packed, (h0, h0)) # take elementwise max between forward and backward hidden states hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0], hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0]) else: encoder_output_packed, hidden = self.rnn(xes_packed, h0) # take elementwise max between forward and backward hidden states hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0] encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True) return encoder_output, hidden
def forward(self, input_):
    emb_input = self._embedding(input_)
    conv_in = F.dropout(emb_input.transpose(1, 2),
                        self._dropout, training=self.training)
    output = torch.cat([F.relu(conv(conv_in)).max(dim=2)[0]
                        for conv in self._convs], dim=1)
    return output
def forward(self, prev, hidden, enc_outs, out=None, enc_att=None, mask=None): """ Parameters: ----------- prev: torch.Tensor (batch x emb_dim), Previously decoded output. hidden: Used to seed the initial hidden state of the decoder. h_t: (enc_num_layers x batch x hid_dim) c_t: (enc_num_layers x batch x hid_dim) enc_outs: torch.Tensor (seq_len x batch x enc_hid_dim), Output of the encoder at the last layer for all encoding steps. """ if self.add_prev: # include last out as input for the prediction of the next item inp = torch.cat([prev, out or self.init_output_for(hidden)], 1) else: inp = prev out, hidden = self.rnn_step(inp, hidden) # out (batch x hid_dim), att_weight (batch x seq_len) out, att_weight = self.attn(out, enc_outs, enc_att=enc_att, mask=mask) out = F.dropout(out, p=self.dropout, training=self.training) if self.has_maxout: out = self.maxout(torch.cat([out, prev], 1)) return out, hidden, att_weight
def forward(self, x):
    x = self.features(x)
    x = x.view(x.size(0), -1)
    x = self.fc1(x)
    x = F.dropout(x, training=self.training)
    out = self.fc2(x)
    return out, x
def forward(self, s):
    # s: batch_size x board_x x board_y
    s = s.view(-1, 1, self.board_x, self.board_y)  # batch_size x 1 x board_x x board_y
    s = F.relu(self.bn1(self.conv1(s)))            # batch_size x num_channels x board_x x board_y
    s = F.relu(self.bn2(self.conv2(s)))            # batch_size x num_channels x board_x x board_y
    s = F.relu(self.bn3(self.conv3(s)))            # batch_size x num_channels x (board_x-2) x (board_y-2)
    s = F.relu(self.bn4(self.conv4(s)))            # batch_size x num_channels x (board_x-4) x (board_y-4)
    s = s.view(-1, self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))

    s = F.dropout(F.relu(self.fc_bn1(self.fc1(s))), p=self.args.dropout, training=self.training)  # batch_size x 1024
    s = F.dropout(F.relu(self.fc_bn2(self.fc2(s))), p=self.args.dropout, training=self.training)  # batch_size x 512

    pi = self.fc3(s)  # batch_size x action_size
    v = self.fc4(s)   # batch_size x 1

    return F.log_softmax(pi, dim=1), F.tanh(v)
def forward(input, hidden, weight): assert(len(weight) == total_layers) next_hidden = [] if lstm: hidden = list(zip(*hidden)) for i in range(num_layers): all_output = [] for j, inner in enumerate(inners): l = i * num_directions + j hy, output = inner(input, hidden[l], weight[l]) next_hidden.append(hy) all_output.append(output) input = torch.cat(all_output, input.dim() - 1) if dropout != 0 and i < num_layers - 1: input = F.dropout(input, p=dropout, training=train, inplace=False) if lstm: next_h, next_c = zip(*next_hidden) next_hidden = ( torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) ) else: next_hidden = torch.cat(next_hidden, 0).view( total_layers, *next_hidden[0].size()) return next_hidden, input
def logits(self, x):
    x = self.global_pool(x)
    if self.drop_rate > 0.:
        x = F.dropout(x, p=self.drop_rate, training=self.training)
    x = self.last_linear(x)
    return x
def forward(self, x, edge_index):
    x = F.relu(self.conv1(x, edge_index, None))
    x = F.dropout(x, training=self.training)
    x = self.conv2(x, edge_index, None)
    return F.log_softmax(x, dim=1)
def forward(self, emb_question, question_length, emb_support, support_length,
            unique_word_chars, unique_word_char_length, question_words2unique,
            support_words2unique, word_in_question, correct_start,
            answer2support, is_eval):
    """fast_qa model

    Args:
        emb_question: [Q, L_q, N]
        question_length: [Q]
        emb_support: [Q, L_s, N]
        support_length: [Q]
        unique_word_chars
        unique_word_char_length
        question_words2unique
        support_words2unique
        word_in_question: [Q, L_s]
        correct_start: [A], only during training, i.e., is_eval=False
        answer2question: [A], only during training, i.e., is_eval=False
        is_eval: []

    Returns:
        start_scores [B, L_s, N], end_scores [B, L_s, N], span_prediction [B, 2]
    """
    # Some helpers
    float_tensor = torch.cuda.FloatTensor if emb_question.is_cuda else torch.FloatTensor
    long_tensor = torch.cuda.LongTensor if emb_question.is_cuda else torch.LongTensor
    batch_size = question_length.data.shape[0]
    max_question_length = question_length.max().data[0]
    support_mask = misc.mask_for_lengths(support_length)
    question_binary_mask = misc.mask_for_lengths(question_length, mask_right=False, value=1.0)

    if self._with_char_embeddings:
        # compute combined embeddings
        [char_emb_question, char_emb_support] = self._conv_char_embedding(
            unique_word_chars, unique_word_char_length,
            [question_words2unique, support_words2unique])
        emb_question = torch.cat([emb_question, char_emb_question], 2)
        emb_support = torch.cat([emb_support, char_emb_support], 2)

    # compute encoder features
    question_features = torch.autograd.Variable(
        torch.ones(batch_size, max_question_length, 2, out=float_tensor()))
    question_features = question_features.type_as(emb_question)

    v_wiqw = self._v_wiq_w
    # [B, L_q, L_s]
    wiq_w = torch.matmul(emb_question * v_wiqw, emb_support.transpose(1, 2))
    # [B, L_q, L_s]
    wiq_w = wiq_w + support_mask.unsqueeze(1)
    wiq_w = F.softmax(wiq_w.view(batch_size * max_question_length, -1)).view(
        batch_size, max_question_length, -1)
    # [B, L_s]
    wiq_w = torch.matmul(question_binary_mask.unsqueeze(1), wiq_w).squeeze(1)

    # [B, L, 2]
    support_features = torch.stack([word_in_question, wiq_w], dim=2)

    if self._with_char_embeddings:
        # highway layer to allow for interaction between concatenated embeddings
        emb_question = self._embedding_projection(emb_question)
        emb_support = self._embedding_projection(emb_support)
        emb_question = self._embedding_highway(emb_question)
        emb_support = self._embedding_highway(emb_support)

    # dropout
    dropout = self._shared_resources.config.get("dropout", 0.0)
    emb_question = F.dropout(emb_question, dropout, training=not is_eval)
    emb_support = F.dropout(emb_support, dropout, training=not is_eval)

    # extend embeddings with features
    emb_question_ext = torch.cat([emb_question, question_features], 2)
    emb_support_ext = torch.cat([emb_support, support_features], 2)

    # encode question and support
    # [B, L, 2 * size]
    encoded_question = self._bilstm(emb_question_ext)[0]
    encoded_support = self._bilstm(emb_support_ext)[0]

    # [B, L, size]
    encoded_support = F.tanh(F.linear(encoded_support, self._support_projection))
    encoded_question = F.tanh(F.linear(encoded_question, self._question_projection))

    start_scores, end_scores, predicted_start_pointer, predicted_end_pointer = \
        self._answer_layer(encoded_question, question_length, encoded_support,
                           support_length, correct_start, answer2support, is_eval)

    # no multi paragraph support yet
    doc_idx = torch.autograd.Variable(
        torch.zeros(predicted_start_pointer.data.shape[0], out=long_tensor()))
    span = torch.stack([doc_idx, predicted_start_pointer, predicted_end_pointer], 1)

    return start_scores, end_scores, span
def forward(self, x):
    x = super(DropoutConv2d, self).forward(x)
    x = F.dropout(x, p=self.drop, training=True)
    return x
def test_dropout(self):
    x = torch.randn(3, 4, requires_grad=True)
    self.assertONNX(lambda x: torch.max(functional.dropout(x, training=False)), x)
def forward(self, x):
    x = self.fc2(F.dropout(self.bn(self.fc1(x)), training=self.training))
    return x
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # map oov tokens to unk tokens prev_output_tokens = prev_output_tokens.masked_fill( prev_output_tokens >= self.embed_tokens.num_embeddings, self.dictionary.unk_index) # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) copy_attn, copy_alpha = None, None if self.copy_attention: assert encoder_out is not None, \ "--copy-attn can't be used with decoder only architecture" x_copy, copy_attn = self.copy_attn_layer( query=x, key=encoder_out['encoder_out'], value=encoder_out['encoder_out'], key_padding_mask=encoder_out['encoder_padding_mask'], incremental_state=incremental_state, static_kv=True, need_weights=True, ) x_copy = x_copy.transpose(0, 1) copy_alpha = torch.sigmoid(self.copy_alpha_linear(x_copy)) attn = copy_attn # use copy attn for alignment if self.normalize: x = self.layer_norm(x) # todo: layer norm # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = F.linear(x, self.embed_out) return x, {'attn': attn, 'inner_states': inner_states, 'copy_attn': copy_attn, 'copy_alpha': copy_alpha, 'src_tokens': encoder_out['src_tokens']}
def forward(self, input_v, input_q):
    # input_v and input_q are both 2-D, (batch_size, d_v): image and question inputs share this layout
    if input_v.dim() != input_q.dim() and input_v.dim() != 2:
        raise ValueError
    batch_size = input_v.size(0)

    # process the image and question embeddings separately:
    # dropout --> linear (d_v 2048 / d_q 2400 --> 310) --> tanh
    if self.visual_embedding:
        x_v = F.dropout(input_v, p=self.opt['dropout_v'], training=self.training)
        x_v = self.linear_v(x_v)
        if 'activation_v' in self.opt:
            x_v = getattr(F, self.opt['activation_v'])(x_v)
    else:
        x_v = input_v

    if self.question_embedding:
        x_q = F.dropout(input_q, p=self.opt['dropout_q'], training=self.training)
        x_q = self.linear_q(x_q)
        if 'activation_q' in self.opt:
            x_q = getattr(F, self.opt['activation_q'])(x_q)
    else:
        x_q = input_q

    # Rank-R constraint: (in the paper) Z is expressed as the sum of R terms Z_r
    # (Z is then projected onto the prediction space y).
    # The processed image and question are fused by element-wise multiplication,
    # then stacked and summed; the resulting x_mm corresponds to Z in the paper.
    x_mm = []
    for i in range(self.opt['R']):
        # loop over the R projections independently and store each one in x_mm
        # process image and question embeddings separately:
        # dropout --> linear (310 --> 510) --> tanh
        x_hv = F.dropout(x_v, p=self.opt['dropout_hv'], training=self.training)
        x_hv = self.list_linear_hv[i](x_hv)  # size becomes 510 after the linear layer
        if 'activation_hv' in self.opt:  # tanh
            x_hv = getattr(F, self.opt['activation_hv'])(x_hv)

        x_hq = F.dropout(x_q, p=self.opt['dropout_hq'], training=self.training)
        x_hq = self.list_linear_hq[i](x_hq)
        if 'activation_hq' in self.opt:
            x_hq = getattr(F, self.opt['activation_hq'])(x_hq)

        # fuse with element-wise mul(); the size is unchanged, but there are R such terms:
        # x_mm holds R tensors of shape [batch_size, 510]
        x_mm.append(torch.mul(x_hq, x_hv))

    x_mm = torch.stack(x_mm, dim=1)  # stack the R tensors along dim 1
    x_mm = x_mm.sum(1).view(batch_size, self.opt['dim_mm'])  # sum over dim 1, back to (batch_size, 510)

    if 'activation_mm' in self.opt:
        x_mm = getattr(F, self.opt['activation_mm'])(x_mm)  # activation_mm = softmax

    # this is the model output, used to predict the answer
    return x_mm
def forward(self, src_tokens, src_lengths): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (tuple): a tuple with two elements, where the first element is the last encoder layer's output and the second element is the same quantity summed with the input embedding (used for attention). The shape of both tensors is `(batch, src_len, embed_dim)`. - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) input_embedding = x # project to size of convolution x = self.fc1(x) # used to mask padding in input encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B if not encoder_padding_mask.any(): encoder_padding_mask = None # B x T x C -> T x B x C x = x.transpose(0, 1) residuals = [x] # temporal convolutions for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals): if res_layer > 0: residual = residuals[-res_layer] residual = residual if proj is None else proj(residual) else: residual = None if encoder_padding_mask is not None: x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) x = F.dropout(x, p=self.dropout, training=self.training) if conv.kernel_size[0] % 2 == 1: # padding is implicit in the conv x = conv(x) else: padding_l = (conv.kernel_size[0] - 1) // 2 padding_r = conv.kernel_size[0] // 2 x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r)) x = conv(x) x = F.glu(x, dim=2) if residual is not None: x = (x + residual) * math.sqrt(0.5) residuals.append(x) # T x B x C -> B x T x C x = x.transpose(1, 0) # project back to size of embedding x = self.fc2(x) if encoder_padding_mask is not None: encoder_padding_mask = encoder_padding_mask.t() # -> B x T x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) # scale gradients (this only affects backward, not forward) x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers)) # add output to input embedding for attention y = (x + input_embedding) * math.sqrt(0.5) return { 'encoder_out': (x, y), 'encoder_padding_mask': encoder_padding_mask, # B x T }
def forward(self, x):
    x = self.fc2(x)
    x = F.relu(x)
    x = F.dropout(x, training=self.training)
    x = self.fc3(x)
    return F.log_softmax(x, dim=1)
def forward(self, query, key, value, key_padding_mask=None, incremental_state=None,
            need_weights=True, static_kv=False, self_attn_mask=None,
            ngram_mask_matrix=None, i_buckets_main_stream=None,
            i_bucket_relative_stream=None, real_positions=None):
    """Input shape: Time x Batch x Channel

    Timesteps can be masked by supplying a T x T mask in the
    `attn_mask` argument. Padding elements can be excluded from
    the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
    batch x src_len, where padding elements are indicated by 1s.
    """
    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]

    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
        if 'prev_key' in saved_state:
            # previous time steps are cached - no need to recompute
            # key and value if they are static
            if static_kv:
                assert (self.encoder_decoder_attention and not self.self_attention)
                key = value = None
    else:
        saved_state = None

    q, k, v = self.in_proj_qkv(query)
    q *= self.scaling

    if self.bias_k is not None:
        assert self.bias_v is not None
        k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])

    q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    if k is not None:
        k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    if v is not None:
        v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

    # input hidden states
    h_list = query.chunk(1 + self.ngram, dim=0)
    q_list = q.chunk(1 + self.ngram, dim=1)
    k_list = k.chunk(1 + self.ngram, dim=1)
    v_list = v.chunk(1 + self.ngram, dim=1)

    h_main, h_predict_list = h_list[0], h_list[1:]
    q_main, q_predict_list = q_list[0], q_list[1:]
    k_main, k_predict_list = k_list[0], k_list[1:]
    v_main, v_predict_list = v_list[0], v_list[1:]

    if saved_state is not None:
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if 'prev_key' in saved_state:
            prev_key = saved_state['prev_key'].view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                assert False, 'static_kv not supported in ngram decoder'
                k = prev_key
            else:
                k_main = torch.cat((prev_key, k_main), dim=1)
        if 'prev_value' in saved_state:
            prev_value = saved_state['prev_value'].view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                v_main = torch.cat((prev_value, v_main), dim=1)
        saved_state['prev_key'] = k_main.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state['prev_value'] = v_main.view(bsz, self.num_heads, -1, self.head_dim)
        self._set_input_buffer(incremental_state, saved_state)

    real_tgt_len = tgt_len // (1 + self.ngram)

    attn_weights_main = torch.bmm(q_main, k_main.transpose(1, 2))

    main_relative_logits = self.main_stream_relative_logits(
        h_main, attn_weights_main, real_positions, i_buckets_main_stream)
    attn_weights_main = attn_weights_main + main_relative_logits

    if self_attn_mask is not None:
        self_attn_mask = self_attn_mask.unsqueeze(0)
        attn_weights_main = attn_weights_main + self_attn_mask

    attn_weights_main = utils.softmax(
        attn_weights_main, dim=-1, onnx_trace=self.onnx_trace,
    ).type_as(attn_weights_main)
    attn_weights_main = F.dropout(attn_weights_main, p=self.dropout, training=self.training)

    attn_main = torch.bmm(attn_weights_main, v_main)
    attn_main = attn_main.transpose(0, 1).contiguous().view(1, real_tgt_len, bsz, embed_dim)
    attn_main = self.out_proj(attn_main)

    # [ngram, B*head, T, c]
    q_ngram = torch.cat(q_predict_list, 0).view(self.ngram, -1, real_tgt_len, self.head_dim)
    # [ngram, B*head, 2*T, c]
    k_ngram = torch.cat([torch.cat([k_main, k_p], 1).unsqueeze(0) for k_p in k_predict_list], 0)
    # below code slower than above for loop
    # k_ngram = torch.cat([k_main.unsqueeze(0).repeat(self.ngram, 1, 1, 1),
    #                      torch.cat(k_predict_list).view(
    #                          self.ngram, -1, real_tgt_len, self.head_dim)
    #                      ], 2)

    # [ngram, T, B, C]
    h_ngram = torch.cat(h_predict_list, 0).view(self.ngram, real_tgt_len, bsz, embed_dim)

    # [ngram, B*head, 2*T, c]
    v_ngram = torch.cat([torch.cat([v_main, v_p], 1).unsqueeze(0) for v_p in v_predict_list], 0)
    # below code slower than above for loop
    # v_ngram = torch.cat([v_main.unsqueeze(0).repeat(self.ngram, 1, 1, 1),
    #                      torch.cat(v_predict_list).view(self.ngram, -1,
    #                                                     real_tgt_len, self.head_dim)], 2)

    # [ngram, B*head, T, 2*T]
    attn_weights_ngram = torch.einsum('nbtc,nbsc->nbts', (q_ngram, k_ngram))

    # [ngram, B*head, T, S]
    predict_relative_logits = self.ngram_relative_logits(
        h_ngram, attn_weights_ngram, real_positions, i_bucket_relative_stream)
    # [ngram, B*head, T, 2*T]
    attn_weights_ngram = attn_weights_ngram + predict_relative_logits

    if ngram_mask_matrix is not None:
        ngram_mask_matrix = ngram_mask_matrix.unsqueeze(1)
        attn_weights_ngram = attn_weights_ngram + ngram_mask_matrix

    attn_weights_ngram = utils.softmax(
        attn_weights_ngram, dim=-1, onnx_trace=self.onnx_trace,
    ).type_as(attn_weights_ngram)
    attn_weights_ngram = F.dropout(attn_weights_ngram, p=self.dropout, training=self.training)

    # [ngram, B*head, T, c]
    attn_ngram = torch.einsum('nbts,nbsc->nbtc', (attn_weights_ngram, v_ngram))
    # [ngram, T, B, C]
    attn_ngram = attn_ngram.transpose(1, 2).contiguous().view(self.ngram, real_tgt_len, bsz, embed_dim)
    attn_ngram = self.out_proj(attn_ngram)

    attn_result = []
    attn_result.append(attn_main)
    attn_result.append(attn_ngram)

    # [1+ngram*T, B, C]
    attn = torch.cat(attn_result, 0).view(-1, bsz, embed_dim)
    return attn, None
def fwd(self, x, lengths, causal, src_enc=None, src_len=None, positions=None, langs=None, cache=None): """ Inputs: `x` LongTensor(slen, bs), containing word indices `lengths` LongTensor(bs), containing the length of each sentence `causal` Boolean, if True, the attention is only done over previous hidden states `positions` LongTensor(slen, bs), containing word positions `langs` LongTensor(slen, bs), containing language IDs """ # lengths = (x != self.pad_index).float().sum(dim=1) # mask = x != self.pad_index # check inputs slen, bs = x.size() assert lengths.size(0) == bs assert lengths.max().item() <= slen x = x.transpose(0, 1) # batch size as dimension 0 assert (src_enc is None) == (src_len is None) if src_enc is not None: assert self.is_decoder assert src_enc.size(0) == bs # generate masks mask, attn_mask = get_masks(slen, lengths, causal) if self.is_decoder and src_enc is not None: src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # positions if positions is None: positions = x.new(slen).long() positions = torch.arange(slen, out=positions).unsqueeze(0) else: assert positions.size() == (slen, bs) positions = positions.transpose(0, 1) # langs if langs is not None: assert langs.size() == (slen, bs) langs = langs.transpose(0, 1) # do not recompute cached elements if cache is not None: _slen = slen - cache['slen'] x = x[:, -_slen:] positions = positions[:, -_slen:] if langs is not None: langs = langs[:, -_slen:] mask = mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:] # embeddings tensor = self.embeddings(x) tensor = tensor + self.position_embeddings(positions).expand_as(tensor) if langs is not None and self.use_lang_emb: tensor = tensor + self.lang_embeddings(langs) tensor = self.layer_norm_emb(tensor) tensor = F.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers for i in range(self.n_layers): # self attention attn = self.attentions[i](tensor, attn_mask, cache=cache) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) # encoder attention (for decoder only) if self.is_decoder and src_enc is not None: attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) attn = F.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm15[i](tensor) # FFN if ('%i_in' % i) in self.memories: tensor = tensor + self.memories['%i_in' % i](tensor) else: tensor = tensor + self.ffns[i](tensor) tensor = self.layer_norm2[i](tensor) # memory if ('%i_after' % i) in self.memories: tensor = tensor + self.memories['%i_after' % i](tensor) # TODO: add extra layer norm here? tensor *= mask.unsqueeze(-1).to(tensor.dtype) # update cache length if cache is not None: cache['slen'] += tensor.size(1) # move back sequence length to dimension 0 tensor = tensor.transpose(0, 1) return tensor
def forward(self, x):
    x = F.dropout(F.relu(self.bn1(self.fc1(x))), training=self.training)
    x = F.dropout(F.relu(self.bn2(self.fc2(x))), training=self.training)
    x = self.fc3(x)
    return x
def forward(self, x): N=self.N conv1=self.conv1_1(x) conv1=F.relu(conv1,inplace=True) conv1=self.conv1_2(conv1) conv1=F.relu(conv1,inplace=True) pool1=F.max_pool2d(conv1,(2,2)) conv2=self.conv2_1(pool1) conv2=F.relu(conv2,inplace=True) conv2=self.conv2_2(conv2) conv2=F.relu(conv2,inplace=True) pool2=F.max_pool2d(conv2,(2,2)) conv3=self.conv3_1(pool2) conv3=F.relu(conv3,inplace=True) conv3=self.conv3_2(conv3) conv3=F.relu(conv3,inplace=True) conv3=F.dropout(conv3,p=0.5, training=self.training) pool3=F.max_pool2d(conv3,(2,2)) #D1 conv4_1=self.conv4_1_1(pool3) conv4_1=F.relu(conv4_1,inplace=True) conv4_1=self.conv4_1_2(conv4_1) conv4_1=F.relu(conv4_1,inplace=True) conv4_1=F.dropout(conv4_1,p=0.5,training=self.training) #D2 conv4_2=self.conv4_2_1(conv4_1) conv4_2=F.relu(conv4_2,inplace=True) conv4_2=self.conv4_2_2(conv4_2) conv4_2=F.relu(conv4_2,inplace=True) conv4_2=F.dropout(conv4_2,p=0.5,training=self.training) #D3 merge_dense=torch.cat([conv4_2,conv4_1],dim=1) conv4_3=self.conv4_3_1(merge_dense) conv4_3=F.relu(conv4_3,inplace=True) conv4_3=self.conv4_3_2(conv4_3) conv4_3=F.relu(conv4_3,inplace=True) conv4_3=F.dropout(conv4_3,p=0.5,training=self.training) up6=self.upconv1(conv4_3) up6=self.upconv1_conv(up6) up6=self.BN1(up6) up6=F.relu(up6,inplace=True) x1=torch.reshape(conv3,(1,-1,256,int(N/4),int(N/4))) x2 = torch.reshape(up6, (1,-1, 256, int(N/4), int(N/4))) merge6=torch.cat([x2,x1],dim=0) merge6 = self.convlstm1(merge6) conv6=self.conv6_1(merge6) conv6=F.relu(conv6,inplace=True) conv6=self.conv6_2(conv6) conv6=F.relu(conv6,inplace=True) up7=self.upconv2(conv6) up7=self.upconv2_conv(up7) up7=self.BN2(up7) up7=F.relu(up7,inplace=True) x1 = torch.reshape(conv2, (1, -1, 128, int(N / 2), int(N / 2))) x2 = torch.reshape(up7, (1, -1, 128, int(N / 2), int(N / 2))) merge7 = torch.cat([x2, x1], dim=0) merge7 =self.convlstm2(merge7) conv7 = self.conv7_1(merge7) conv7 = F.relu(conv7,inplace=True) conv7 = self.conv7_2(conv7) conv7 = F.relu(conv7,inplace=True) up8 = self.upconv3(conv7) up8=self.upconv3_conv(up8) up8 = self.BN3(up8) up8 = F.relu(up8,inplace=True) x1 = torch.reshape(conv1, (1, -1, 64, N, N)) x2 = torch.reshape(up8, (1, -1, 64, N, N)) merge8 = torch.cat([x2, x1], dim=0) merge8 = self.convlstm3(merge8) conv8=self.conv8_1(merge8) conv8=F.relu(conv8,inplace=True) conv8 = self.conv8_2(conv8) conv8 = F.relu(conv8,inplace=True) conv8 = self.conv8_3(conv8) conv8 = F.relu(conv8,inplace=True) conv9=self.conv9(conv8) conv9=torch.sigmoid(conv9) return conv9
def forward(self, x, encoder_out, encoder_padding_mask, incremental_state, prev_self_attn_state=None, prev_attn_state=None, self_attn_mask=None, self_attn_padding_mask=None): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. Returns: encoded output of shape `(batch, src_len, embed_dim)` """ residual = x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) if prev_self_attn_state is not None: if incremental_state is None: incremental_state = {} prev_key, prev_value = prev_self_attn_state saved_state = {"prev_key": prev_key, "prev_value": prev_value} self.self_attn._set_input_buffer(incremental_state, saved_state) x, _ = self.self_attn( query=x, key=x, value=x, key_padding_mask=self_attn_padding_mask, incremental_state=incremental_state, need_weights=False, attn_mask=self_attn_mask, ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) attn = None if self.encoder_attn is not None: residual = x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) if prev_attn_state is not None: if incremental_state is None: incremental_state = {} prev_key, prev_value = prev_attn_state saved_state = {"prev_key": prev_key, "prev_value": prev_value} self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=(not self.training and self.need_attn), ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) residual = x x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) x = F.relu(self.fc1(x)) x = F.dropout(x, p=self.relu_dropout, training=self.training) x = self.fc2(x) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) if self.onnx_trace: saved_state = self.self_attn._get_input_buffer(incremental_state) self_attn_state = saved_state["prev_key"], saved_state["prev_value"] return x, attn, self_attn_state return x, attn
def forward(self, input):
    x = self.lin1(input)
    x = self.act(x)
    x = self.lin2(x)
    x = F.dropout(x, p=self.dropout, training=self.training)
    return x
def forward(self, x):
    for linear in self.layers:
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x
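# The hard-coded training=True above keeps dropout stochastic even in eval
# mode; one common reason to do that is Monte Carlo dropout, where several
# stochastic forward passes approximate a predictive mean and uncertainty.
# A minimal sketch assuming that intent (the two-layer model is hypothetical):
import torch
import torch.nn as nn
import torch.nn.functional as F


class MCDropoutNet(nn.Module):
    def __init__(self, d_in=16, d_hidden=32, d_out=4):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(d_in, d_hidden), nn.Linear(d_hidden, d_out)])

    def forward(self, x):
        for linear in self.layers:
            x = F.dropout(F.relu(linear(x)), p=0.5, training=True)  # always stochastic
        return x


if __name__ == '__main__':
    model = MCDropoutNet().eval()
    x = torch.randn(8, 16)
    samples = torch.stack([model(x) for _ in range(50)])  # 50 stochastic passes
    print(samples.mean(0).shape, samples.std(0).shape)    # predictive mean / spread per output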
def forward(self, query, key, value, mask_future_timesteps=False,
            key_padding_mask=None, incremental_state=None,
            need_weights=True, static_kv=False):
    """Input shape: Time x Batch x Channel

    Self-attention can be implemented by passing in the same arguments for
    query, key and value. Future timesteps can be masked with the
    `mask_future_timesteps` argument. Padding elements can be excluded from
    the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
    batch x src_len, where padding elements are indicated by 1s.
    """
    qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
    kv_same = key.data_ptr() == value.data_ptr()

    tgt_len, bsz, embed_dim = query.size()
    if embed_dim != self.embed_dim:
        print("| x: {}, multi_head: {}".format(embed_dim, self.embed_dim))
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]
    assert key.size() == value.size()

    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
        if 'prev_key' in saved_state:
            # previous time steps are cached - no need to recompute
            # key and value if they are static
            if static_kv:
                assert kv_same and not qkv_same
                key = value = None
    else:
        saved_state = None

    if qkv_same:
        # self-attention
        q, k, v = self.in_proj_qkv(query)
    elif kv_same:
        # encoder-decoder attention
        q = self.in_proj_q(query)
        if key is None:
            assert value is None
            # this will allow us to concat it with previous value and get
            # just get the previous value
            k = v = q.new(0)
        else:
            k, v = self.in_proj_kv(key)
    else:
        q = self.in_proj_q(query)
        k = self.in_proj_k(key)
        v = self.in_proj_v(value)
    q *= self.scaling

    if saved_state is not None:
        if 'prev_key' in saved_state:
            k = torch.cat((saved_state['prev_key'], k), dim=0)
        if 'prev_value' in saved_state:
            v = torch.cat((saved_state['prev_value'], v), dim=0)
        saved_state['prev_key'] = k
        saved_state['prev_value'] = v
        self._set_input_buffer(incremental_state, saved_state)

    src_len = k.size(0)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    k = k.contiguous().view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    v = v.contiguous().view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)

    attn_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

    # only apply masking at training time (when incremental state is None)
    if mask_future_timesteps and incremental_state is None:
        assert query.size() == key.size(), \
            'mask_future_timesteps only applies to self-attention'
        attn_weights += self.buffered_mask(attn_weights).unsqueeze(0)

    if key_padding_mask is not None:
        # don't attend to padding symbols
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.float().masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        ).type_as(attn_weights)  # FP16 support: cast to float and back
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

    # Local attention
    pos_diff = torch.abs(
        torch.arange(tgt_len).unsqueeze(1) -
        torch.arange(src_len).unsqueeze(0)).float().cuda()
    variance = self.vars if hasattr(self, 'vars') else None
    local_mask = self.penalty(pos_diff, bsz, variance)
    attn_weights = attn_weights - local_mask

    attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)
    attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)

    attn = torch.bmm(attn_weights, v)
    assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
    attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn = self.out_proj(attn)

    if need_weights:
        # average attention weights over heads
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.sum(dim=1) / self.num_heads
    else:
        attn_weights = None

    return attn, attn_weights
def forward(self, x): x = self.conv1(x) # add_scalar x = x + 3 # mul_scalar x = x * 3 # add_scalar_out x += 3 # mul_scalar_out x *= 3 # add_scalar_relu x = x + 3 x = F.relu(x) # add_scalar_relu_out x += 3 x = F.relu(x) # mul_scalar_relu x = x * 3 x = F.relu(x) # mul_scalar_relu_out x *= 3 x = F.relu(x) x = self.maxpool1d(x) x = self.maxpool2d(x) x = self.maxpool3d(x) x = torch.flatten(x) x = torch.max(x) x = torch.min(x) x = x.reshape([-1]) x = x.resize_(1, 1, x.numel()) x = x.view(-1) # prim::ListConstruct xs = [x, x] # prim::ListUnpack x, y = xs # prim::TupleConstruct xs = (x, x) # prim::TupleUnpack x, y = xs x = x.transpose(1, 2) x = x.contiguous() x, y = torch.chunk(x, 2) x = F.dropout(x) x = self.dropout(x) x, _ = torch.sort(x) x = x.permute(0, 2, 3, 1) x = x.repeat_interleave(3, 1) x = torch.repeat_interleave(x, 3, 1) x = self.relu(x) x = F.relu(x) x = F.relu(x, inplace=True) x = x.relu() x.relu_() x = x.squeeze(0) x.squeeze_(0) x = torch.squeeze(x, 0) x = x.unsqueeze(0) x.unsqueeze_(0) x = torch.unsqueeze(x, 0) x = x.detach() x.detach_() x = x.repeat(4, 2) y = [] y.append(x) z = torch.stack(y, 0) z = [z, z] x, _ = z x = self.conv2(x) return x
def forward(self, x): if self.transform_input: x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 x = torch.cat((x_ch0, x_ch1, x_ch2), 1) # N x 3 x 299 x 299 x = self.Conv2d_1a_3x3(x) # N x 32 x 149 x 149 x = self.Conv2d_2a_3x3(x) # N x 32 x 147 x 147 x = self.Conv2d_2b_3x3(x) # N x 64 x 147 x 147 x = self.MaxPool2b(x) # N x 64 x 73 x 73 x = self.Conv2d_3b_1x1(x) # N x 80 x 73 x 73 x = self.Conv2d_4a_3x3(x) # N x 192 x 71 x 71 x = self.MaxPool4a(x) # N x 192 x 35 x 35 x = self.Mixed_5b(x) # N x 256 x 35 x 35 x = self.Mixed_5c(x) # N x 288 x 35 x 35 x = self.Mixed_5d(x) # N x 288 x 35 x 35 x = self.Mixed_6a(x) # N x 768 x 17 x 17 x = self.Mixed_6b(x) # N x 768 x 17 x 17 x = self.Mixed_6c(x) # N x 768 x 17 x 17 x = self.Mixed_6d(x) # N x 768 x 17 x 17 x = self.Mixed_6e(x) # N x 768 x 17 x 17 aux_defined = self.training and self.aux_logits if aux_defined: aux = self.AuxLogits(x) else: aux = None # N x 768 x 17 x 17 x = self.Mixed_7a(x) # N x 1280 x 8 x 8 x = self.Mixed_7b(x) # N x 2048 x 8 x 8 x = self.Mixed_7c(x) # N x 2048 x 8 x 8 # Adaptive average pooling x = self.AvgPool(x) # N x 2048 x 1 x 1 x = F.dropout(x, training=self.training) # N x 2048 x 1 x 1 x = torch.flatten(x, 1) # N x 2048 x = self.fc(x) # N x 1000 (num_classes) return self.eager_outputs(x, aux)
def forward(self, x):
    out = F.relu(self.fc1(x))
    out = F.dropout(out, self.dropout, training=self.training)
    out = self.fc2(out)
    return out
def forward(self, query, key, value, key_padding_mask=None, incremental_state=None,
            need_weights=True, static_kv=False, attn_mask=None):
    """Input shape: Time x Batch x Channel

    Self-attention can be implemented by passing in the same arguments for
    query, key and value. Timesteps can be masked by supplying a T x T mask in the
    `attn_mask` argument. Padding elements can be excluded from
    the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
    batch x src_len, where padding elements are indicated by 1s.
    """
    qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
    kv_same = key.data_ptr() == value.data_ptr()

    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == self.embed_dim
    assert list(query.size()) == [tgt_len, bsz, embed_dim]
    assert key.size() == value.size()

    if incremental_state is not None:
        saved_state = self._get_input_buffer(incremental_state)
        if 'prev_key' in saved_state:
            # previous time steps are cached - no need to recompute
            # key and value if they are static
            if static_kv:
                assert kv_same and not qkv_same
                key = value = None
    else:
        saved_state = None

    if qkv_same:
        # self-attention
        q, k, v = self.in_proj_qkv(query)
    elif kv_same:
        # encoder-decoder attention
        q = self.in_proj_q(query)
        if key is None:
            assert value is None
            k = v = None
        else:
            k, v = self.in_proj_kv(key)
    else:
        q = self.in_proj_q(query)
        k = self.in_proj_k(key)
        v = self.in_proj_v(value)
    q *= self.scaling

    if self.bias_k is not None:
        assert self.bias_v is not None
        k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
        if key_padding_mask is not None:
            key_padding_mask = torch.cat([
                key_padding_mask,
                key_padding_mask.new_zeros(key_padding_mask.size(0), 1)
            ], dim=1)

    q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    if k is not None:
        k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
    if v is not None:
        v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)

    if saved_state is not None:
        # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
        if 'prev_key' in saved_state:
            prev_key = saved_state['prev_key'].view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                k = prev_key
            else:
                k = torch.cat((prev_key, k), dim=1)
        if 'prev_value' in saved_state:
            prev_value = saved_state['prev_value'].view(bsz * self.num_heads, -1, self.head_dim)
            if static_kv:
                v = prev_value
            else:
                v = torch.cat((prev_value, v), dim=1)
        saved_state['prev_key'] = k.view(bsz, self.num_heads, -1, self.head_dim)
        saved_state['prev_value'] = v.view(bsz, self.num_heads, -1, self.head_dim)
        self._set_input_buffer(incremental_state, saved_state)

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if self.add_zero_attn:
        src_len += 1
        k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
        v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
        if attn_mask is not None:
            attn_mask = torch.cat(
                [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
        if key_padding_mask is not None:
            key_padding_mask = torch.cat([
                key_padding_mask,
                torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)
            ], dim=1)

    attn_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

    if attn_mask is not None:
        attn_mask = attn_mask.unsqueeze(0)
        if self.onnx_trace:
            attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
        attn_weights += attn_mask

    if key_padding_mask is not None:
        # don't attend to padding symbols
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        if self.onnx_trace:
            attn_weights = torch.where(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                torch.Tensor([float("-Inf")]),
                attn_weights.float()).type_as(attn_weights)
        else:
            attn_weights = attn_weights.float().masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf'),
            ).type_as(attn_weights)  # FP16 support: cast to float and back
        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

    attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)
    attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)

    attn = torch.bmm(attn_weights, v)
    assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
    if self.onnx_trace and attn.size(1) == 1:
        # when ONNX tracing a single decoder step (sequence length == 1)
        # the transpose is a no-op copy before view, thus unnecessary
        attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
    else:
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn = self.out_proj(attn)

    if need_weights:
        # average attention weights over heads
        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
        attn_weights = attn_weights.sum(dim=1) / self.num_heads
    else:
        attn_weights = None

    return attn, attn_weights
def forward(self, data, fold_id, mode):
    x, adj = data[:2]
    x_ori = x
    random_walks = data[6]
    random_walks = random_walks.reshape(
        random_walks.size()[0] * random_walks.size()[1],
        random_walks.size()[2], random_walks.size()[3])

    # Node random walk model feature extractor
    node_random_walk_model = self.node_random_walk_models[fold_id]
    if mode == 'train':
        node_random_walk_model.train()
    else:
        node_random_walk_model.eval()
    LSTM_feature_extractor = torch.nn.Sequential(
        *list(node_random_walk_model.children())[:-2])
    LSTM_feature_extractor_output = LSTM_feature_extractor(random_walks)
    x = torch.mean(LSTM_feature_extractor_output[0], dim=1).squeeze()
    x = x.reshape(adj.size()[0], adj.size()[1], x.size()[-1])
    x = readout_function(x, self.readout)
    node_random_walk_model_feature_extractor = torch.nn.Sequential(
        *list(node_random_walk_model.children())[1:-1])
    node_random_walk_model_feature_extractor_output = F.relu(
        node_random_walk_model_feature_extractor(x))

    # Spatial graph embedding model feature extractor
    if self.spatial_graph_embedding_model_name == 'GCN':
        spatial_graph_embedding_model = self.spatial_graph_embedding_models[fold_id]
        if mode == 'train':
            spatial_graph_embedding_model.train()
        else:
            spatial_graph_embedding_model.eval()
        for i in range(self.n_spatial_graph_embedding_model_layer):
            layers_feature_extractor = spatial_graph_embedding_model.graph_convolution_layers[i]
            if i == 0:
                layers_feature_extractor_output = F.relu(
                    layers_feature_extractor(x_ori, adj))
            else:
                layers_feature_extractor_output = F.relu(
                    layers_feature_extractor(layers_feature_extractor_output, adj))
            if i != self.n_spatial_graph_embedding_model_layer - 1:
                layers_feature_extractor_output = F.dropout(
                    layers_feature_extractor_output,
                    p=self.dropout, training=self.training)
        x = readout_function(layers_feature_extractor_output, self.readout)
        spatial_graph_embedding_model_feature_extractor = torch.nn.Sequential(
            *list(spatial_graph_embedding_model.children())[:-1])
        spatial_graph_embedding_model_feature_extractor_output = F.relu(
            spatial_graph_embedding_model_feature_extractor(x))

    # Concat layer
    concat = torch.cat(
        (node_random_walk_model_feature_extractor_output,
         spatial_graph_embedding_model_feature_extractor_output), 1)

    # Linear combination ensemble layer
    x = F.dropout(concat, p=self.concat_dropout, training=self.training)
    if self.fc_layer_type == 'A':
        x = self.fc1(x)
        x = self.fc2(x)
    elif self.fc_layer_type == 'B':
        x = self.fc(x)

    return x
# monitor training loss
torch.cuda.empty_cache()
train_loss = 0.0

###################
# train the model #
###################
for data, target in train_loader:
    if switch_ensemble is True:
        # print("Ensemble:", switch_ensemble)
        weight_fc1 = model.fc1.weight
        weight_fc2 = model.fc2.weight
        # print(weight)
        # print(F.dropout(weight, p=probability)*(1-probability))
        # F.dropout rescales surviving entries by 1/(1-p), so the extra
        # (1-p) factor cancels that and leaves a plain Bernoulli mask
        # over the original weight values
        model.fc1.weight = torch.nn.Parameter(F.dropout(weight_fc1, p=probability) * (1 - probability))
        model.fc2.weight = torch.nn.Parameter(F.dropout(weight_fc2, p=probability) * (1 - probability))
    # clear the gradients of all optimized variables
    optimizer.zero_grad()
    # forward pass: compute predicted outputs by passing inputs to the model
    output = model(data)
    if switch_ensemble is True:
        # restore the original (unmasked) weights after the forward pass
        model.fc1.weight = torch.nn.Parameter(weight_fc1)
        model.fc2.weight = torch.nn.Parameter(weight_fc2)
    # calculate the loss
    loss = criterion(output, target)
    # backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()
    # perform a single optimization step (parameter update)
    optimizer.step()
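The `* (1 - probability)` factor above is easy to misread, so here is a tiny stand-alone sketch of what it does (values are illustrative): PyTorch's F.dropout uses inverted dropout and scales survivors by 1/(1-p) at train time, and multiplying by (1-p) undoes that scaling, leaving the weights either zeroed or at their original magnitude.

import torch
import torch.nn.functional as F

p = 0.5
w = torch.ones(10000)
masked = F.dropout(w, p=p, training=True) * (1 - p)

# survivors are back at their original value of 1.0, the rest are exactly 0,
# and roughly a fraction (1 - p) of the entries survive
print(masked.unique())   # tensor([0., 1.])
print(masked.mean())     # close to (1 - p) = 0.5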
def forward(self, prev_output_tokens, encoder_out_dict):
    encoder_out = encoder_out_dict['encoder']['encoder_out']
    trained_encoder_out = encoder_out_dict['pretrained'] if self.pretrained else None

    encoder_a, encoder_b = self._split_encoder_out(encoder_out)

    # embed positions
    positions = self.embed_positions(prev_output_tokens)

    # embed tokens and positions
    x = self.embed_tokens(prev_output_tokens) + positions
    x = F.dropout(x, p=self.dropout, training=self.training)
    target_embedding = x.transpose(0, 1)

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    avg_attn_scores = None
    for proj, conv, attention, selfattention, attproj in zip(
        self.projections, self.convolutions, self.attention,
        self.selfattention, self.attproj
    ):
        residual = x if proj is None else proj(x)

        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=2)

        # attention
        if attention is not None:
            r = x
            x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
            x = x + r
            if not self.training and self.need_attn:
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

        if selfattention is not None:
            x = selfattention(x)

        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    # project back to size of vocabulary
    x = self.fc2(x)
    x = F.dropout(x, p=self.dropout, training=self.training)
    if not self.pretrained:
        x = self.fc3(x)

    # fusion gating
    if self.pretrained:
        trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
        y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
        gate1 = self.gate1(y)
        gate2 = self.gate2(y)
        gated_x1 = gate1 * x
        gated_x2 = gate2 * self.pretrained_outputs["out"]
        fusion = torch.cat([gated_x1, gated_x2], dim=-1)
        fusion = self.joining(fusion)
        fusion_output = self.fc3(fusion)
        return fusion_output, avg_attn_scores
    else:
        return x, avg_attn_scores
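The fusion-gating branch above combines the current decoder's output with a frozen pretrained decoder's output through two learned gates. Below is an illustrative sketch of that pattern only; the gate modules, dimensions, and the sigmoid non-linearity are assumptions for the sake of the example, not the exact layers defined in the model above.

import torch
import torch.nn as nn

dim = 8
gate1 = nn.Sequential(nn.Linear(2 * dim, dim), nn.Sigmoid())   # hypothetical gate
gate2 = nn.Sequential(nn.Linear(2 * dim, dim), nn.Sigmoid())   # hypothetical gate
joining = nn.Linear(2 * dim, dim)                              # hypothetical joining layer

x = torch.randn(4, 5, dim)               # trained-from-scratch decoder output
pretrained_out = torch.randn(4, 5, dim)  # pretrained decoder output
y = torch.cat([x, pretrained_out], dim=-1)

# each gate weights one stream element-wise, then both are joined
fusion = joining(torch.cat([gate1(y) * x, gate2(y) * pretrained_out], dim=-1))
print(fusion.shape)  # torch.Size([4, 5, 8])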
def extract_features(
    self,
    prev_output_tokens,
    encoder_out: Optional[EncoderOut] = None,
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    full_context_alignment: bool = False,
    alignment_layer: Optional[int] = None,
    alignment_heads: Optional[int] = None,
):
    """
    Similar to *forward* but only return features.

    Includes several features from "Jointly Learning to Align and
    Translate with Transformer Models" (Garg et al., EMNLP 2019).

    Args:
        full_context_alignment (bool, optional): don't apply
            auto-regressive mask to self-attention (default: False).
        alignment_layer (int, optional): return mean alignment over
            heads at this layer (default: last layer).
        alignment_heads (int, optional): only average alignment over
            this many heads (default: all heads).

    Returns:
        tuple:
            - the decoder's features of shape `(batch, tgt_len, embed_dim)`
            - a dictionary with any model-specific outputs
    """
    if alignment_layer is None:
        alignment_layer = self.num_layers - 1

    # embed positions
    positions = (self.embed_positions(prev_output_tokens, incremental_state=incremental_state)
                 if self.embed_positions is not None else None)

    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:]
        if positions is not None:
            positions = positions[:, -1:]

    # embed tokens and positions
    x = self.embed_scale * self.embed_tokens(prev_output_tokens)

    if self.project_in_dim is not None:
        x = self.project_in_dim(x)

    if positions is not None:
        x += positions

    if self.layernorm_embedding is not None:
        x = self.layernorm_embedding(x)

    x = F.dropout(x, p=self.dropout, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    self_attn_padding_mask: Optional[Tensor] = None
    if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
        self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

    # decoder layers
    attn: Optional[Tensor] = None
    inner_states: List[Optional[Tensor]] = [x]
    for idx, layer in enumerate(self.layers):
        encoder_state: Optional[Tensor] = None
        if encoder_out is not None:
            if self.layer_wise_attention:
                encoder_states = encoder_out.encoder_states
                assert encoder_states is not None
                encoder_state = encoder_states[idx]
            else:
                encoder_state = encoder_out.encoder_out

        if incremental_state is None and not full_context_alignment:
            self_attn_mask = self.buffered_future_mask(x)
        else:
            self_attn_mask = None

        # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
        dropout_probability = torch.empty(1).uniform_()
        if not self.training or (dropout_probability > self.decoder_layerdrop):
            x, layer_attn, _ = layer(
                x,
                encoder_state,
                encoder_out.encoder_padding_mask if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask,
                need_attn=bool(idx == alignment_layer),
                need_head_weights=bool(idx == alignment_layer),
            )
            inner_states.append(x)
            if layer_attn is not None and idx == alignment_layer:
                attn = layer_attn.float()

    if attn is not None:
        if alignment_heads is not None:
            attn = attn[:alignment_heads]

        # average probabilities over heads
        attn = attn.mean(dim=0)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    if self.project_out_dim is not None:
        x = self.project_out_dim(x)

    return x, {"attn": [attn], "inner_states": inner_states}
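The LayerDrop check in the loop above applies a whole decoder layer only when a fresh uniform sample exceeds the layerdrop rate (and always at evaluation time). Here is a minimal, self-contained sketch of that mechanism with a toy stack of linear layers; the class and hyperparameters are illustrative, not part of the model above.

import torch
import torch.nn as nn

class TinyLayerDropStack(nn.Module):
    def __init__(self, num_layers=6, dim=16, layerdrop=0.2):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))
        self.layerdrop = layerdrop

    def forward(self, x):
        for layer in self.layers:
            # skip this layer with probability `layerdrop` during training
            dropout_probability = torch.empty(1).uniform_()
            if not self.training or dropout_probability > self.layerdrop:
                x = layer(x)
        return x

stack = TinyLayerDropStack()
stack.train()
out = stack(torch.randn(2, 16))  # a random subset of layers is applied
stack.eval()
out = stack(torch.randn(2, 16))  # every layer is applied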
def forward(self, x):
    out = self.linear(x)
    # no explicit `p` is given, so F.dropout falls back to its default of p=0.5
    out = F.dropout(out, training=self.training)
    return out
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
    """
    Args:
        prev_output_tokens (LongTensor): previous decoder outputs of shape
            `(batch, tgt_len)`, for teacher forcing
        encoder_out (Tensor, optional): output from the encoder, used for
            encoder-side attention
        incremental_state (dict): dictionary used for storing state during
            :ref:`Incremental decoding`

    Returns:
        tuple:
            - the last decoder layer's output of shape `(batch, tgt_len, vocab)`
            - the last decoder layer's attention weights of shape
              `(batch, tgt_len, src_len)`
    """
    # embed positions
    positions = self.embed_positions(
        prev_output_tokens,
        incremental_state=incremental_state,
    ) if self.embed_positions is not None else None

    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:]
        if positions is not None:
            positions = positions[:, -1:]

    # embed tokens and positions
    x = self.embed_scale * self.embed_tokens(prev_output_tokens)

    if self.project_in_dim is not None:
        x = self.project_in_dim(x)

    if positions is not None:
        x += positions
    x = F.dropout(x, p=self.dropout, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)
    attn = None

    inner_states = [x]

    # decoder layers
    for layer in self.layers:
        x, attn = layer(
            x,
            encoder_out['encoder_out'] if encoder_out is not None else None,
            encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
            incremental_state,
        )
        inner_states.append(x)

    if self.normalize:
        x = self.layer_norm(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    if self.project_out_dim is not None:
        x = self.project_out_dim(x)

    if self.adaptive_softmax is None:
        # project back to size of vocabulary
        if self.share_input_output_embed:
            x = F.linear(x, self.embed_tokens.weight)
        else:
            x = F.linear(x, self.embed_out)

    return x, {'attn': attn, 'inner_states': inner_states}
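The `share_input_output_embed` branch above ties the output projection to the input embedding table by passing the embedding weight straight into F.linear. A short illustrative sketch (vocabulary size and dimensions are made up for the example):

import torch
import torch.nn.functional as F

vocab, dim = 100, 16
embed_tokens = torch.nn.Embedding(vocab, dim)

features = torch.randn(2, 7, dim)                 # (batch, tgt_len, embed_dim)
# reuse the embedding matrix as the output projection: logits are a dot
# product between each feature vector and every embedding row
logits = F.linear(features, embed_tokens.weight)  # (batch, tgt_len, vocab)
print(logits.shape)                               # torch.Size([2, 7, 100])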
def forward(self, x):
    new_features = super(_DenseLayer, self).forward(x)
    if self.drop_rate > 0:
        new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
    return torch.cat([x, new_features], 1)
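The final torch.cat above is the dense connectivity of DenseNet: the layer's new feature maps are appended to its input along the channel dimension, so downstream layers see all earlier features. A small shape-only sketch (channel counts are illustrative):

import torch

x = torch.randn(8, 64, 28, 28)             # (N, C, H, W) input to the dense layer
new_features = torch.randn(8, 32, 28, 28)  # stands in for the layer's conv output
out = torch.cat([x, new_features], 1)      # channels grow by the growth rate
print(out.shape)                           # torch.Size([8, 96, 28, 28])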
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused):
    if encoder_out is not None:
        encoder_padding_mask = encoder_out['encoder_padding_mask']
        encoder_out = encoder_out['encoder_out']

        # split and transpose encoder outputs
        encoder_a, encoder_b = self._split_encoder_out(encoder_out, incremental_state)

    if self.embed_positions is not None:
        pos_embed = self.embed_positions(prev_output_tokens, incremental_state)
    else:
        pos_embed = 0

    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:]
    x = self._embed_tokens(prev_output_tokens, incremental_state)

    # embed tokens and combine with positional embeddings
    x += pos_embed
    x = F.dropout(x, p=self.dropout, training=self.training)
    target_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = self._transpose_if_training(x, incremental_state)

    # temporal convolutions
    avg_attn_scores = None
    num_attn_layers = len(self.attention)
    residuals = [x]
    for proj, conv, attention, res_layer in zip(self.projections, self.convolutions,
                                                self.attention, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x, incremental_state)
        x = F.glu(x, dim=2)

        # attention
        if attention is not None:
            x = self._transpose_if_training(x, incremental_state)

            x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b),
                                       encoder_padding_mask)

            if not self.training and self.need_attn:
                attn_scores = attn_scores / num_attn_layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

            x = self._transpose_if_training(x, incremental_state)

        # residual
        if residual is not None:
            x = (x + residual) * math.sqrt(0.5)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = self._transpose_if_training(x, incremental_state)

    # project back to size of vocabulary if not using adaptive softmax
    if self.fc2 is not None and self.fc3 is not None:
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc3(x)

    return x, avg_attn_scores
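Each convolution in the loop above doubles the channel count so that F.glu can halve it again: the gated linear unit splits the channels into two halves (a, b) and returns a * sigmoid(b). A quick shape-only sketch (sizes are illustrative):

import torch
import torch.nn.functional as F

x = torch.randn(10, 4, 2 * 256)  # (T, B, 2 * C), e.g. the output of one conv
y = F.glu(x, dim=2)              # a * sigmoid(b), back to (T, B, C)
print(y.shape)                   # torch.Size([10, 4, 256])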