def forward(self, src_tokens, src_lengths, **kwargs):
    """
    src_tokens: padded tensor (B, T, C * feat)
    src_lengths: tensor of original lengths of input utterances (B,)
    """
    B, T, _ = src_tokens.size()
    x = src_tokens.transpose(1, 2).contiguous()  # (B, feat, T) assuming C == 1

    for layer_idx in range(len(self.conv_layers)):
        x = self.conv_layers[layer_idx](x)
        x = F.glu(x, dim=1)
        x = self.dropouts[layer_idx](x)

    x = x.transpose(1, 2).contiguous()  # (B, T, 908)
    x = self.linear_layers[0](x)
    x = F.glu(x, dim=2)
    x = self.dropouts[-1](x)
    x = self.linear_layers[1](x)

    assert x.size(0) == B
    assert x.size(1) == T

    encoder_out = x.transpose(0, 1)  # (T, B, vocab_size)

    # need to debug this -- find a simpler/elegant way in pytorch APIs
    encoder_padding_mask = (
        torch.arange(T).view(1, T).expand(B, -1).to(x.device)
        >= src_lengths.view(B, 1).expand(-1, T)
    ).t()  # (B x T) -> (T x B)

    return {
        "encoder_out": encoder_out,  # (T, B, vocab_size)
        "encoder_padding_mask": encoder_padding_mask,  # (T, B)
    }
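# The comment above asks for a simpler/more elegant padding mask. A minimal sketch
# relying on broadcasting alone, assuming the same `src_lengths` tensor and the same
# pad-is-True convention used above (positions >= the true length are padding):
import torch

def build_padding_mask(src_lengths, T, device):
    return (torch.arange(T, device=device).unsqueeze(0)
            >= src_lengths.unsqueeze(1)).t()  # (B, T) -> (T, B)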
def forward(self, x):
    z1 = F.glu(self.fc1(x))
    z2 = F.glu(self.fc2(z1))
    out = self.fc3(z2)
    # z3 = F.relu(self.fc3(z2))
    # out = self.fc4(z3)
    return out
def forward(self, g, features):
    x = F.leaky_relu(F.glu(self.layer1(g, features)))
    x = F.leaky_relu(F.glu(self.layer2(g, x)))
    x = F.leaky_relu(F.glu(self.layer3(g, x)))
    x = F.leaky_relu(F.glu(self.layer4(g, x)))
    x = self.layer5(g, x)
    return x
def forward(self, v, t):
    v = F.glu(self.norm1(v))
    t = F.glu(self.norm2(t))
    t = torch.sum(t, dim=1)

    att = v * t.unsqueeze(1).repeat(1, IM_K, 1)
    att = F.softmax(self.norm3(self.drop1(att)), 1)

    v = (att * v).sum(1)
    tv = self.drop2(t * v)
    tv = F.glu(self.norm4(tv))
    return tv, att
def forward(self, x):
    x = F.relu(self.conv1(x))
    x = self.res1(x)
    x = self.pool1(x)
    x = self.res2(x)
    x = self.pool2(x)
    x = self.res3(x)
    x = x.view(-1, 980)
    x = F.glu(self.dense1(x))
    x = self.dropout1(x)
    x = F.glu(self.dense2(x))
    x = self.dropout2(x)
    x = F.log_softmax(self.dense3(x), dim=1)
    return x
def forward(self, x):
    '''
    :param x: (bz, seq_len, dim)
    :return: (bz, dim)
    '''
    bs, seq_len, _ = x.size()
    # seq_range = torch.arange(0, seq_len, device=x.device, dtype=x.dtype)
    # pos_embed = self.pos_embedding(seq_range)
    seq_range = torch.arange(0, seq_len, device=x.device,
                             dtype=torch.long).unsqueeze(0).repeat(bs, 1)
    pos_embed = self.pos_embedding(seq_range)
    x = x + pos_embed
    x = F.dropout(x, p=self.dropout, training=self.training)

    h = self.embed2hn(x)
    # (bz, dim, seq_len)
    conv_in = h.transpose(1, 2).contiguous()

    for conv in self.convs:
        conv_in = F.dropout(conv_in, p=self.dropout, training=self.training)
        conv_out = conv(conv_in)
        conv_out = F.glu(conv_out, dim=1)
        conv_in = (conv_in + conv_out) * self.scale

    # (bs, dim, seq_len) -> (bz, dim, 1) -> (bz, dim)
    conv_out = self.max_pool(conv_in).squeeze(-1)
    # conv_out = F.max_pool1d(conv_in, kernel_size=conv_in.size(-1)).squeeze(-1)
    return self.fc(conv_out)
def forward(self, src_tokens):
    src_lengths = src_tokens.shape[1]
    batch_size = src_tokens.shape[0]

    pos = self.pos_embedding(
        torch.arange(0, src_lengths).unsqueeze(0).repeat(batch_size, 1).to(self.device))
    tok = self.tok_embedding(src_tokens)  # tok = pos = [batch size, src len, emb dim]

    x = self.dropout(tok + pos)  # x = [batch size, src len, emb dim]

    conv_input = self.emb2hid(x)  # conv_input = [batch size, src len, hid dim]
    conv_input = conv_input.permute(0, 2, 1)  # conv_input = [batch size, hid dim, src len]

    for i, conv in enumerate(self.convs):
        conved = conv(self.dropout(conv_input))  # conved = [batch size, 2 * hid dim, src len]
        conved = F.glu(conved, dim=1)  # conved = [batch size, hid dim, src len]
        conved = (conved + conv_input) * self.scale  # conved = [batch size, hid dim, src len]
        conv_input = conved

    conved = self.hid2emb(conved.permute(0, 2, 1))  # conved = [batch size, src len, emb dim]
    combined = (conved + x) * self.scale  # combined = [batch size, src len, emb dim]

    return conved, combined
def forward(self, x): """ Performs conditioning augmentation using the reparametrization trick. Parameters: x (torch.Tensor): vector of shape input_dim, can be used as the output of the text encoder Returns: condition_vector (torch.Tensor of shape condition_dim): the "sampled" version of the text encoding mu (torch.Tensor of shape condition_dim): mean of the text encoding logvar (torch.Tensor of shape condition_dim): variance of the text encoding """ # Compute the mean and stand deviation: pre_conditioning = F.glu(self.fc(x)) mu = pre_conditioning[:, :self.condition_dim] log_var = pre_conditioning[:, self.condition_dim:] std = torch.exp(log_var / 2) # std must be non-zero # reparameterization trick: # multiply the std by a normal distributed noise and add the mean # in order to sample the conditionining without disabling the backpropagation # (cannot propagate through a random node) epsilon = torch.randn_like(std) condition_vector = mu + epsilon * std # Return all of the results return condition_vector, mu, log_var
def forward(self, target, enc_attn, source_seq_out):
    # batch, seq_len_tgt, dim
    inputs = self.embedding(target)
    # batch, seq_len_tgt, hidden
    outputs = self.affine(inputs)
    for i in range(self.layers):
        # batch, hidden, seq_len_tgt
        outputs = outputs.permute(0, 2, 1)
        # batch, 2*hidden, seq_len_tgt
        outputs = self.conv(outputs)
        # This is the residual connection,
        # since the output of the conv adds kernel_size / 2 elements
        # before and after the original input
        if i > 0:
            conv_out = conv_out + outputs
        # batch, hidden, seq_len_tgt
        outputs = F.glu(outputs, dim=1)
        # batch, seq_len_tgt, hidden
        outputs = outputs.transpose(1, 2)
        # A, B: batch, seq_len_tgt, hidden / 2
        A, B = outputs.split(self.hidden_size, 2)
        # A2: batch * seq_len_tgt, hidden / 2
        A2 = A.contiguous().view(-1, A.size(2))
        # B2: batch * seq_len_tgt, hidden / 2
        B2 = B.contiguous().view(-1, B.size(2))
        # attn: batch * seq_len_tgt, hidden / 2
        dec_attn = torch.mul(A2, self.softmax(B2))
        # attn: batch * seq_len_tgt, hidden
        dec_attn2 = self.mapping(dec_attn)
        dec_attn2 = dec_attn2.view(A.size(0), A.size(1), -1)
        # enc_attn1: batch, seq_len_src, hidden_size
        enc_attn = enc_attn.view(A.size(0), -1, A.size(2))
        # dec_attn1: batch, seq_len_tgt, hidden_size
        dec_attn = dec_attn.view(A.size(0), -1, A.size(2))
        # attn_matrix: batch, seq_len_tgt, seq_len_src
        _attn_matrix = torch.bmm(dec_attn, enc_attn.transpose(1, 2))
        attn_matrix = self.softmax(_attn_matrix.view(-1, _attn_matrix.size(2)))
        # normalized attn_matrix: batch, seq_len_tgt, seq_len_src
        attn_matrix = attn_matrix.view(_attn_matrix.size(0), _attn_matrix.size(1), -1)
        # attns: batch, seq_len_tgt, hidden_size
        attns = torch.bmm(attn_matrix, source_seq_out)
        # outputs: batch, seq_len_tgt, hidden_size
        outputs = dec_attn2 + attns
    # outputs: batch, seq_len_tgt, vocab_size
    outputs = F.log_softmax(self.fc(outputs), dim=-1)
    return outputs
def forward(self, xs): """Forward pass. Args: xs (FloatTensor): `[B, T, d_model]` Returns: xs (FloatTensor): `[B, T, d_model]` """ bs, xmax, dim = xs.size() xs = xs.transpose(2, 1).contiguous() # `[B, C, T]` xs = self.pointwise_conv1(xs) # `[B, 2 * C, T]` xs = F.glu(xs, dim=1) # `[B, C, T]` xs = self.depthwise_conv(xs) # `[B, C, T]` if self.causal: xs = xs[:, :, :-self.padding] xs = xs.transpose(2, 1) if isinstance(self.norm, nn.LayerNorm): xs = self.activation(self.norm(xs)) # `[B, T, C]` else: # time-independent normalization xs = xs.contiguous().view(bs * xmax, -1, 1) xs = self.activation(self.norm(xs)) # `[B * T, C, 1]` xs = xs.view(bs, xmax, -1) xs = xs.transpose(2, 1) xs = self.pointwise_conv2(xs) # `[B, C, T]` xs = xs.transpose(2, 1).contiguous() # `[B, T, C]` return xs
def forward(self, x, pad_mask=None):
    x = F.glu(self.input_linear(x))
    T, B, C = x.size()
    K, H = self.kernel_size, self.num_heads

    weight = self.weight_linear(x).view(T, B, H, K)
    if self.weight_softmax:
        weight = F.softmax(weight, dim=-1)
    weight = F.dropout(weight, p=self.weight_dropout, training=self.training)
    # [seq_len x batch_size x heads x kernel_size] -> [batch_size x heads x kernel_size x seq_len]
    weight = weight.permute(1, 2, 3, 0).contiguous()

    if pad_mask is not None:
        x = x.masked_fill(pad_mask, 0)
    x = x.permute(1, 2, 0).contiguous()
    x = dynamic_convolution(x, weight, self.padding_l).permute(2, 0, 1)

    if self.conv_bias is not None:
        x = x + self.conv_bias.view(1, 1, -1)
    x = self.output_linear(x)
    return x
def forward(self, input_tokens, encoder_out):
    # split and transpose encoder outputs
    encoder_a, encoder_b = self._split_encoder_out(encoder_out)

    # embed positions
    positions = self.embed_positions(input_tokens)

    if self._is_incremental_eval:
        # keep only the last token for incremental forward pass
        input_tokens = input_tokens[:, -1:]

    # embed tokens and positions
    x = self.embed_tokens(input_tokens) + positions
    x = F.dropout(x, p=self.dropout, training=self.training)
    target_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = self._transpose_unless_incremental_eval(x)

    # temporal convolutions
    avg_attn_scores = None
    num_attn_layers = len(self.attention)
    for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
        residual = x if proj is None else proj(x)

        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = conv.remove_future_timesteps(x)
        x = F.glu(x, dim=2)

        # attention
        if attention is not None:
            x = self._transpose_unless_incremental_eval(x)

            x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b))
            attn_scores = attn_scores / num_attn_layers
            if avg_attn_scores is None:
                avg_attn_scores = attn_scores
            else:
                avg_attn_scores.add_(attn_scores)

            x = self._transpose_unless_incremental_eval(x)

        # residual
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = self._transpose_unless_incremental_eval(x)

    # project back to size of vocabulary
    x = self.fc2(x)
    x = F.dropout(x, p=self.dropout, training=self.training)
    x = self.fc3(x)

    return x, avg_attn_scores
def forward(self, x, pad_mask=None): """ :param pad_mask: [seq_len x bsz] indicating which element is correct (this should be the same with the attention mask (pad=1, unpad=0) :param x: [seq_len x bsz x hidden_size] :return: """ x = x.transpose(0, 1).transpose(1, 2) # to [bsz x hidden_size x seq_len] # pointwise conv does not need to mask because its elementwise projection x = self.pointwise_conv1(x) x = F.glu(x, dim=1) if pad_mask is not None: pad_mask = pad_mask.transpose(0, 1).transpose(1, 2) # print(x.size(), pad_mask.size()) x = x.masked_fill_(pad_mask, 0) x = self.depthwise_conv(x) x = self.activation(x) x = self.pointwise_conv2(x) # x = F.conv1d(x, self.in_pointwise_weight, self.in_pointwise_bias, 1, 0, 1, 1) # x = F.glu(x, dim=1) # # x = F.conv1d(x, self.depthwise_weight, self.depthwise_bias, 1, self.padding, 1, self.groups) # x = self.activation(x) # # x = F.conv1d(x, self.out_pointwise_weight, self.out_pointwise_bias, 1, 0, 1, 1) x = x.transpose(1, 2).transpose(0, 1) # back to [seq_len x bsz x hidden_size] return x
def forward(self, xs): """Forward pass. Args: xs (FloatTensor): `[B, T, d_model]` Returns: xs (FloatTensor): `[B, T, d_model]` """ B, T, d_model = xs.size() assert d_model == self.d_model xs = xs.transpose(2, 1).contiguous() # `[B, C, T]` xs = self.pointwise_conv1(xs) # `[B, 2 * C, T]` xs = xs.transpose(2, 1) # `[B, T, 2 * C]` xs = F.glu(xs) # `[B, T, C]` xs = xs.transpose(2, 1).contiguous() # `[B, C, T]` xs = self.depthwise_conv(xs) # `[B, C, T]` xs = self.batch_norm(xs) xs = self.activation(xs) xs = self.pointwise_conv2(xs) # `[B, C, T]` xs = xs.transpose(2, 1).contiguous() # `[B, T, C]` return xs
def forward(self, x, mask=None, right_context=0):
    """
    Args:
        x: [batch_size, time, channels]
        mask: [batch_size, time]
    """
    if mask is not None:
        mask = mask.unsqueeze(2).repeat([1, 1, x.size(-1)])

    x = self.pointwise_conv1(x)
    x = F.glu(x)
    if mask is not None:
        x.masked_fill_(~mask, 0.0)

    if right_context == 0:
        right_context = self.right_context
    x = F.pad(x, pad=(0, 0, self.kernel_size - right_context - 1, right_context),
              value=0.0).transpose(1, 2)
    x = self.depthwise_conv(x)
    x = self.batch_norm(x)
    x = x * torch.sigmoid(x)  # swish
    x = x.transpose(1, 2)

    x = self.pointwise_conv2(x)
    if mask is not None:
        x.masked_fill_(~mask, 0.0)
    return x
def conv_cap(self, x, wordemb, imgsfeats):
    for i, conv in enumerate(self.convs):
        if (i == 0):
            x = x.transpose(2, 1)
            residual = self.resproj(x)
            residual = residual.transpose(2, 1)
            x = x.transpose(2, 1)
        else:
            residual = x

        x = F.dropout(x, p=self.dropout, training=self.training)

        x = conv(x)
        x = x[:, :, :-self.pad]

        x = F.glu(x, dim=1)

        if (self.is_attention and i % 2 == 0):
            attn = self.attention[int(i / 2)]
            x = x.transpose(2, 1)
            x = attn(x, wordemb, imgsfeats)
            x = F.relu(x.transpose(2, 1))

        x = (x + residual) * math.sqrt(.5)

    return x
def _forward(self, x, is_incremental):
    """Forward

    Args:
        x: (B, in_channels, T)
    Returns:
        (B, out_channels, T)
    """
    residual = x
    x = F.dropout(x, p=self.dropout, training=self.training)
    if is_incremental:
        splitdim = -1
        x = self.conv.incremental_forward(x)
    else:
        splitdim = 1
        x = self.conv(x)
        # remove future time steps
        x = x[:, :, :residual.size(-1)] if self.causal else x

    if self.glu:
        x = F.glu(x, dim=splitdim)
        return (x + residual) * math.sqrt(0.5)
    else:
        a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
        T = torch.sigmoid(b)
        return (T * a + (1 - T) * residual)
def forward(self, tokens, positions):
    # embed tokens and positions
    x = self.embed_tokens(tokens) + self.embed_positions(positions)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=-1)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = grad_multiply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, x, futur_mask=None, y=None):
    # [batch_size, seq_len, d_model]
    x_pad = x
    if self.only_see_past:
        x_pad = F.pad(x, (0, 0, self.kernel_size - 1, 0, 0, 0))
        futur_mask = (torch.triu(torch.ones(
            x.size(1), x.size(1) if y is None else y.size(1)), diagonal=1) == 0).to(x.device)

    x_pad = x_pad.permute(0, 2, 1)
    conved = self.conv(x_pad)  # [batch_size, 2 * hid_dim, seq_len]
    conved = F.glu(conved, dim=1)  # [batch_size, hid_dim, seq_len]
    conved = conved.permute(0, 2, 1)
    conved = conved + x  # residual connection

    if self.self_attn:
        if y is not None:
            self_attn = self.attn_norm(self.attn(conved, y, y))
        else:
            self_attn = self.attn_norm(self.attn(conved, conved, conved, mask=futur_mask))
        return self.feed_forward(conved + self_attn)

    return conved
def forward(self, x):
    batch_size = x.size()[0]

    x = self.linear(x)
    x = F.glu(x)
    x = x.reshape(batch_size, 1024, 4, 4)

    # 2x2 nearest neighbour upsampling
    x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
    x = F.glu(x, dim=1)
    x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
    x = self.conv2(x)
    x = F.glu(x, dim=1)
    x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
    x = F.glu(x, dim=1)

    x = self.last_conv(x)
    x = self.activ_out(x)
    return x
def forward(self, input):
    batch_size = input.shape[0]

    conv_input = self.emb2hid(input)  # assumes `input` is already the embedded representation
    conv_input = conv_input.permute(0, 2, 1)  # conv_input = [batch size, hidden_size, 1]
    conv_input = self.expand(conv_input)  # conv_input = [batch size, hidden_size, output_size]

    batch_size = conv_input.shape[0]
    hid_dim = conv_input.shape[1]

    for i, conv in enumerate(self.convs):
        conv_input = self.dropout(conv_input)

        # zero padding
        padding = torch.zeros(batch_size, hid_dim,
                              self.kernel_size - 1).to(conv_input.device)
        padded_conv_input = torch.cat((padding, conv_input), dim=2)
        # padded_conv_input = [batch size, hidden_size, num_seq+2 + kernel size - 1]

        # pass through convolutional layer
        conved = conv(padded_conv_input)  # conved = [batch size, 2 * hidden_size, num_seq+2]

        conved = F.glu(conved, dim=1)  # conved = [batch size, hidden_size, num_seq+2]

        # apply residual connection
        conved = (conved + conv_input) * self.scale
        conv_input = conved

    output = self.fc_out(self.dropout(conved.permute(0, 2, 1)))
    # output = [batch size, self.output_size, self.exp_size]

    return output.permute(0, 2, 1)
def forward(self, x): """ :param x: [seq_len x bsz x hidden_size] :return: """ x = x.transpose(0, 1).transpose(1, 2) # to [bsz x hidden_size x seq_len] x = self.pointwise_conv1(x) x = F.glu(x, dim=1) x = self.depthwise_conv(x) x = self.activation(self.norm(x)) x = self.pointwise_conv2(x) # x = F.conv1d(x, self.in_pointwise_weight, self.in_pointwise_bias, 1, 0, 1, 1) # x = F.glu(x, dim=1) # # x = F.conv1d(x, self.depthwise_weight, self.depthwise_bias, 1, self.padding, 1, self.groups) # x = self.activation(x) # # x = F.conv1d(x, self.out_pointwise_weight, self.out_pointwise_bias, 1, 0, 1, 1) x = x.transpose(1, 2).transpose( 0, 1) # back to [seq_len x bsz x hidden_size] return x
def forward(self, protein):
    # protein = [batch size, protein len, protein_dim]
    conv_input = self.fc(protein)
    # conv_input = [batch size, protein len, hid dim]

    # permute for convolutional layer
    conv_input = conv_input.permute(0, 2, 1)
    # conv_input = [batch size, hid dim, protein len]

    for i, conv in enumerate(self.convs):
        # pass through convolutional layer
        conved = conv(self.dropout(conv_input))
        # conved = [batch size, 2*hid dim, protein len]

        # pass through GLU activation function
        conved = F.glu(conved, dim=1)
        # conved = [batch size, hid dim, protein len]

        # apply residual connection / highway
        conved = (conved + conv_input) * self.scale
        # conved = [batch size, hid dim, protein len]

        # set conv_input to conved for next loop iteration
        conv_input = conved

    conved = conved.permute(0, 2, 1)
    # conved = [batch size, protein len, hid dim]
    conved = self.ln(conved)
    return conved
def forward(self, imgsfeats, imgsfc7, wordclass):
    attn_buffer = None
    wordemb = self.emb_0(wordclass)
    # wordemb = self.emb_1(wordemb)
    x = wordemb.transpose(2, 1)  # (100L, 512L, 15L)
    batchsize, wordembdim, maxtokens = x.size()

    y = F.relu(self.imgproj(imgsfc7))
    y = y.unsqueeze(2).expand(batchsize, self.nfeats, maxtokens)  # (100L, 512L, 15L)

    x = torch.cat([x, y], 1)  # concat emb and img: (100L, 1024L, 15L)

    for i, conv in enumerate(self.convs):
        if (i == 0):
            x = x.transpose(2, 1)  # (100L, 15L, 1024L)
            residual = self.resproj(x)  # (100L, 15L, 1024L)
            residual = residual.transpose(2, 1)  # (100L, 512L, 15L)
            x = x.transpose(2, 1)
        else:
            residual = x

        x = F.dropout(x, p=self.dropout, training=self.training)
        # layer input: (100L, 1024L, 15L) in layer 0, (100L, 512L, 15L) in layers 1 and 2

        x = conv(x)  # (100L, 1024L, 19L)
        x = x[:, :, :-self.pad]  # (100L, 1024L, 15L)

        x = F.glu(x, dim=1)  # (100L, 512L, 15L)

        if (self.is_attention):
            attn = self.attention[i]
            x = x.transpose(2, 1)
            x, attn_buffer = attn(x, wordemb, imgsfeats)
            x = x.transpose(2, 1)

        x = (x + residual)  # * math.sqrt(.5)  -> (100L, 512L, 15L)

    x = x.transpose(2, 1)  # out of conv: (100L, 15L, 512L)
    x = self.classifier(x)  # (100L, 9221L, 15L)
    x = x.transpose(2, 1)

    return x, attn_buffer
def forward(self, input):
    residual = input
    output = self.residual_layer_1(input)
    output = F.glu(output, dim=1)
    output = self.residual_layer_2(output)
    output += residual
    return output
def forward(self, xs, xlens):
    """Forward computation.

    Args:
        xs (FloatTensor): `[B, T, input_dim (+Δ, ΔΔ)]`
        xlens (list): A list of length `[B]`
    Returns:
        xs (FloatTensor): `[B, T', out_ch * feat_dim]`
        xlens (list): A list of length `[B]`

    """
    bs, time, input_dim = xs.size()
    xs = xs.transpose(2, 1).unsqueeze(3)  # `[B, in_ch (input_dim), T, 1]`

    xs = self.layers(xs)  # `[B, out_ch, T, 1]`
    bs, out_ch, time, freq = xs.size()
    xs = xs.transpose(2, 1).contiguous().view(bs, time, -1)  # `[B, T, out_ch * feat_dim]`

    # weight normalization + GLU for the last fully-connected layer
    xs = F.glu(self.fc_glu(xs), dim=2)

    # Bridge layer
    if self.bridge is not None:
        xs = self.bridge(xs)

    # NOTE: no subsampling is conducted
    return xs, xlens
def forward(self, trg, encoder_conved, encoder_combined):
    """
    Get output and attention

    :param trg: trg = [batch size, trg len]
    :param encoder_conved: encoder_conved = encoder_combined = [batch size, src len, emb dim]
    :param encoder_combined:
    :return:
    """
    batch_size = trg.shape[0]
    trg_len = trg.shape[1]

    # create position tensor
    pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
    # pos = [batch size, trg len]

    # embed tokens and positions
    tok_embedded = self.tok_embedding(trg)
    pos_embedded = self.pos_embedding(pos)
    # tok_embedded = [batch size, trg len, emb dim]
    # pos_embedded = [batch size, trg len, emb dim]

    # combine embeddings by elementwise summing
    embedded = self.dropout(tok_embedded + pos_embedded)
    # embedded = [batch size, trg len, emb dim]

    # pass embedded through linear layer to go from emb dim -> hid dim
    conv_input = self.emb2hid(embedded)
    # conv_input = [batch size, trg len, hid dim]

    # permute for convolutional layer
    conv_input = conv_input.permute(0, 2, 1)
    # conv_input = [batch size, hid dim, trg len]

    batch_size = conv_input.shape[0]
    hid_dim = conv_input.shape[1]

    for i, conv in enumerate(self.convs):
        # apply dropout
        conv_input = self.dropout(conv_input)

        # need to pad so decoder can't "cheat"
        padding = torch.zeros(batch_size, hid_dim,
                              self.kernel_size - 1).fill_(self.trg_pad_idx).to(self.device)
        padded_conv_input = torch.cat((padding, conv_input), dim=2)
        # padded_conv_input = [batch size, hid dim, trg len + kernel size - 1]

        # pass through convolutional layer
        conved = conv(padded_conv_input)
        # conved = [batch size, 2 * hid dim, trg len]

        # pass through GLU activation function
        conved = F.glu(conved, dim=1)
        # conved = [batch size, hid dim, trg len]

        # calculate attention
        attention, conved = self.calculate_attention(embedded, conved,
                                                     encoder_conved, encoder_combined)
        # attention = [batch size, trg len, src len]

        # apply residual connection
        conved = (conved + conv_input) * self.scale
        # conved = [batch size, hid dim, trg len]

        # set conv_input to conved for next loop iteration
        conv_input = conved

    conved = self.hid2emb(conved.permute(0, 2, 1))
    # conved = [batch size, trg len, emb dim]

    output = self.fc_out(self.dropout(conved))
    # output = [batch size, trg len, output dim]

    return output, attention
def forward(self, protein):
    # pos = torch.arange(0, protein.shape[1]).unsqueeze(0).repeat(protein.shape[0], 1).to(self.device)
    # protein = protein + self.pos_embedding(pos)

    # protein = [batch size, protein len, protein_dim]
    conv_input = self.fc(protein)
    # conv_input = [batch size, protein len, hid dim]

    # permute for convolutional layer
    conv_input = conv_input.permute(0, 2, 1)
    # conv_input = [batch size, hid dim, protein len]

    for i, conv in enumerate(self.convs):
        # pass through convolutional layer
        conved = conv(self.dropout(conv_input))
        # conved = [batch size, 2*hid dim, protein len]

        # pass through GLU activation function
        conved = F.glu(conved, dim=1)
        # conved = [batch size, hid dim, protein len]

        # apply residual connection / highway
        conved = (conved + conv_input) * self.scale
        # conved = [batch size, hid dim, protein len]

        # set conv_input to conved for next loop iteration
        conv_input = conved

    conved = conved.permute(0, 2, 1)
    # conved = [batch size, protein len, hid dim]
    return conved
def forward(self, xs): """Forward pass. Args: xs (FloatTensor): `[B, T, d_model]` Returns: xs (FloatTensor): `[B, T, d_model]` """ xs = xs.transpose(2, 1).contiguous() # `[B, C, T]` if self.causal is not None: xs = self.causal(xs) xs = xs[:, :, :-(self.kernel_size - 1)] xs = self.pointwise_conv1(xs) # `[B, 2 * C, T]` xs = xs.transpose(2, 1) # `[B, T, 2 * C]` xs = F.glu(xs) # `[B, T, C]` xs = xs.transpose(2, 1).contiguous() # `[B, C, T]` xs = self.depthwise_conv(xs) # `[B, C, T]` xs = self.norm(xs) xs = self.activation(xs) xs = self.pointwise_conv2(xs) # `[B, C, T]` xs = xs.transpose(2, 1).contiguous() # `[B, T, C]` return xs
def forward(self, src): """ src: [B, L] """ B, L = src.shape mask = (src != PAD_idx).long() # padded pos_tokens pos_tokens = mask.cumsum(dim=1) * mask emb = self.embedding(src) # [B, L, emb_dim] pos_emb = self.pos_embedding(pos_tokens) # [B, L, emb_dim] emb = self.dropout(emb + pos_emb) out = self.emb2hid(emb).permute(0, 2, 1) # [B, h_dim, L] for conv in self.convs: skip_con = out out = out.masked_fill(mask.unsqueeze(1) == 0, 0.) out = conv(self.dropout(out)) # [B, h_dim*2, L] out = F.glu(out, dim=1) # [B, h_dim, L] # residual connection out = (out + skip_con) * self.scale # [B, h_dim, L] out = self.hid2emb(out.permute(0, 2, 1)) # encoder out z. [B, L, emb_dim] out = GradMultiply.apply(out, 1.0 / (2.0 * self.n_layers)) attn_value = (out + emb) * self.scale # attention value (z+e) return out, attn_value, mask
def forward(self, x):
    x1 = self.fc(x)
    if self.add_batch_norm:
        x1 = self.batch_norm(x1)
    x = th.cat((x, x1), 1)
    return F.glu(x, 1)
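# For reference, F.glu splits its input in half along the given dimension and
# multiplies the first half by the sigmoid of the second, so the concatenation above
# reduces to context gating of x. A small self-contained check (tensor sizes are
# arbitrary, chosen only for illustration):
import torch
import torch.nn.functional as F

x = torch.randn(2, 8)
x1 = torch.randn(2, 8)  # stands in for self.fc(x) (+ optional batch norm)
gated = F.glu(torch.cat((x, x1), dim=1), dim=1)
assert torch.allclose(gated, x * torch.sigmoid(x1))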
def _forward(self, input_tokens, positions, encoder_out):
    # split and transpose encoder outputs
    encoder_a, encoder_b = self._split_encoder_out(encoder_out)

    # embed tokens and positions
    x = self.embed_tokens(input_tokens) + self.embed_positions(positions)
    x = F.dropout(x, p=self.dropout, training=self.training)
    target_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = self._transpose_unless_incremental_eval(x)

    # temporal convolutions
    avg_attn_scores = None
    num_attn_layers = len(self.attention)
    for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
        residual = x if proj is None else proj(x)

        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = conv.remove_future_timesteps(x)
        x = F.glu(x)

        # attention
        if attention is not None:
            x = self._transpose_unless_incremental_eval(x)

            x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b))
            attn_scores = attn_scores / num_attn_layers
            if avg_attn_scores is None:
                avg_attn_scores = attn_scores
            else:
                avg_attn_scores.add_(attn_scores)

            x = self._transpose_unless_incremental_eval(x)

        # residual
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = self._transpose_unless_incremental_eval(x)

    # project back to size of vocabulary
    x = self.fc2(x)
    x = F.dropout(x, p=self.dropout, training=self.training)
    x = self.fc3(x)

    return x, avg_attn_scores
def forward(self, src_tokens, src_lengths):
    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x.transpose(0, 1)

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
        residual = x if proj is None else proj(x)

        x = F.dropout(x, p=self.dropout, training=self.training)
        padding_l = (conv.kernel_size[0] - 1) // 2
        padding_r = conv.kernel_size[0] // 2
        x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
        x = conv(x)
        x = F.glu(x, dim=2)
        if attention is not None:
            x = attention(x)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

    return {
        'encoder_out': (x, y),
    }
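# A reminder of how the padding above behaves: F.pad reads its pad tuple from the last
# dimension backwards, so (0, 0, 0, 0, padding_l, padding_r) leaves the C and B
# dimensions alone and pads the leading time dimension. A tiny check with arbitrary sizes:
import torch
import torch.nn.functional as F

x = torch.randn(7, 2, 16)              # T x B x C
padded = F.pad(x, (0, 0, 0, 0, 2, 1))  # pad time: 2 before, 1 after
assert padded.shape == (7 + 2 + 1, 2, 16)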
def forward(self, src_tokens):
    positions = Variable(make_positions(src_tokens.data, self.dictionary.pad(),
                                        left_pad=LanguagePairDataset.LEFT_PAD_SOURCE))

    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(positions)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=-1)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, prev_output_tokens, encoder_out_dict):
    encoder_out = encoder_out_dict['encoder']['encoder_out']
    trained_encoder_out = encoder_out_dict['pretrained'] if self.pretrained else None

    encoder_a, encoder_b = self._split_encoder_out(encoder_out)

    # embed positions
    positions = self.embed_positions(prev_output_tokens)

    # embed tokens and positions
    x = self.embed_tokens(prev_output_tokens) + positions
    x = F.dropout(x, p=self.dropout, training=self.training)
    target_embedding = x.transpose(0, 1)

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    avg_attn_scores = None
    for proj, conv, attention, selfattention, attproj in zip(
        self.projections, self.convolutions, self.attention, self.selfattention, self.attproj
    ):
        residual = x if proj is None else proj(x)

        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=2)

        # attention
        if attention is not None:
            r = x
            x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
            x = x + r
            if not self.training and self.need_attn:
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

        if selfattention is not None:
            x = selfattention(x)

        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    # project back to size of vocabulary
    x = self.fc2(x)
    x = F.dropout(x, p=self.dropout, training=self.training)
    if not self.pretrained:
        x = self.fc3(x)

    # fusion gating
    if self.pretrained:
        trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
        y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
        gate1 = self.gate1(y)
        gate2 = self.gate2(y)
        gated_x1 = gate1 * x
        gated_x2 = gate2 * self.pretrained_outputs["out"]
        fusion = torch.cat([gated_x1, gated_x2], dim=-1)
        fusion = self.joining(fusion)
        fusion_output = self.fc3(fusion)
        return fusion_output, avg_attn_scores
    else:
        return x, avg_attn_scores