Example #1
    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (B,)
        """
        B, T, _ = src_tokens.size()
        x = src_tokens.transpose(
            1, 2).contiguous()  # (B, feat, T) assuming C == 1

        for layer_idx in range(len(self.conv_layers)):
            x = self.conv_layers[layer_idx](x)
            x = F.glu(x, dim=1)
            x = self.dropouts[layer_idx](x)

        x = x.transpose(1, 2).contiguous()  # (B, T, 908)
        x = self.linear_layers[0](x)
        x = F.glu(x, dim=2)
        x = self.dropouts[-1](x)
        x = self.linear_layers[1](x)

        assert x.size(0) == B
        assert x.size(1) == T

        encoder_out = x.transpose(0, 1)  # (T, B, vocab_size)

        # need to debug this -- find a simpler/elegant way in pytorch APIs
        encoder_padding_mask = (torch.arange(T).view(
            1, T).expand(B, -1).to(x.device) >= src_lengths.view(B, 1).expand(
                -1, T)).t()  # (B x T) -> (T x B)

        return {
            "encoder_out": encoder_out,  # (T, B, vocab_size)
            "encoder_padding_mask": encoder_padding_mask,  # (T, B)
        }
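F.glu splits its input in half along the given dimension and gates the first half with the sigmoid of the second. The padding-mask construction above (flagged by the author as something to simplify) can also be written with plain broadcasting; a minimal, hedged sketch with dummy shapes:

import torch
import torch.nn.functional as F

# GLU identity: split in two along `dim`, gate the first half with sigmoid of the second.
a = torch.randn(2, 8, 5)                                  # (B, 2 * feat, T)
h, g = a.chunk(2, dim=1)
assert torch.allclose(F.glu(a, dim=1), h * torch.sigmoid(g))

# Broadcast-based padding mask equivalent to the expand/view version above
# (dummy B, T and src_lengths; only the shapes follow the encoder).
B, T = 3, 7
src_lengths = torch.tensor([7, 5, 2])
mask = torch.arange(T)[None, :] >= src_lengths[:, None]   # (B, T), True at padded steps
mask = mask.t()                                           # (T, B)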
Example #2
 def forward(self, x):
     z1 = F.glu(self.fc1(x))
     z2 = F.glu(self.fc2(z1))
     out = self.fc3(z2)
     #         z3 = F.relu(self.fc3(z2))
     #         out = self.fc4(z3)
     return out
Example #3
    def forward(self, g, features):
        x = F.leaky_relu(F.glu(self.layer1(g, features)))
        x = F.leaky_relu(F.glu(self.layer2(g, x)))
        x = F.leaky_relu(F.glu(self.layer3(g, x)))
        x = F.leaky_relu(F.glu(self.layer4(g, x)))
        x = self.layer5(g, x)

        return x
Example #4
 def forward(self, v, t):
     v = F.glu(self.norm1(v))
     t = F.glu(self.norm2(t))
     t = torch.sum(t, dim=1)
     att = v * t.unsqueeze(1).repeat(1, IM_K, 1)
     att = F.softmax(self.norm3(self.drop1(att)), 1)
     v = (att * v).sum(1)
     tv = self.drop2(t * v)
     tv = F.glu(self.norm4(tv))
     return tv, att
Example #5
 def forward(self, x):
     x = F.relu(self.conv1(x))
     x = self.res1(x)
     x = self.pool1(x)
     x = self.res2(x)
     x = self.pool2(x)
     x = self.res3(x)
     x = x.view(-1, 980)
     x = F.glu(self.dense1(x))
     x = self.dropout1(x)
     x = F.glu(self.dense2(x))
     x = self.dropout2(x)
     x = F.log_softmax(self.dense3(x), dim=1)  # explicit dim; implicit dim is deprecated
     return x
Example #6
    def forward(self, x):
        '''
        :param x: (bz, seq_len, dim)
        :return: (bz, dim)
        '''

        bs, seq_len, _ = x.size()
        # seq_range = torch.arange(0, seq_len, device=x.device, dtype=x.dtype)
        # pos_embed = self.pos_embedding(seq_range)
        seq_range = torch.arange(0, seq_len, device=x.device, dtype=torch.long).unsqueeze(0).repeat(bs, 1)
        pos_embed = self.pos_embedding(seq_range)
        x = x + pos_embed
        x = F.dropout(x, p=self.dropout, training=self.training)
        h = self.embed2hn(x)
        # (bz, dim, seq_len)
        conv_in = h.transpose(1, 2).contiguous()
        for conv in self.convs:
            conv_in = F.dropout(conv_in, p=self.dropout, training=self.training)
            conv_out = conv(conv_in)
            conv_out = F.glu(conv_out, dim=1)
            conv_in = (conv_in + conv_out) * self.scale

        # (bs, dim, seq_len) -> (bz, dim, 1) -> (bz, dim)
        conv_out = self.max_pool(conv_in).squeeze(-1)
        # conv_out = F.max_pool1d(conv_in, kernel_size=conv_in.size(-1)).squeeze(-1)
        return self.fc(conv_out)
Example #7
    def forward(self, src_tokens):

        src_lengths = src_tokens.shape[1]
        batch_size = src_tokens.shape[0]

        pos = self.pos_embedding(
            torch.arange(0,
                         src_lengths).unsqueeze(0).repeat(batch_size,
                                                          1).to(self.device))
        tok = self.tok_embedding(
            src_tokens)  #tok = pos = [batch size, src len, emb dim]

        x = self.dropout(tok + pos)  #x = [batch size, src len, emb dim]

        conv_input = self.emb2hid(
            x)  #conv_input = [batch size, src len, hid dim]
        conv_input = conv_input.permute(
            0, 2, 1)  #conv_input = [batch size, hid dim, src len]

        for i, conv in enumerate(self.convs):
            conved = conv(self.dropout(
                conv_input))  #conved = [batch size, 2 * hid dim, src len]
            conved = F.glu(conved,
                           dim=1)  #conved = [batch size, hid dim, src len]
            conved = (conved + conv_input
                      ) * self.scale  #conved = [batch size, hid dim, src len]
            conv_input = conved

        conved = self.hid2emb(conved.permute(
            0, 2, 1))  #conved = [batch size, src len, emb dim]
        combined = (conved +
                    x) * self.scale  #combined = [batch size, src len, emb dim]

        return conved, combined
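The loop above expects each convolution to return twice the hidden size (GLU halves it back) while preserving the source length for the residual add. A minimal sketch of a convolution stack that satisfies those shapes, with hid_dim, kernel_size and n_layers as assumed values (an odd kernel size is assumed so the 'same' padding works out):

import torch.nn as nn

hid_dim, kernel_size, n_layers = 512, 3, 10   # assumed values
convs = nn.ModuleList([
    nn.Conv1d(in_channels=hid_dim,
              out_channels=2 * hid_dim,        # doubled so that F.glu(dim=1) restores hid_dim
              kernel_size=kernel_size,
              padding=(kernel_size - 1) // 2)  # preserves src len for the residual connection
    for _ in range(n_layers)
])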
Example #8
    def forward(self, x):
        """ Performs conditioning augmentation using the reparametrization trick.
        Parameters:
            x (torch.Tensor): vector of shape input_dim, can be used as the output of the text encoder

        Returns:
            condition_vector (torch.Tensor of shape condition_dim):
                the "sampled" version of the text encoding
            mu (torch.Tensor of shape condition_dim): mean of the text encoding
            log_var (torch.Tensor of shape condition_dim): log-variance of the text encoding
        """
        # Compute the mean and standard deviation:
        pre_conditioning = F.glu(self.fc(x))
        mu = pre_conditioning[:, :self.condition_dim]
        log_var = pre_conditioning[:, self.condition_dim:]
        std = torch.exp(log_var / 2)  # std must be non-zero

        # reparameterization trick:
        # multiply the std by normally distributed noise and add the mean
        # in order to sample the conditioning without breaking backpropagation
        # (cannot propagate through a random node)
        epsilon = torch.randn_like(std)
        condition_vector = mu + epsilon * std

        # Return all of the results
        return condition_vector, mu, log_var
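Since F.glu halves the last dimension and the result is then split into mu and log_var, self.fc has to produce 4 * condition_dim features. A minimal sketch of that sizing with assumed dimensions (input_dim and condition_dim are placeholders, not taken from the source):

import torch
import torch.nn as nn
import torch.nn.functional as F

input_dim, condition_dim = 256, 128
fc = nn.Linear(input_dim, 4 * condition_dim)   # GLU halves this to 2 * condition_dim

x = torch.randn(8, input_dim)                  # (batch, input_dim)
pre = F.glu(fc(x), dim=-1)                     # (batch, 2 * condition_dim)
mu, log_var = pre[:, :condition_dim], pre[:, condition_dim:]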
Example #9
    def forward(self, target, enc_attn, source_seq_out):
        # batch, seq_len_tgt, dim
        inputs = self.embedding(target)
        # batch, seq_len_tgt, hidden
        outputs = self.affine(inputs)

        for i in range(self.layers):
            # batch, hidden, seq_len_tgt
            outputs = outputs.permute(0, 2, 1)
            # batch, 2*hidden, seq_len_tgt
            outputs = self.conv(outputs)

            # Residual connection: the conv output adds kernel_size / 2
            # elements before and after the original input
            if i > 0:
                conv_out = conv_out + outputs

            # batch, hidden, seq_len_tgt
            outputs = F.glu(outputs, dim=1)

            # batch, seq_len_tgt, hidden
            outputs = outputs.transpose(1, 2)
            # A, B: batch, seq_len_tgt, hidden / 2
            A, B = outputs.split(self.hidden_size, 2)
            # A2: batch * seq_len_tgt, hidden / 2
            A2 = A.contiguous().view(-1, A.size(2))
            # B2: batch * seq_len_tgt, hidden / 2
            B2 = B.contiguous().view(-1, B.size(2))
            # attn: batch * seq_len_tgt, hidden / 2
            dec_attn = torch.mul(A2, self.softmax(B2))

            # attn: batch * seq_len_tgt, hidden
            dec_attn2 = self.mapping(dec_attn)
            dec_attn2 = dec_attn2.view(A.size(0), A.size(1), -1)

            # enc_attn1: batch, seq_len_src, hidden_size
            enc_attn = enc_attn.view(A.size(0), -1, A.size(2))
            # dec_attn1: batch, seq_len_tgt, hidden_size
            dec_attn = dec_attn.view(A.size(0), -1, A.size(2))

            # attn_matrix: batch, seq_len_tgt, seq_len_src
            _attn_matrix = torch.bmm(dec_attn, enc_attn.transpose(1, 2))
            attn_matrix = self.softmax(
                _attn_matrix.view(-1, _attn_matrix.size(2)))

            # normalized attn_matrix: batch, seq_len_tgt, seq_len_src
            attn_matrix = attn_matrix.view(_attn_matrix.size(0),
                                           _attn_matrix.size(1), -1)

            # attns: batch, seq_len_tgt, hidden_size
            attns = torch.bmm(attn_matrix, source_seq_out)

            # outputs: batch, seq_len_tgt, hidden_size
            outputs = dec_attn2 + attns

        # outputs: batch, seq_len_tgt, vocab_size
        outputs = F.log_softmax(self.fc(outputs), dim=-1)

        return outputs
Example #10
    def forward(self, xs):
        """Forward pass.

        Args:
            xs (FloatTensor): `[B, T, d_model]`
        Returns:
            xs (FloatTensor): `[B, T, d_model]`

        """
        bs, xmax, dim = xs.size()

        xs = xs.transpose(2, 1).contiguous()  # `[B, C, T]`
        xs = self.pointwise_conv1(xs)  # `[B, 2 * C, T]`
        xs = F.glu(xs, dim=1)  # `[B, C, T]`

        xs = self.depthwise_conv(xs)  # `[B, C, T]`
        if self.causal:
            xs = xs[:, :, :-self.padding]

        xs = xs.transpose(2, 1)
        if isinstance(self.norm, nn.LayerNorm):
            xs = self.activation(self.norm(xs))  # `[B, T, C]`
        else:
            # time-independent normalization
            xs = xs.contiguous().view(bs * xmax, -1, 1)
            xs = self.activation(self.norm(xs))  # `[B * T, C, 1]`
            xs = xs.view(bs, xmax, -1)
        xs = xs.transpose(2, 1)
        xs = self.pointwise_conv2(xs)  # `[B, C, T]`

        xs = xs.transpose(2, 1).contiguous()  # `[B, T, C]`
        return xs
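The forward above follows the Conformer convolution-module pattern; a minimal sketch of layer definitions consistent with it, where d_model, kernel_size and the causal flag are assumed placeholders:

import torch.nn as nn

d_model, kernel_size, causal = 256, 31, False
padding = kernel_size - 1 if causal else (kernel_size - 1) // 2

pointwise_conv1 = nn.Conv1d(d_model, 2 * d_model, kernel_size=1)  # doubled channels for GLU
depthwise_conv = nn.Conv1d(d_model, d_model, kernel_size,
                           padding=padding, groups=d_model)       # per-channel convolution
norm = nn.LayerNorm(d_model)                                      # applied on [B, T, C]
activation = nn.SiLU()                                            # "swish"
pointwise_conv2 = nn.Conv1d(d_model, d_model, kernel_size=1)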
Example #11
    def forward(self, x, pad_mask=None):

        x = F.glu(self.input_linear(x))

        T, B, C = x.size()

        K, H = self.kernel_size, self.num_heads

        weight = self.weight_linear(x).view(T, B, H, K)

        if self.weight_softmax:
            weight = F.softmax(weight, dim=-1)

        weight = F.dropout(weight,
                           p=self.weight_dropout,
                           training=self.training)

        # [seq_len x batch_size x heads x kernel_size] -> [batch_size x heads x kernel_size x seq_len]
        weight = weight.permute(1, 2, 3, 0).contiguous()

        if pad_mask is not None:
            x = x.masked_fill(pad_mask, 0)

        x = x.permute(1, 2, 0).contiguous()
        x = dynamic_convolution(x, weight, self.padding_l).permute(2, 0, 1)

        if self.conv_bias is not None:
            x = x + self.conv_bias.view(1, 1, -1)

        x = self.output_linear(x)

        return x
Example #12
    def forward(self, input_tokens, encoder_out):
        # split and transpose encoder outputs
        encoder_a, encoder_b = self._split_encoder_out(encoder_out)

        # embed positions
        positions = self.embed_positions(input_tokens)

        if self._is_incremental_eval:
            # keep only the last token for incremental forward pass
            input_tokens = input_tokens[:, -1:]

        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_unless_incremental_eval(x)

        # temporal convolutions
        avg_attn_scores = None
        num_attn_layers = len(self.attention)
        for proj, conv, attention in zip(self.projections, self.convolutions,
                                         self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = conv.remove_future_timesteps(x)
            x = F.glu(x, dim=2)

            # attention
            if attention is not None:
                x = self._transpose_unless_incremental_eval(x)

                x, attn_scores = attention(x, target_embedding,
                                           (encoder_a, encoder_b))
                attn_scores = attn_scores / num_attn_layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

                x = self._transpose_unless_incremental_eval(x)

            # residual
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = self._transpose_unless_incremental_eval(x)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc3(x)

        return x, avg_attn_scores
Example #13
    def forward(self, x, pad_mask=None):
        """
        :param pad_mask: [seq_len x bsz] mask indicating which elements are padding
            (this should match the attention mask: pad=1, unpad=0)
        :param x: [seq_len x bsz x hidden_size]
        :return:
        """

        x = x.transpose(0, 1).transpose(1, 2)  # to [bsz x hidden_size x seq_len]

        # pointwise conv does not need masking because it is an elementwise projection
        x = self.pointwise_conv1(x)
        x = F.glu(x, dim=1)

        if pad_mask is not None:
            pad_mask = pad_mask.transpose(0, 1).transpose(1, 2)
            # print(x.size(), pad_mask.size())
            x = x.masked_fill_(pad_mask, 0)
        x = self.depthwise_conv(x)
        x = self.activation(x)

        x = self.pointwise_conv2(x)

        # x = F.conv1d(x, self.in_pointwise_weight, self.in_pointwise_bias, 1, 0, 1, 1)
        # x = F.glu(x, dim=1)
        #
        # x = F.conv1d(x, self.depthwise_weight, self.depthwise_bias, 1, self.padding, 1, self.groups)
        # x = self.activation(x)
        #
        # x = F.conv1d(x, self.out_pointwise_weight, self.out_pointwise_bias, 1, 0, 1, 1)

        x = x.transpose(1, 2).transpose(0, 1)  # back to [seq_len x bsz x hidden_size]

        return x
Example #14
    def forward(self, xs):
        """Forward pass.

        Args:
            xs (FloatTensor): `[B, T, d_model]`
        Returns:
            xs (FloatTensor): `[B, T, d_model]`

        """
        B, T, d_model = xs.size()
        assert d_model == self.d_model

        xs = xs.transpose(2, 1).contiguous()  # `[B, C, T]`
        xs = self.pointwise_conv1(xs)  # `[B, 2 * C, T]`
        xs = xs.transpose(2, 1)  # `[B, T, 2 * C]`
        xs = F.glu(xs)  # `[B, T, C]`
        xs = xs.transpose(2, 1).contiguous()  # `[B, C, T]`
        xs = self.depthwise_conv(xs)  # `[B, C, T]`

        xs = self.batch_norm(xs)
        xs = self.activation(xs)
        xs = self.pointwise_conv2(xs)  # `[B, C, T]`

        xs = xs.transpose(2, 1).contiguous()  # `[B, T, C]`
        return xs
Example #15
    def forward(self, x, mask=None, right_context=0):
        """
        Args:
            x: [batch_size, time, channels]
            mask: [batch_size, time]
        """
        if mask is not None:
            mask = mask.unsqueeze(2).repeat([1, 1, x.size(-1)])

        x = self.pointwise_conv1(x)
        x = F.glu(x)
        if mask is not None:
            x.masked_fill_(~mask, 0.0)

        if right_context == 0:
            right_context = self.right_context
        x = F.pad(x,
                  pad=(0, 0, self.kernel_size - right_context - 1,
                       right_context),
                  value=0.0).transpose(1, 2)
        x = self.depthwise_conv(x)
        x = self.batch_norm(x)
        x = x * torch.sigmoid(x)  # swish
        x = x.transpose(1, 2)

        x = self.pointwise_conv2(x)
        if mask is not None:
            x.masked_fill_(~mask, 0.0)

        return x
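The asymmetric F.pad above (kernel_size - right_context - 1 on the left, right_context on the right) keeps the time dimension unchanged after a depthwise convolution with no internal padding; a quick shape check with assumed sizes:

import torch
import torch.nn as nn
import torch.nn.functional as F

B, T, C, kernel_size, right_context = 2, 10, 8, 5, 1      # assumed values
x = torch.randn(B, T, C)
x = F.pad(x, pad=(0, 0, kernel_size - right_context - 1, right_context))  # pad the time axis
x = x.transpose(1, 2)                                     # (B, C, T + kernel_size - 1)
depthwise = nn.Conv1d(C, C, kernel_size, groups=C)        # no internal padding
assert depthwise(x).shape[-1] == T                        # length preserved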
Example #16
    def conv_cap(self, x, wordemb, imgsfeats):
        for i, conv in enumerate(self.convs):

            if (i == 0):
                x = x.transpose(2, 1)
                residual = self.resproj(x)
                residual = residual.transpose(2, 1)
                x = x.transpose(2, 1)
            else:
                residual = x

            x = F.dropout(x, p=self.dropout, training=self.training)

            x = conv(x)
            x = x[:, :, :-self.pad]

            x = F.glu(x, dim=1)

            if (self.is_attention and i % 2 == 0):
                attn = self.attention[int(i / 2)]
                x = x.transpose(2, 1)
                x = attn(x, wordemb, imgsfeats)
                x = F.relu(x.transpose(2, 1))

            x = (x + residual) * math.sqrt(.5)
        return x
Example #17
    def _forward(self, x, is_incremental):
        """Forward

        Args:
            x: (B, in_channels, T)
        returns:
            (B, out_channels, T)
        """

        residual = x
        x = F.dropout(x, p=self.dropout, training=self.training)
        if is_incremental:
            splitdim = -1
            x = self.conv.incremental_forward(x)
        else:
            splitdim = 1
            x = self.conv(x)
            # remove future time steps
            x = x[:, :, :residual.size(-1)] if self.causal else x

        if self.glu:
            x = F.glu(x, dim=splitdim)
            return (x + residual) * math.sqrt(0.5)
        else:
            a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
            T = torch.sigmoid(b)  # F.sigmoid is deprecated
            return (T * a + (1 - T) * residual)
Example #18
    def forward(self, tokens, positions):
        # embed tokens and positions
        x = self.embed_tokens(tokens) + self.embed_positions(positions)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=-1)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = grad_multiply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return x, y
Example #19
    def forward(self,
                x,
                futur_mask=None,
                y=None):  # [batch_size, seq_len, d_model]
        x_pad = x
        if self.only_see_past:
            x_pad = F.pad(x, (0, 0, self.kernel_size - 1, 0, 0, 0))
            futur_mask = (torch.triu(torch.ones(
                x.size(1),
                x.size(1) if y is None else y.size(1)),
                                     diagonal=1) == 0).to(x.device)

        x_pad = x_pad.permute(0, 2, 1)
        conved = self.conv(x_pad)  # [batch_size, 2 * hid_dim, seq_len]
        conved = F.glu(conved, dim=1)  # [batch_size, hid_dim, seq_len]
        conved = conved.permute(0, 2, 1)
        conved = conved + x  # residual connection

        if self.self_attn:
            if y is not None:
                self_attn = self.attn_norm(self.attn(conved, y, y))
            else:
                self_attn = self.attn_norm(
                    self.attn(conved, conved, conved, mask=futur_mask))

            return self.feed_forward(conved + self_attn)
        return conved
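In the only_see_past branch, left-padding the sequence by kernel_size - 1 before an unpadded convolution keeps the output length equal to the input length while preventing any position from seeing future tokens. A minimal check with assumed sizes (the conv doubles the channels so that F.glu restores d_model):

import torch
import torch.nn as nn
import torch.nn.functional as F

B, L, d_model, kernel_size = 2, 12, 64, 3                 # assumed values
x = torch.randn(B, L, d_model)
x_pad = F.pad(x, (0, 0, kernel_size - 1, 0))              # left-pad the sequence axis
conv = nn.Conv1d(d_model, 2 * d_model, kernel_size)       # no internal padding
conved = F.glu(conv(x_pad.permute(0, 2, 1)), dim=1)       # (B, d_model, L)
assert conved.shape == (B, d_model, L)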
Example #20
 def forward(self, x):
     batch_size = x.size()[0]
     x = self.linear(x)
     x = F.glu(x)
     x = x.reshape(batch_size, 1024, 4, 4)
     # 2x2 nearest neighbour upsampling
     x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
     x = F.glu(x, dim=1)
     x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
     x = self.conv2(x)
     x = F.glu(x, dim=1)
     x = F.interpolate(x, scale_factor=(2, 2), mode="nearest")
     x = F.glu(x, dim=1)
     x = self.last_conv(x)
     x = self.activ_out(x)
     return x
Example #21
    def forward(self, input):
        batch_size = input.shape[0]
        # `input` is assumed here to already be the embedded representation
        conv_input = self.emb2hid(input)
        conv_input = conv_input.permute(
            0, 2, 1)  # conv_input = [batch size, hidden_size, 1]
        conv_input = self.expand(
            conv_input)  # conv_input = [batch size, hidden_size, output_size]

        batch_size = conv_input.shape[0]
        hid_dim = conv_input.shape[1]

        for i, conv in enumerate(self.convs):
            conv_input = self.dropout(conv_input)

            # zero padding
            padding = torch.zeros(batch_size, hid_dim,
                                  self.kernel_size - 1).to(conv_input.device)
            padded_conv_input = torch.cat(
                (padding, conv_input), dim=2
            )  #padded_conv_input = [batch size, hidden_size, num_seq+2 + kernel size - 1]

            # pass through convolutional layer
            conved = conv(padded_conv_input
                          )  #conved = [batch size, 2 * hidden_size, num_seq+2]
            conved = F.glu(
                conved, dim=1)  #conved = [batch size, hidden_size, num_seq+2]

            # apply residual connection
            conved = (conved + conv_input) * self.scale
            conv_input = conved

        output = self.fc_out(self.dropout(conved.permute(
            0, 2, 1)))  #output = [batch size, self.output_size, self.exp_size]
        return output.permute(0, 2, 1)
Example #22
    def forward(self, x):
        """
        :param x: [seq_len x bsz x hidden_size]
        :return:
        """

        x = x.transpose(0, 1).transpose(1,
                                        2)  # to [bsz x hidden_size x seq_len]

        x = self.pointwise_conv1(x)
        x = F.glu(x, dim=1)

        x = self.depthwise_conv(x)
        x = self.activation(self.norm(x))

        x = self.pointwise_conv2(x)

        # x = F.conv1d(x, self.in_pointwise_weight, self.in_pointwise_bias, 1, 0, 1, 1)
        # x = F.glu(x, dim=1)
        #
        # x = F.conv1d(x, self.depthwise_weight, self.depthwise_bias, 1, self.padding, 1, self.groups)
        # x = self.activation(x)
        #
        # x = F.conv1d(x, self.out_pointwise_weight, self.out_pointwise_bias, 1, 0, 1, 1)

        x = x.transpose(1, 2).transpose(
            0, 1)  # back to [seq_len x bsz x hidden_size]

        return x
Example #23
    def forward(self, protein):
        #protein = [batch size, protein len,protein_dim]
        conv_input = self.fc(protein)
        # conv_input=[batch size,protein len,hid dim]
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1)
        #conv_input = [batch size, hid dim, protein len]
        for i, conv in enumerate(self.convs):
            #pass through convolutional layer
            conved = conv(self.dropout(conv_input))
            #conved = [batch size, 2*hid dim, protein len]

            #pass through GLU activation function
            conved = F.glu(conved, dim=1)
            #conved = [batch size, hid dim, protein len]

            #apply residual connection / highway
            conved = (conved + conv_input) * self.scale
            #conved = [batch size, hid dim, protein len]

            #set conv_input to conved for next loop iteration
            conv_input = conved

        conved = conved.permute(0, 2, 1)
        # conved = [batch size,protein len,hid dim]
        conved = self.ln(conved)
        return conved
Example #24
  def forward(self, imgsfeats, imgsfc7, wordclass):
    attn_buffer = None
    wordemb = self.emb_0(wordclass)
    # wordemb = self.emb_1(wordemb)
    x = wordemb.transpose(2, 1) # (100L, 512L, 15L)
    # print 'embedding:', x.size() # (100L, 512L, 15L) 
    batchsize, wordembdim, maxtokens = x.size()

    y = F.relu(self.imgproj(imgsfc7))
    y = y.unsqueeze(2).expand(batchsize, self.nfeats, maxtokens)
    # print 'img:', y.size() # (100L, 512L, 15L) 
    x = torch.cat([x, y], 1) # (100L, 1024L, 15L)
    # print 'concat emb and img:', x.size() # (100L, 1024L, 15L) 

    for i, conv in enumerate(self.convs):
      
      if(i == 0):
        # print x.size() # (100L, 1024L, 15L) 
        x = x.transpose(2, 1)
        residual = self.resproj(x)
        # print x.size() # (100L, 15L, 1024L)
        # print residual.size() # (100L, 15L, 1024L)
        residual = residual.transpose(2, 1)
        # print residual.size() # (100L, 512L, 15L)
        x = x.transpose(2, 1)
      else:
        residual = x

      x = F.dropout(x, p=self.dropout, training=self.training)

      # print 'layer:', i
      # print x.size() # (100L, 1024L, 15L) in layer 0, (100L, 512L, 15L) in layer 1 and layer 2
      x = conv(x)
      # print x.size() # (100L, 1024L, 19L)
      x = x[:,:,:-self.pad]
      # print x.size() # (100L, 1024L, 15L) 

      # print 'before glu:', x.size() # (100L, 1024L, 15L) 
      x = F.glu(x, dim=1)
      # print 'after glu:', x.size() # (100L, 512L, 15L) 

      if(self.is_attention):
        attn = self.attention[i]
        x = x.transpose(2, 1)
        x, attn_buffer = attn(x, wordemb, imgsfeats)
        x = x.transpose(2, 1)
    
      x = (x+residual) # *math.sqrt(.5)
      # print 'add res', x.size() # (100L, 512L, 15L) 

    x = x.transpose(2, 1)
    # print 'out of conv', x.size() # (100L, 15L, 512L)
  
    x = self.classifier(x)
    # print 'classisify:', x.size() # (100L, 9221L, 15L) 

    x = x.transpose(2, 1)
    # print 'return:', x.size() # (100L, 1024L, 15L) 

    return x, attn_buffer
Example #25
 def forward(self, input):
     residual = input
     output = self.residual_layer_1(input)
     output = F.glu(output, dim=1)
     output = self.residual_layer_2(output)
     output += residual
     return output
Example #26
    def forward(self, xs, xlens):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim (+Δ, ΔΔ)]`
            xlens (list): A list of length `[B]`
        Returns:
            xs (FloatTensor): `[B, T', out_ch * feat_dim]`
            xlens (list): A list of length `[B]`

        """
        bs, time, input_dim = xs.size()
        xs = xs.transpose(2, 1).unsqueeze(3)  # `[B, in_ch (input_dim), T, 1]`

        xs = self.layers(xs)  # `[B, out_ch, T, 1]`
        bs, out_ch, time, freq = xs.size()
        xs = xs.transpose(2, 1).contiguous().view(
            bs, time, -1)  # `[B, T, out_ch * feat_dim]`

        # weight normalization + GLU for the last fully-connected layer
        xs = F.glu(self.fc_glu(xs), dim=2)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        # NOTE: no subsampling is conducted

        return xs, xlens
Example #27
 def forward(self, trg, encoder_conved, encoder_combined):
     """
     Get output and attention
     :param trg: trg = [batch size, trg len]
     :param encoder_conved: encoder_conved = encoder_combined = [batch size, src len, emb dim]
     :param encoder_combined:
     :return:
     """
     batch_size = trg.shape[0]
     trg_len = trg.shape[1]
     # create position tensor
     pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
     # pos = [batch size, trg len]
     # embed tokens and positions
     tok_embedded = self.tok_embedding(trg)
     pos_embedded = self.pos_embedding(pos)
     # tok_embedded = [batch size, trg len, emb dim]
     # pos_embedded = [batch size, trg len, emb dim]
     # combine embeddings by elementwise summing
     embedded = self.dropout(tok_embedded + pos_embedded)
     # embedded = [batch size, trg len, emb dim]
     # pass embedded through linear layer to go through emb dim -> hid dim
     conv_input = self.emb2hid(embedded)
     # conv_input = [batch size, trg len, hid dim]
     # permute for convolutional layer
     conv_input = conv_input.permute(0, 2, 1)
     # conv_input = [batch size, hid dim, trg len]
     batch_size = conv_input.shape[0]
     hid_dim = conv_input.shape[1]
     for i, conv in enumerate(self.convs):
         # apply dropout
         conv_input = self.dropout(conv_input)
         # need to pad so decoder can't "cheat"
         padding = torch.zeros(batch_size,
                               hid_dim,
                               self.kernel_size - 1).fill_(self.trg_pad_idx).to(self.device)
         padded_conv_input = torch.cat((padding, conv_input), dim=2)
         # padded_conv_input = [batch size, hid dim, trg len + kernel size - 1]
         # pass through convolutional layer
         conved = conv(padded_conv_input)
         # conved = [batch size, 2 * hid dim, trg len]
         # pass through GLU activation function
         conved = F.glu(conved, dim=1)
         # conved = [batch size, hid dim, trg len]
         # calculate attention
         attention, conved = self.calculate_attention(embedded,
                                                      conved,
                                                      encoder_conved,
                                                      encoder_combined)
         # attention = [batch size, trg len, src len]
         # apply residual connection
         conved = (conved + conv_input) * self.scale
         # conved = [batch size, hid dim, trg len]
         # set conv_input to conved for next loop iteration
         conv_input = conved
     conved = self.hid2emb(conved.permute(0, 2, 1))
     # conved = [batch size, trg len, emb dim]
     output = self.fc_out(self.dropout(conved))
     # output = [batch size, trg len, output dim]
     return output, attention
Example #28
    def forward(self, protein):
        #pos = torch.arange(0, protein.shape[1]).unsqueeze(0).repeat(protein.shape[0], 1).to(self.device)
        #protein = protein + self.pos_embedding(pos)
        #protein = [batch size, protein len,protein_dim]
        conv_input = self.fc(protein)
        # conv_input=[batch size,protein len,hid dim]
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1)
        #conv_input = [batch size, hid dim, protein len]
        for i, conv in enumerate(self.convs):
            #pass through convolutional layer
            conved = conv(self.dropout(conv_input))
            #conved = [batch size, 2*hid dim, protein len]

            #pass through GLU activation function
            conved = F.glu(conved, dim=1)
            #conved = [batch size, hid dim, protein len]

            #apply residual connection / highway
            conved = (conved + conv_input) * self.scale
            #conved = [batch size, hid dim, protein len]

            #set conv_input to conved for next loop iteration
            conv_input = conved

        conved = conved.permute(0, 2, 1)
        # conved = [batch size,protein len,hid dim]
        return conved
Example #29
    def forward(self, xs):
        """Forward pass.

        Args:
            xs (FloatTensor): `[B, T, d_model]`
        Returns:
            xs (FloatTensor): `[B, T, d_model]`

        """
        xs = xs.transpose(2, 1).contiguous()  # `[B, C, T]`
        if self.causal is not None:
            xs = self.causal(xs)
            xs = xs[:, :, :-(self.kernel_size - 1)]
        xs = self.pointwise_conv1(xs)  # `[B, 2 * C, T]`
        xs = xs.transpose(2, 1)  # `[B, T, 2 * C]`
        xs = F.glu(xs)  # `[B, T, C]`
        xs = xs.transpose(2, 1).contiguous()  # `[B, C, T]`
        xs = self.depthwise_conv(xs)  # `[B, C, T]`

        xs = self.norm(xs)
        xs = self.activation(xs)
        xs = self.pointwise_conv2(xs)  # `[B, C, T]`

        xs = xs.transpose(2, 1).contiguous()  # `[B, T, C]`
        return xs
Example #30
    def forward(self, src):
        """
        src: [B, L]
        """
        B, L = src.shape
        mask = (src != PAD_idx).long()
        # padded pos_tokens
        pos_tokens = mask.cumsum(dim=1) * mask

        emb = self.embedding(src)  # [B, L, emb_dim]
        pos_emb = self.pos_embedding(pos_tokens)  # [B, L, emb_dim]
        emb = self.dropout(emb + pos_emb)

        out = self.emb2hid(emb).permute(0, 2, 1)  # [B, h_dim, L]

        for conv in self.convs:
            skip_con = out
            out = out.masked_fill(mask.unsqueeze(1) == 0, 0.)
            out = conv(self.dropout(out))  # [B, h_dim*2, L]
            out = F.glu(out, dim=1)  # [B, h_dim, L]
            # residual connection
            out = (out + skip_con) * self.scale  # [B, h_dim, L]

        out = self.hid2emb(out.permute(0, 2,
                                       1))  # encoder out z. [B, L, emb_dim]
        out = GradMultiply.apply(out, 1.0 / (2.0 * self.n_layers))
        attn_value = (out + emb) * self.scale  # attention value (z+e)

        return out, attn_value, mask
Example #31
    def forward(self, x):
        x1 = self.fc(x)

        if self.add_batch_norm:
            x1 = self.batch_norm(x1) 

        x = th.cat((x, x1), 1)
        
        return F.glu(x,1)
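The concatenate-then-GLU pattern above is a context-gating layer: it is equivalent to multiplying x by the sigmoid of x1. A quick check with dummy tensors:

import torch
import torch.nn.functional as F

x = torch.randn(4, 16)
x1 = torch.randn(4, 16)
gated = F.glu(torch.cat((x, x1), dim=1), dim=1)
assert torch.allclose(gated, x * torch.sigmoid(x1))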
Example #32
    def _forward(self, input_tokens, positions, encoder_out):
        # split and transpose encoder outputs
        encoder_a, encoder_b = self._split_encoder_out(encoder_out)

        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + self.embed_positions(positions)
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_unless_incremental_eval(x)

        # temporal convolutions
        avg_attn_scores = None
        num_attn_layers = len(self.attention)
        for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = conv.remove_future_timesteps(x)
            x = F.glu(x)

            # attention
            if attention is not None:
                x = self._transpose_unless_incremental_eval(x)

                x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b))
                attn_scores = attn_scores / num_attn_layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

                x = self._transpose_unless_incremental_eval(x)

            # residual
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = self._transpose_unless_incremental_eval(x)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc3(x)

        return x, avg_attn_scores
Example #33
    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
            x = F.glu(x, dim=2)
            if attention is not None:
                x = attention(x)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
        }
Example #34
    def forward(self, src_tokens):
        positions = Variable(make_positions(src_tokens.data, self.dictionary.pad(),
                                            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE))

        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(positions)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=-1)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return x, y
Example #35
    def forward(self, prev_output_tokens, encoder_out_dict):
        encoder_out = encoder_out_dict['encoder']['encoder_out']
        trained_encoder_out = encoder_out_dict['pretrained'] if self.pretrained else None

        encoder_a, encoder_b = self._split_encoder_out(encoder_out)

        # embed positions
        positions = self.embed_positions(prev_output_tokens)

        # embed tokens and positions
        x = self.embed_tokens(prev_output_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        avg_attn_scores = None
        for proj, conv, attention, selfattention, attproj in zip(
            self.projections, self.convolutions, self.attention, self.selfattention, self.attproj
        ):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=2)

            # attention
            if attention is not None:
                r = x
                x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
                x = x + r
                if not self.training and self.need_attn:
                    if avg_attn_scores is None:
                        avg_attn_scores = attn_scores
                    else:
                        avg_attn_scores.add_(attn_scores)

            if selfattention is not None:
                x = selfattention(x)

            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        if not self.pretrained:
            x = self.fc3(x)

        # fusion gating
        if self.pretrained:
            trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
            y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
            gate1 = self.gate1(y)
            gate2 = self.gate2(y)
            gated_x1 = gate1 * x
            gated_x2 = gate2 * self.pretrained_outputs["out"]
            fusion = torch.cat([gated_x1, gated_x2], dim=-1)
            fusion = self.joining(fusion)
            fusion_output = self.fc3(fusion)
            return fusion_output, avg_attn_scores
        else:
            return x, avg_attn_scores