Example #1
0
    def __init__(self,
                 input_size=(36, 256),
                 conv_dim=64,
                 repeat_num=5,
                 num_speakers=10):
        """Create the discriminator's convolution, linear and projection layers.

        Args:
            input_size: Not read by this constructor.
            conv_dim: Not read by this constructor.
            repeat_num: Not read by this constructor.
            num_speakers: Stored on the instance; the projection layer takes
                ``2 * num_speakers`` input features.
        """
        super(Discriminator, self).__init__()

        self.num_speakers = num_speakers
        device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        def gated_stem():
            # 1 -> 128 channels; the GLU over dim 1 halves that to 64.
            stem_conv = nn.Conv2d(1,
                                  128,
                                  kernel_size=(3, 3),
                                  stride=(1, 1),
                                  padding=1)
            return nn.Sequential(stem_conv, nn.GLU(dim=1))

        # Two independently parameterised input stems.
        self.conv_layer_1 = gated_stem()
        self.conv_gated_1 = gated_stem()

        # Settings shared by the three stride-2 down-sampling blocks.
        strided = dict(kernel_size=(3, 3),
                       stride=(2, 2),
                       padding=1,
                       bias=False)
        self.down_sample_1 = DownsampleBlock(dim_in=64, dim_out=256, **strided)
        self.down_sample_2 = DownsampleBlock(dim_in=128,
                                             dim_out=512,
                                             **strided)
        self.down_sample_3 = DownsampleBlock(dim_in=256,
                                             dim_out=1024,
                                             **strided)
        # The last block uses a (1, 5) kernel with unit stride.
        self.down_sample_4 = DownsampleBlock(dim_in=512,
                                             dim_out=1024,
                                             kernel_size=(1, 5),
                                             stride=(1, 1),
                                             padding=(0, 2),
                                             bias=False)

        # Scalar output head.
        self.fully_connected = nn.Linear(512, 1)

        # Maps a vector of 2 * num_speakers features to 512 features.
        self.projection = nn.Linear(self.num_speakers * 2, 512)
    def __init__(self, inputdim, outputdim, **kwargs):
        """Build a gated CNN feature extractor followed by a 1x1-conv output head.

        Args:
            inputdim: Size of the last axis of the dummy input used to probe
                the network's output shape.
            outputdim: Number of output channels of the final ``Conv1d``.
            **kwargs: Optional overrides: ``filtersizes``, ``filter``,
                ``pooling``, ``lineardim`` and ``cmvn``.
        """
        super().__init__()
        self._filtersizes = kwargs.get('filtersizes', [3, 3, 3, 3, 3])
        # A single input channel is prepended to the per-stage channel list.
        self._filter = [1] + kwargs.get('filter', [16, 24, 32, 16, 16])
        self._pooling = kwargs.get('pooling', [2, 2, 2, 2, 2])
        self._linear_dim = kwargs.get('lineardim', 128)
        # Whether to apply the moving-average normalisation module.
        self._cmvn = kwargs.get('cmvn', True)
        if self._cmvn:
            self.norm = MovingAvgNorm(100)
        else:
            self.norm = nn.Sequential()

        stages = []
        channel_pairs = zip(self._filter, self._filter[1:])
        for index, ((c_in, c_out), ksize, pool) in enumerate(
                zip(channel_pairs, self._filtersizes, self._pooling)):
            stage = []
            conv_in = c_in
            if index != 0:
                # Later stages start with a gated 1x1 channel projection.
                stage += [
                    nn.GroupNorm(1, c_in),
                    nn.Conv2d(c_in,
                              c_out * 2,
                              kernel_size=1,
                              padding=0,
                              stride=1),
                    nn.GLU(1),
                ]
                conv_in = c_out
            stage += [
                nn.GroupNorm(1, conv_in),
                nn.Conv2d(conv_in,
                          c_out * 2,
                          kernel_size=ksize,
                          padding=ksize // 2,
                          stride=1),
                nn.GLU(1),
            ]
            stages.append(nn.Sequential(*stage))
            stages.append(nn.MaxPool2d(kernel_size=pool, ceil_mode=True))
        self.network = nn.Sequential(*stages)

        # Probe the network with a dummy batch to size the output head:
        # channel count times the remaining size of the last axis.
        with torch.no_grad():
            probe_shape = self.network(torch.randn(1, 1, 300, inputdim)).shape
            feature_output = probe_shape[1] * probe_shape[3]

        self.timepool = nn.AdaptiveAvgPool2d((1, None))
        head = [
            nn.Conv1d(feature_output, self._linear_dim * 2, kernel_size=1),
            nn.GLU(1),
            nn.Dropout(0.5),
            nn.Conv1d(self._linear_dim, outputdim, kernel_size=1, groups=1),
        ]
        self.outputlayer = nn.Sequential(*head)

        for submodule in (self.network, self.outputlayer):
            submodule.apply(init_weights)
Example #3
0
    def __init__(self,
                 input_dim,
                 output_dim,
                 residual_dim,
                 gate_dim,
                 skip_dim,
                 kernel_size,
                 down_sample_factor=2,
                 dilation_rate=None):
        """Build the encoder's input, down-sampling, WNCell and output layers.

        Args:
            input_dim (int): Number of channels of the input tensor.
            output_dim (int): Number of channels of the output tensor.
            residual_dim (int): Channel count after the gated input layer;
                forwarded to every WNCell.
            gate_dim (int): Forwarded to every WNCell.
            skip_dim (int): Forwarded to every WNCell; input channels of the
                output layer.
            kernel_size (int): Kernel size of the WNCells and of the first
                output convolution.
            down_sample_factor (int): Values above 1 must be even and add
                ``down_sample_factor // 2`` stride-2 stages.
            dilation_rate (list | None): One dilation per WNCell; defaults to
                ``[1, 2, 4, 8, 16, 32]``.
        """
        super().__init__()
        self.down_sample_factor = down_sample_factor
        dilations = ([1, 2, 4, 8, 16, 32]
                     if dilation_rate is None else dilation_rate)

        gated_input = ConvNorm(input_dim, 2 * residual_dim, kernel_size=15)
        self.input_layer = nn.Sequential(gated_input, nn.GLU(dim=1))

        if self.down_sample_factor > 1:
            assert down_sample_factor % 2 == 0
            # NOTE(review): the stage count is factor // 2, not log2(factor),
            # so factors above 4 add more stride-2 stages than the name
            # suggests - confirm this is intended.
            stages = []
            for _ in range(down_sample_factor // 2):
                stages += [
                    ConvNorm(residual_dim,
                             2 * residual_dim,
                             kernel_size=8,
                             stride=2),
                    nn.InstanceNorm1d(2 * residual_dim, momentum=0.8),
                    nn.GLU(dim=1),
                ]
            self.down_sample = nn.Sequential(*stages)

        self.WN = nn.ModuleList([
            WNCell(residual_dim=residual_dim,
                   gate_dim=gate_dim,
                   skip_dim=skip_dim,
                   kernel_size=kernel_size,
                   dilation=rate) for rate in dilations
        ])

        head = [
            ConvNorm(skip_dim, 2 * output_dim, kernel_size=kernel_size),
            nn.InstanceNorm1d(2 * output_dim, momentum=0.8),
            nn.GLU(dim=1),
            ConvNorm(output_dim, output_dim, kernel_size=1),
        ]
        self.output_layer = nn.Sequential(*head)
def GatedLinear(in_features, out_features, dropout=0.0, bias=True):
    """Return three `Linear` layers (from this file's helper) separated by GLU units.

    Layer widths are ``in_features -> 4 * out_features``, then
    ``2 * out_features -> 2 * out_features``, then
    ``out_features -> out_features``; each default-dim GLU in between halves
    the last axis.
    """
    widths = (
        (in_features, out_features * 4),
        (out_features * 2, out_features * 2),
        (out_features, out_features),
    )
    stack = []
    for position, (fan_in, fan_out) in enumerate(widths):
        if position > 0:
            # A gate sits between consecutive linear layers.
            stack.append(nn.GLU())
        stack.append(Linear(fan_in, fan_out, dropout, bias))
    return nn.Sequential(*stack)
Example #5
0
 def __init__(self, num_speakers=10):
     """Create the discriminator's layers.

     Args:
         num_speakers: Stored on the instance; input width of the
             projection layer.
     """
     super(Discriminator, self).__init__()
     self.num_speakers = num_speakers
     device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
     self.device = torch.device(device_name)

     # Initial layer: 1 -> 128 channels, halved to 64 by the GLU.
     stem = nn.Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=1)
     self.conv_layer_1 = nn.Sequential(stem, nn.GLU(dim=1))

     def gated_down(c_in, c_out, kernel, stride, pad):
         # Bias-free conv -> affine instance norm with running stats -> GLU.
         return nn.Sequential(
             nn.Conv2d(c_in,
                       c_out,
                       kernel_size=kernel,
                       stride=stride,
                       padding=pad,
                       bias=False),
             nn.InstanceNorm2d(c_out, affine=True, track_running_stats=True),
             nn.GLU(dim=1))

     # Down-sampling layers.
     self.down_sample_1 = gated_down(64, 256, (3, 3), (2, 2), (1, 1))
     self.down_sample_2 = gated_down(128, 512, (3, 3), (2, 2), (1, 1))
     self.down_sample_3 = gated_down(256, 1024, (3, 3), (2, 2), (1, 1))
     self.down_sample_4 = gated_down(512, 1024, (1, 5), (1, 1), (0, 2))

     # Fully connected layer.
     self.fully_connected = nn.Linear(512, 1)
     # Projection layer.
     self.projection = nn.Linear(self.num_speakers, 512)
Example #6
0
    def __init__(self,
                 kernel_size,
                 in_ch,
                 out_ch,
                 bottlececk_dim=0,
                 dropout=0.):
        """Build a weight-normalised gated convolution block.

        Args:
            kernel_size: Extent of the main convolution along the first
                spatial axis; also sets the one-sided padding amount.
            in_ch: Input channels.
            out_ch: Output channels; the last convolution emits ``2 * out_ch``.
            bottlececk_dim: 0 for a single convolution, a positive value for a
                1x1 -> (kernel_size, 1) -> 1x1 bottleneck stack. A negative
                value leaves the layer stack empty.
            dropout: Probability used by every dropout layer.
        """
        super().__init__()

        def wn_conv(c_in, c_out, kernel):
            # Conv2d wrapped with weight normalisation over output channels.
            return nn.utils.weight_norm(nn.Conv2d(in_channels=c_in,
                                                  out_channels=c_out,
                                                  kernel_size=kernel),
                                        name='weight',
                                        dim=0)

        # A 1x1 projection for the residual path, only when channels differ.
        needs_projection = in_ch != out_ch
        self.conv_residual = (wn_conv(in_ch, out_ch, (1, 1))
                              if needs_projection else None)
        if needs_projection:
            self.dropout_residual = nn.Dropout(p=dropout)

        # Pads kernel_size - 1 zeros on one side of the second-to-last axis.
        self.pad_left = nn.ConstantPad2d((0, 0, kernel_size - 1, 0), 0)

        stack = OrderedDict()
        if bottlececk_dim == 0:
            stack['conv'] = wn_conv(in_ch, out_ch * 2, (kernel_size, 1))
            # TODO(hirofumi0810): padding?
            stack['dropout'] = nn.Dropout(p=dropout)
            stack['glu'] = nn.GLU()
        elif bottlececk_dim > 0:
            stack['conv_in'] = wn_conv(in_ch, bottlececk_dim, (1, 1))
            stack['dropout_in'] = nn.Dropout(p=dropout)
            stack['conv_bottleneck'] = wn_conv(bottlececk_dim, bottlececk_dim,
                                               (kernel_size, 1))
            stack['dropout'] = nn.Dropout(p=dropout)
            stack['glu'] = nn.GLU()
            stack['conv_out'] = wn_conv(bottlececk_dim, out_ch * 2, (1, 1))
            stack['dropout_out'] = nn.Dropout(p=dropout)

        self.layers = nn.Sequential(stack)
Example #7
0
    def __init__(self, num_speakers=10):
        """Create the layers of a discriminator with a per-speaker output head.

        Args:
            num_speakers: Stored on the instance; number of output features
                of the final linear layer.
        """
        super(Discriminator, self).__init__()

        self.num_speakers = num_speakers

        # Initial layer: 1 -> 256 channels, halved to 128 by the GLU.
        stem = nn.Conv2d(1, 256, kernel_size=(3, 3), stride=(1, 2), padding=1)
        self.conv_layer_1 = nn.Sequential(stem, nn.GLU(dim=1))

        def halving_stage(c_in, c_out):
            # Bias-free stride-2 conv -> affine instance norm -> GLU.
            return nn.Sequential(
                nn.Conv2d(c_in,
                          c_out,
                          kernel_size=(3, 3),
                          stride=(2, 2),
                          padding=(1, 1),
                          bias=False),
                nn.InstanceNorm2d(c_out, affine=True),
                nn.GLU(dim=1))

        # Down-sampling layers.
        self.down_sample_1 = halving_stage(128, 512)
        self.down_sample_2 = halving_stage(256, 1024)
        self.down_sample_3 = halving_stage(512, 2048)

        # Final stage: (1, 5) kernel, no normalisation layer.
        last_conv = nn.Conv2d(1024,
                              1024,
                              kernel_size=(1, 5),
                              stride=(1, 2),
                              padding=(0, 2),
                              bias=False)
        self.down_sample_4 = nn.Sequential(last_conv, nn.GLU(dim=1))

        # Fully connected layer with one output per speaker.
        self.fully_connected = nn.Linear(512, num_speakers)
Example #8
0
 def __init__(self, latent_dim, num_classes=0, embedding_dim=EMBEDDING_DIM):
     """Build the gated input stage and, when classes are used, the class branch.

     Args:
         latent_dim: Input channels of ``c1``; also its width after gating.
         num_classes: Embedding table size. With 0, ``embed`` and ``c2``
             are not created.
         embedding_dim: Width of each class embedding.
     """
     super().__init__()
     gated_width = latent_dim * 2
     self.c1 = nn.Sequential(
         nn.ConvTranspose2d(latent_dim, gated_width, 4),
         norm_class(gated_width),
         nn.GLU(dim=1),
     )
     self.num_classes = num_classes
     if num_classes != 0:
         self.embed = nn.Embedding(num_classes, embedding_dim)
         # 1x1 conv over the latent channels plus the embedding channels.
         self.c2 = nn.Sequential(
             nn.Conv2d(latent_dim + embedding_dim, gated_width, 1),
             norm_class(gated_width),
             nn.GLU(dim=1),
         )
Example #9
0
 def __init__(self, in_chan, out_chan, kernel=3, stride=1, pad=1, dil=1, dropout=0., groups=1, k=1, **kwargs):
   """Build a Conv1d -> BatchNorm1d -> activation -> Dropout stack.

   ``k`` selects the activation: 1 gives an in-place ReLU, 2 gives a GLU over
   dim 1. Extra keyword arguments are accepted and not read here.
   """
   super().__init__()
   assert k in [1, 2], 'Handle only k = 1 or 2'
   convolution = nn.Conv1d(in_chan, out_chan, kernel, stride=stride,
                           padding=pad, dilation=dil, groups=groups)
   normalisation = nn.BatchNorm1d(out_chan)
   if k == 1:
     activation = nn.ReLU(inplace=True)
   else:
     activation = nn.GLU(dim=1)
   self.conv = nn.Sequential(convolution, normalisation, activation,
                             nn.Dropout(dropout))
Example #10
0
    def __init__(self, config: Config, embed_dim: int) -> None:
        """Build a stack of gated 1-D convolutions with optional projections.

        For each width in ``config.cnn.kernel_sizes`` (each must be odd) one
        ``Conv1d`` emitting ``2 * config.cnn.kernel_num`` channels is created.
        A matching ``Linear`` is added when the layer's input and output
        channel counts differ; otherwise ``None`` is stored at that index.
        """
        super().__init__(config)

        n_filters = config.cnn.kernel_num
        widths = config.cnn.kernel_sizes

        convs = []
        projections = []
        fan_in = embed_dim

        for width in widths:
            assert (width - 1) % 2 == 0
            if fan_in != n_filters:
                projections.append(nn.Linear(fan_in, n_filters))
            else:
                projections.append(None)
            # Symmetric padding of (width - 1) / 2 on each side.
            convs.append(
                nn.Conv1d(fan_in,
                          2 * n_filters,
                          width,
                          padding=(width - 1) // 2))
            fan_in = n_filters

        self.convs = nn.ModuleList(convs)
        self.projections = nn.ModuleList(projections)
        self.glu = nn.GLU(dim=1)

        self.representation_dim = n_filters
        self.dropout = nn.Dropout(p=config.dropout)
Example #11
0
 def __init__(self,
              n_inputs,
              n_outputs,
              kernel_size,
              stride,
              dilation,
              padding,
              dropout=0.2,
              model='Gen'):
     """Build a weight-normalised gated Conv1d layer.

     The convolution emits ``2 * n_outputs`` channels for the GLU over dim 1.
     ``trans`` is a 1x1 convolution from ``n_inputs`` to ``n_outputs`` when
     the two differ, otherwise ``None``. ``dilation``, ``padding`` and
     ``model`` are stored unchanged, and ``self.init_weights()`` runs last.
     """
     super(ConvLayer, self).__init__()
     gated_conv = nn.Conv1d(n_inputs,
                            n_outputs * 2,
                            kernel_size,
                            stride=stride,
                            padding=padding,
                            dilation=dilation)
     self.conv = weight_norm(gated_conv)
     self.dilation = dilation
     self.padding = padding
     self.model = model
     self.glu = nn.GLU(dim=1)
     self.dropout = nn.Dropout(dropout)
     if n_inputs != n_outputs:
         self.trans = nn.Conv1d(n_inputs, n_outputs, 1)
     else:
         self.trans = None
     self.init_weights()
    def __init__(self,
                 in_channel,
                 channel,
                 kernel_size,
                 conv='wnconv2d',
                 activation=nn.ELU,
                 dropout=0.1,
                 auxiliary_channel=0,
                 condition_dim=0):
        """Build the gated residual block's layers.

        Args:
            in_channel: Input channels; the second convolution emits
                ``2 * in_channel`` channels for the GLU gate.
            channel: Output channels of the first convolution.
            kernel_size: Kernel size of both convolutions.
            conv: One of ``'wnconv2d'``, ``'causal_downright'``, ``'causal'``.
            activation: Activation class, instantiated with ``inplace=True``.
            dropout: Dropout probability.
            auxiliary_channel: When positive, adds a 1x1 ``aux_conv`` from
                this many channels to ``channel``.
            condition_dim: When positive, adds a bias-free 1x1 ``condition``
                convolution to ``2 * in_channel`` channels.

        Raises:
            ValueError: If ``conv`` is not one of the supported names.
        """
        super(GatedResBlock, self).__init__()

        # The 'wnconv2d' branch previously wrapped a TensorFlow layer
        # instance in `partial`, which cannot construct a PyTorch module.
        # It now builds the same WNConv2d used elsewhere in this block.
        if conv == 'wnconv2d':
            conv_module = partial(WNConv2d, padding=kernel_size // 2)
        elif conv == 'causal_downright':
            conv_module = partial(CausalConv2d, padding='downright')
        elif conv == 'causal':
            conv_module = partial(CausalConv2d, padding='causal')
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError(
                "conv must be 'wnconv2d', 'causal_downright' or 'causal', "
                "got %r" % (conv,))

        self.activation = activation(inplace=True)
        self.conv1 = conv_module(in_channel, channel, kernel_size)

        if auxiliary_channel > 0:
            self.aux_conv = WNConv2d(auxiliary_channel, channel, 1)

        self.dropout = nn.Dropout(dropout)

        # Twice the input channels so the GLU can split value and gate.
        self.conv2 = conv_module(channel, in_channel * 2, kernel_size)

        if condition_dim > 0:
            self.condition = WNConv2d(condition_dim,
                                      in_channel * 2,
                                      1,
                                      bias=False)
        self.gate = nn.GLU(1)
Example #13
0
    def __init__(self,
                 embed_dim,
                 conv_dim,
                 num_heads,
                 kernel_size,
                 weight_dropout=0.1,
                 dropout=0.3,
                 input_dropout=0.0,
                 weight_softmax=True,
                 encoder_glu=False,
                 normalize_before=False):
        """Build a linear -> (GLU) -> DynamicConv -> linear layer.

        With ``encoder_glu`` the first projection emits ``2 * conv_dim``
        features and ``act`` is a GLU; otherwise it emits ``conv_dim`` and
        ``act`` is ``None``. ``dropout``, ``input_dropout`` and
        ``normalize_before`` are stored unchanged.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.conv_dim = conv_dim

        # The gated variant needs twice the width so the GLU can halve it.
        gate_factor = 2 if encoder_glu else 1
        self.linear1 = Linear(self.embed_dim, gate_factor * self.conv_dim)
        self.act = nn.GLU() if encoder_glu else None

        conv_options = dict(padding_l=kernel_size - 1,
                            weight_softmax=weight_softmax,
                            num_heads=num_heads,
                            weight_dropout=weight_dropout)
        self.conv = DynamicConv(self.conv_dim, kernel_size, **conv_options)
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout = dropout
        self.input_dropout = input_dropout
        self.normalize_before = normalize_before
        self.conv_layer_norm = LayerNorm(self.embed_dim)
Example #14
0
    def __init__(self, args, kernel_size=1):
        """Build an encoder layer around ``TaLKConv`` from ``args``.

        Reads ``encoder_embed_dim``, ``encoder_conv_dim``, ``encoder_glu``,
        ``weight_dropout``, ``encoder_attention_heads``, ``dropout``,
        ``relu_dropout``, ``input_dropout``, ``encoder_normalize_before`` and
        ``encoder_ffn_embed_dim`` from ``args``. ``kernel_size`` is passed as
        both ``min_len_left`` and ``min_len_right``.
        """
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.conv_dim = args.encoder_conv_dim

        # The gated variant needs twice the width so the GLU can halve it.
        use_glu = args.encoder_glu
        if use_glu:
            first_width = 2 * self.conv_dim
        else:
            first_width = self.conv_dim
        self.linear1 = Linear(self.embed_dim, first_width)
        self.act = nn.GLU() if use_glu else None

        self.conv = TaLKConv(self.conv_dim,
                             offsets_dropout=args.weight_dropout,
                             decode=False,
                             num_heads=args.encoder_attention_heads,
                             min_len_left=kernel_size,
                             min_len_right=kernel_size)

        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.input_dropout = args.input_dropout
        self.normalize_before = args.encoder_normalize_before

        # Feed-forward pair: embed_dim -> ffn dim -> embed_dim.
        ffn_dim = args.encoder_ffn_embed_dim
        self.fc1 = Linear(self.embed_dim, ffn_dim)
        self.fc2 = Linear(ffn_dim, self.embed_dim)

        norms = []
        for _ in range(2):
            norms.append(LayerNorm(self.embed_dim))
        self.layer_norms = nn.ModuleList(norms)
Example #15
0
    def __init__(self,
                 latent_dim=32,
                 hidden_dim=256,
                 input_dim=784,
                 nb_layers=2,
                 dropout_p=0.):
        """
        A simple MLP with gated activations mapping ``latent_dim`` features to
        ``input_dim`` features.
        :param latent_dim: number of input features of the first layer
        :param hidden_dim: number of hidden features after each gate
        :param input_dim: number of features produced by the output projection
        :param nb_layers: number of gated hidden layers, excluding the output
            projection
        :param dropout_p: dropout probability applied after each gate
        """
        super().__init__()

        layers = []
        for i in range(nb_layers):
            inter_dim = latent_dim if i == 0 else hidden_dim
            layers += [
                # Twice the hidden width so the GLU can split value and gate.
                nn.Linear(inter_dim, hidden_dim * 2),
                # Gate the feature axis that nn.Linear writes (the last one).
                # This equals dim=1 for 2-D input and also handles inputs
                # with no batch axis or with extra leading axes.
                nn.GLU(dim=-1),
                nn.Dropout(dropout_p)
            ]

        layers.append(nn.Linear(hidden_dim, input_dim))

        self.layers = nn.Sequential(*layers)
Example #16
0
    def __init__(self, in_feature=1):
        """Assemble the model as one ``nn.Sequential`` stored in ``block``.

        Args:
            in_feature: Input channel count of the first convolution.
        """
        super().__init__()

        # Shape notes are carried over from the original comments and are
        # not verified here - TODO confirm against the helper blocks.
        stages = [
            # input: (1 x 1 x 24 x 128)
            ConvLayer2D(in_filter=in_feature,
                        out_filter=256,
                        kernel=(3, 3),
                        stride=(1, 2)),
            nn.GLU(dim=1),
            # input: (1 x 128 x 24 x 64)
            Downsample2D(in_filter=128,
                         out_filter=256,
                         kernel=(3, 3),
                         stride=(2, 2)),
            # input: (1 x 256 x 12 x 32)
            Downsample2D(in_filter=256,
                         out_filter=512,
                         kernel=(3, 3),
                         stride=(2, 2)),
            # input: (1 x 512 x 6 x 16)
            Downsample2D(in_filter=512,
                         out_filter=1024,
                         kernel=(6, 3),
                         stride=(1, 2)),
            # input: (1 x 1024 x 1 x 1)
            PermuteBlock(),
            # Linear head over 1024 features; no sigmoid is applied here.
            nn.Linear(1024, 1),
        ]
        self.block = nn.Sequential(*stages)
Example #17
0
    def __init__(self, args, kernel_size=0):
        """Build an encoder layer around a lightweight or dynamic convolution.

        ``args.encoder_conv_type`` must be ``'lightweight'`` or ``'dynamic'``;
        any other value raises ``NotImplementedError``. Odd kernel sizes use a
        single padding value, even ones a (left, right) pair.
        """
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.conv_dim = args.encoder_conv_dim

        half = kernel_size // 2
        if kernel_size % 2 == 1:
            padding_l = half
        else:
            padding_l = ((kernel_size - 1) // 2, half)

        # The gated variant needs twice the width so the GLU can halve it.
        if args.encoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None

        conv_type = args.encoder_conv_type
        if conv_type == 'lightweight':
            conv_class = LightweightConv1dTBC
        elif conv_type == 'dynamic':
            conv_class = DynamicConv1dTBC
        else:
            raise NotImplementedError
        self.conv = conv_class(self.conv_dim, kernel_size, padding_l=padding_l,
                               weight_softmax=args.weight_softmax,
                               num_heads=args.encoder_attention_heads,
                               weight_dropout=args.weight_dropout)
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.input_dropout = args.input_dropout
        self.normalize_before = args.encoder_normalize_before

        # Feed-forward pair: embed_dim -> ffn dim -> embed_dim.
        ffn_dim = args.encoder_ffn_embed_dim
        self.fc1 = Linear(self.embed_dim, ffn_dim)
        self.fc2 = Linear(ffn_dim, self.embed_dim)
        self.layer_norms = nn.ModuleList(
            [LayerNorm(self.embed_dim) for _ in range(2)])
Example #18
0
    def __init__(self, channels, kernel_size=31):
        """Build the convolution module's layers.

        Args:
            channels: Channel count of the module's input and output.
            kernel_size: Depthwise kernel size; must be odd so the padding
                of ``(kernel_size - 1) // 2`` is symmetric.
        """
        super(ConvModule, self).__init__()
        assert (kernel_size - 1) % 2 == 0

        pointwise = dict(kernel_size=1, stride=1, padding=0)
        # Emits 2 * channels for the GLU over dim 1.
        self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, **pointwise)

        same_padding = (kernel_size - 1) // 2
        # groups=channels gives each channel its own filter.
        self.depthwise_conv = nn.Conv1d(channels,
                                        channels,
                                        kernel_size=kernel_size,
                                        stride=1,
                                        padding=same_padding,
                                        groups=channels)
        self.batch_norm = nn.BatchNorm1d(channels)
        self.pointwise_conv2 = nn.Conv1d(channels, channels, **pointwise)
        self.glu_act = nn.GLU(dim=1)
        self.swish_act = Swish()
Example #19
0
 def __init__(self):
     """Register one default-constructed instance of each listed activation."""
     super(NNActivationModule, self).__init__()
     # MultiheadAttention, Threshold and AdaptiveLogSoftmaxWithLoss are
     # deliberately left out of this list.
     activation_layers = [
         nn.ELU(), nn.Hardshrink(), nn.Hardsigmoid(), nn.Hardtanh(),
         nn.Hardswish(), nn.LeakyReLU(), nn.LogSigmoid(),
         nn.PReLU(), nn.ReLU(), nn.ReLU6(), nn.RReLU(),
         nn.SELU(), nn.CELU(), nn.GELU(), nn.Sigmoid(),
         nn.SiLU(), nn.Mish(), nn.Softplus(), nn.Softshrink(),
         nn.Softsign(), nn.Tanh(), nn.Tanhshrink(),
         nn.GLU(), nn.Softmin(), nn.Softmax(), nn.Softmax2d(),
         nn.LogSoftmax(),
     ]
     self.activations = nn.ModuleList(activation_layers)
Example #20
0
    def __init__(
        self,
        wshare,
        n_feat,
        dropout_rate,
        kernel_size,
        use_kernel_mask=False,
        use_bias=False,
    ):
        """Construct Dynamic Convolution layer.

        Args:
            wshare: Number of shared weight groups; must divide ``n_feat``.
            n_feat: Feature dimension of the linear layers.
            dropout_rate: Stored on the instance.
            kernel_size: Stored on the instance; sets the width of the
                per-position kernel predictor.
            use_kernel_mask: Stored on the instance.
            use_bias: When true, adds a learnable bias of size ``n_feat``.
        """
        super(DynamicConvolution, self).__init__()

        assert n_feat % wshare == 0
        self.wshare = wshare
        self.use_kernel_mask = use_kernel_mask
        self.dropout_rate = dropout_rate
        self.kernel_size = kernel_size
        self.attn = None

        # linear -> GLU -- -> lightconv -> linear
        #               \        /
        #                 Linear
        self.linear1 = nn.Linear(n_feat, n_feat * 2)
        self.linear2 = nn.Linear(n_feat, n_feat)
        self.linear_weight = nn.Linear(n_feat, self.wshare * 1 * kernel_size)
        # In-place initialiser; the un-suffixed name is deprecated.
        nn.init.xavier_uniform_(self.linear_weight.weight)
        self.act = nn.GLU()

        # dynamic conv related
        self.use_bias = use_bias
        if self.use_bias:
            # Start from zeros: torch.Tensor(n) exposes uninitialised memory,
            # which can contain arbitrary values including NaN.
            self.bias = nn.Parameter(torch.zeros(n_feat))
Example #21
0
    def __init__(self, dim_domain=16):
        """Build the discriminator's stem, down-sampling stack and output heads.

        Args:
            dim_domain: Number of input features of the projection layer.
        """
        super(DiscriminatorV2, self).__init__()

        def stem_conv():
            return nn.Conv2d(in_channels=1, out_channels=128, kernel_size=3,
                             stride=1, padding=1)

        # Two independent 1 -> 128 convolutions plus a GLU over dim 1.
        self.conv_in = stem_conv()
        self.conv_in_gate = stem_conv()
        self.glu = nn.GLU(dim=1)

        height, width = 36, 128
        num_downsamples = 4
        in_channels = 128

        blocks = []
        # Three stride-2 blocks, each doubling the channel count.
        for _ in range(num_downsamples - 1):
            doubled = in_channels * 2
            blocks.append(DownSample_Block(in_channels=in_channels,
                                           out_channels=doubled,
                                           kernel_size=3, stride=2, padding=1))
            in_channels = doubled
        # Final block keeps the channel count and uses a (1, 5) kernel.
        blocks.append(DownSample_Block(in_channels=in_channels,
                                       out_channels=in_channels,
                                       kernel_size=(1, 5), stride=1,
                                       padding=(0, 2)))
        self.down_samples = nn.Sequential(*blocks)

        # TODO check output shape [5, 32]
        self.fc = nn.Linear(in_channels, 1)
        self.projection = nn.Linear(dim_domain, in_channels)

        # PatchGAN classifier
        # TODO floor or ceil
        shrink = 2 ** (num_downsamples - 1)
        kernel_height = math.ceil(height / shrink)  # ceil(36 / 8) == 5
        kernel_width = math.ceil(width / shrink)  # ceil(128 / 8) == 16
        # Bias-free conv with a full-size kernel and 4 output channels.
        self.conv_clf_spks = nn.Conv2d(in_channels, 4,
                                       kernel_size=(kernel_height,
                                                    kernel_width),
                                       stride=1, padding=0, bias=False)
Example #22
0
    def forward(self, x):
        """Run the discriminator and return sigmoid-activated outputs.

        Args:
            x: 4-D tensor accepted by ``self.conv1``.

        Returns:
            ``self.fc`` applied to the flattened features, passed through a
            sigmoid.

        Raises:
            ValueError: If ``x`` is not 4-dimensional.
        """
        if x.dim() != 4:
            raise ValueError(
                "expected a 4-D input tensor, got %d dimensions" % x.dim())

        # Functional ops replace the stateless modules that were previously
        # constructed on every call; the results are the same.
        x = nn.functional.glu(self.conv1(x), dim=1)

        for index in range(1, 4):
            x = getattr(self, "downsample_block" + str(index))(x)

        x = self.conv2(x)
        # Parameter-free instance norm using per-sample statistics.
        x = nn.functional.instance_norm(x)
        x = nn.functional.glu(x, dim=1)
        x = self.conv3(x)
        # Flatten everything except the batch axis for the linear head.
        x = x.view(x.shape[0], -1)
        return self.fc(x).sigmoid()
Example #23
0
    def forward(self, x):
        """Run the generator: down-sample, residual blocks, up-sample, output.

        ``x`` is indexed on axes 2 and 3, so it must have at least four axes.
        The reduced sizes below reproduce the original float-division and
        truncation arithmetic exactly.
        """
        time = x.shape[2]
        num_featuers = x.shape[3]
        reduced_time = int(int((time + 1) / 2 + 1) / 2)
        reduced_feats = int(int((num_featuers + 1) / 2 + 1) / 2)

        def run_numbered(prefix, count, tensor):
            # Apply self.<prefix>1 ... self.<prefix><count> in order.
            for number in range(1, count + 1):
                tensor = self.__getattr__(prefix + str(number))(tensor)
            return tensor

        x = nn.GLU(dim=1)(self.conv1(x))
        x = run_numbered("downsample_block", 2, x)

        # Fold everything but the reduced time axis into channels.
        x = x.reshape((x.shape[0], -1, reduced_time, 1))
        x = self.conv2(x)
        x = nn.InstanceNorm2d(x.shape[1])(x)

        x = run_numbered("residual_block", 6, x)

        x = self.conv3(x)
        x = nn.InstanceNorm2d(x.shape[1])(x)
        # Restore a 2-D layout of reduced time by reduced features.
        x = x.reshape(x.shape[0], -1, reduced_time, reduced_feats)

        x = run_numbered("upsample_block", 2, x)

        x = self.conv4(x)

        # NOTE(review): this is a reshape, not a permute, so it reinterprets
        # memory rather than swapping axes - confirm that is intended.
        batch, channels, rows, cols = x.shape
        x = x.reshape(batch, cols, rows, channels)

        return self.output(x)
Example #24
0
    def __init__(self,
                 in_channel,
                 channel,
                 kernel_size,
                 conv='wnconv2d',
                 dropout=0.1,
                 condition_dim=0,
                 aux_channels=0):
        """Build a gated residual block with batch normalisation.

        Args:
            in_channel: Input channels; the second convolution emits
                ``2 * in_channel`` channels for the GLU gate.
            channel: Output channels of the first convolution.
            kernel_size: Kernel size of both convolutions.
            conv: ``'wnconv2d'``, ``'causal_downright'`` or ``'causal'``.
            dropout: Dropout probability.
            condition_dim: When positive, adds a bias-free 1x1 ``convc``.
            aux_channels: When positive, adds a 1x1 ``aux_conv``.
        """
        super().__init__()

        assert conv in [
            'wnconv2d', 'causal_downright', 'causal'
        ], "Invalid conv argument [wnconv2d, causal_downright, causal]"

        # Padding mode passed to CausalConv2d for each causal variant.
        causal_padding = {'causal_downright': 'downright', 'causal': 'causal'}
        if conv == 'wnconv2d':
            conv_builder = partial(WNConv2d, padding=kernel_size // 2)
        elif conv in causal_padding:
            conv_builder = partial(CausalConv2d, padding=causal_padding[conv])

        gated_width = in_channel * 2
        self.conv1 = conv_builder(in_channel, channel, kernel_size)
        self.bn1 = nn.BatchNorm2d(channel)
        self.conv2 = conv_builder(channel, gated_width, kernel_size)
        self.bn2 = nn.BatchNorm2d(gated_width)
        self.drop1 = nn.Dropout(dropout)

        if aux_channels > 0:
            self.aux_conv = WNConv2d(aux_channels, channel, 1)

        if condition_dim > 0:
            self.convc = WNConv2d(condition_dim, gated_width, 1, bias=False)
        # GLU over the channel axis halves gated_width back to in_channel.
        self.gate = nn.GLU(1)
Example #25
0
 def __init__(self, conv, p):
     """Wrap ``conv`` with weight normalisation, a GLU and dropout.

     Args:
         conv: module exposing a ``weight`` tensor; it is re-initialised
             with Kaiming-normal values and then weight-normalised.
         p: probability passed to the in-place ``nn.Dropout``.
     """
     super().__init__()
     # Initialise the raw weight first, then let weight_norm reparametrise it.
     nn.init.kaiming_normal_(conv.weight)
     self.conv = weight_norm(conv)
     # GLU along dim 1 halves that dimension.
     self.act = nn.GLU(dim=1)
     self.dropout = nn.Dropout(p=p, inplace=True)
Example #26
0
    def __init__(
        self,
        *,
        chan_in,
        chan_out=3,
        num_upsamples=4,
        end_glu=True,
    ):
        """Assemble a stack of upsample -> 3x3 conv -> GLU stages.

        The working channel count starts at ``chan_in`` and is halved
        (integer division) after every stage.

        Args:
            chan_in: channel count entering the first stage.
            chan_out: channel count of the final output.
            num_upsamples: number of upsample/conv/GLU stages.
            end_glu: when true, every stage keeps its channel count before
                the GLU and a trailing 3x3 convolution maps to ``chan_out``;
                when false, the last stage's convolution emits
                ``chan_out * 2`` channels and no trailing convolution is
                added.
        """
        super().__init__()

        self.layers = nn.ModuleList([])
        width = chan_in
        final_index = num_upsamples - 1

        for stage in range(num_upsamples):
            # Only the last stage of a decoder without a trailing conv has
            # to produce the (doubled, pre-GLU) output width itself.
            if stage == final_index and not end_glu:
                conv_out = chan_out * 2
            else:
                conv_out = width

            self.layers.append(
                nn.Sequential(
                    upsample(),
                    nn.Conv2d(width, conv_out, 3, padding=1),
                    nn.GLU(dim=1),
                ))
            width //= 2

        if end_glu:
            self.layers.append(nn.Conv2d(width, chan_out, 3, padding=1))
Example #27
0
    def __init__(self,
                 d_model,
                 kernel_size,
                 num_heads,
                 dropout,
                 weight_softmax=True):
        """Create the sub-layers of a lightweight-convolution block.

        Args:
            d_model: embedding width used by every sub-layer.
            kernel_size: kernel size of the ``LightweightConv``; also
                determines the left padding.
            num_heads: ``num_heads`` forwarded to ``LightweightConv``.
            dropout: forwarded to ``LightweightConv`` as ``weight_dropout``.
            weight_softmax: forwarded to ``LightweightConv``.
        """
        super(LConvBlock, self).__init__()
        self.embed_dim = d_model

        # Odd kernels get a single padding value, even kernels a pair.
        if kernel_size % 2 == 1:
            padding_l = kernel_size // 2
        else:
            padding_l = ((kernel_size - 1) // 2, kernel_size // 2)

        # The projection doubles the width; the GLU halves the last dim.
        self.act_linear = LinearNorm(d_model, 2 * d_model, bias=True)
        self.act = nn.GLU()

        self.conv_layer = LightweightConv(
            d_model,
            kernel_size,
            padding_l=padding_l,
            weight_softmax=weight_softmax,
            num_heads=num_heads,
            weight_dropout=dropout,
        )

        # Two linear layers with a 4x wider hidden size, then a LayerNorm.
        hidden_dim = 4 * d_model
        self.fc1 = LinearNorm(d_model, hidden_dim, bias=True)
        self.fc2 = LinearNorm(hidden_dim, d_model, bias=True)
        self.layer_norm = nn.LayerNorm(d_model)
Example #28
0
    def __init__(self, args, tgt_dict):
        """Build the model components from a configuration namespace.

        Args:
            args: configuration object; every option used here is read as
                an attribute (feature-extractor spec, masking options,
                dropout rates, negative-sampling options, dimensions).
            tgt_dict: sized collection; ``len(tgt_dict)`` is the output
                width of ``phone_proj``.
        """
        super().__init__()
        self.args = args

        # NOTE(review): eval() runs arbitrary code contained in the config
        # string. Only safe while ``conv_feature_layers`` comes from a
        # trusted source -- confirm before exposing it to user input.
        conv_spec = eval(args.conv_feature_layers)
        # First entry of the last layer spec is taken as the feature width.
        self.embed = conv_spec[-1][0]

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=conv_spec,
            dropout=0.0,
            mode=args.extractor_mode,
            conv_bias=args.conv_bias,
        )

        # A projection is only needed when the extractor width differs from
        # the encoder width.
        if self.embed != args.encoder_embed_dim:
            self.post_extract_proj = nn.Linear(self.embed,
                                               args.encoder_embed_dim)
        else:
            self.post_extract_proj = None

        # Masking options are copied verbatim under the same attribute names.
        for option in (
                "mask_prob",
                "mask_selection",
                "mask_other",
                "mask_length",
                "no_mask_overlap",
                "mask_min_space",
                "mask_channel_prob",
                "mask_channel_selection",
                "mask_channel_other",
                "mask_channel_length",
                "no_mask_channel_overlap",
                "mask_channel_min_space",
        ):
            setattr(self, option, getattr(args, option))

        self.dropout_input = nn.Dropout(args.dropout_input)
        self.dropout_features = nn.Dropout(args.dropout_features)

        self.feature_grad_mult = args.feature_grad_mult

        self.n_negatives = args.num_negatives
        self.cross_sample_negatives = args.cross_sample_negatives
        self.negatives_from_everywhere = args.negatives_from_everywhere

        self.logit_temp = args.logit_temp

        # A non-positive final_dim falls back to the encoder width.
        if args.final_dim > 0:
            final_dim = args.final_dim
        else:
            final_dim = args.encoder_embed_dim

        self.project_q = nn.Linear(self.embed, final_dim)

        # Learned embedding, initialised uniformly in [0, 1).
        self.mask_emb = nn.Parameter(
            torch.FloatTensor(args.encoder_embed_dim).uniform_())

        self.encoder = TransformerEncoder(args)
        self.layer_norm = LayerNorm(self.embed)

        if args.target_glu:
            # The linear layer doubles the width; the GLU halves it again.
            self.target_glu = nn.Sequential(
                nn.Linear(final_dim, final_dim * 2), nn.GLU())
        else:
            self.target_glu = None

        self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
        self.phone_proj = nn.Linear(args.encoder_embed_dim, len(tgt_dict))
Example #29
0
    def __init__(
        self,
        attention_dropout,
        decoder_attention_heads,
        self_attention_heads,
        decoder_conv_dim,
        # ARBABU: need to remove these two type parameters
        decoder_conv_type,
        attention_type,
        self_attention_type,
        decoder_embed_dim,
        decoder_ffn_embed_dim,
        decoder_glu,
        decoder_normalize_before,
        dropout,
        input_dropout,
        relu_dropout,
        need_attention,
        convolution_type,
        conv=None,
        self_attention=None,
        attention=None,
    ):
        """Create the sub-layers of a convolutional decoder layer.

        Only the parameters listed below are read by this constructor; the
        remaining ones are accepted but not referenced here.

        Args:
            decoder_conv_dim: width the convolution operates on.
            decoder_embed_dim: embedding width of the layer.
            decoder_ffn_embed_dim: hidden width between ``fc1`` and ``fc2``.
            decoder_glu: when truthy, ``linear1`` emits twice ``conv_dim``
                and is followed by a GLU; otherwise a placeholder identity
                is used as activation.
            decoder_normalize_before: stored as ``normalize_before``.
            dropout: stored as ``dropout``.
            input_dropout: stored as ``input_dropout``.
            relu_dropout: stored as ``relu_dropout``.
            need_attention: stored as ``need_attn``.
            conv: stored as ``conv``.
            self_attention: self-attention module, or ``None`` to install a
                placeholder.
            attention: encoder-attention module, or ``None`` to install
                placeholders for the attention and its layer norm.
        """
        super().__init__()
        embed_dim = decoder_embed_dim
        conv_dim = decoder_conv_dim
        self.embed_dim = embed_dim
        self.conv_dim = conv_dim

        # With GLU the projection must be twice as wide, since the gate
        # halves the last dimension.
        linear1_width = 2 * conv_dim if decoder_glu else conv_dim
        self.linear1 = Linear(embed_dim, linear1_width)
        self.act = nn.GLU() if decoder_glu else PlaceholderIdentity()
        self.conv = conv
        self.linear2 = Linear(conv_dim, embed_dim)

        self.dropout = dropout
        self.relu_dropout = relu_dropout
        self.input_dropout = input_dropout
        self.normalize_before = decoder_normalize_before
        self.conv_layer_norm = LayerNorm(embed_dim)

        # Encoder attention: fall back to placeholders when none is given.
        self.no_encoder_attn = attention is None
        if self.no_encoder_attn:
            self.encoder_attn = PlaceholderAttentionIdentity()
            self.encoder_attn_layer_norm = PlaceholderIdentity()
        else:
            self.encoder_attn = attention
            self.encoder_attn_layer_norm = LayerNorm(embed_dim)

        # Self attention: same fallback, tracked with a positive flag.
        self.has_self_attn = self_attention is not None
        if self.has_self_attn:
            self.self_attn = self_attention
        else:
            self.self_attn = PlaceholderAttentionIdentity()

        self.fc1 = Linear(embed_dim, decoder_ffn_embed_dim)
        self.fc2 = Linear(decoder_ffn_embed_dim, embed_dim)

        self.final_layer_norm = LayerNorm(embed_dim)
        self.need_attn = need_attention
Example #30
0
    def __init__(self, init_channels, cond_dim, z_dim, n_upsample=4):
        """Create the input projection and the chain of upsampling blocks.

        Args:
            init_channels(int): # channels of initial representation.
            cond_dim(int): dimension of conditioning variable.
            z_dim(int): dimension of noise.
            n_upsample(int, optional): # upsampling blocks, default=`4`.

        Raises:
            AssertionError: 2 ** n_upsample is greater than init_channels.
        """

        super().__init__()

        # Project the (cond_dim + z_dim)-wide input to 4 * 4 * init_channels
        # features. The linear layer emits twice that many because GLU over
        # dim 1 halves the width again.
        # NOTE(review): the reshape to a spatial layout is expected to
        # happen in forward(), which is not visible here -- confirm.
        self.fc = nn.Sequential(
            nn.Linear(cond_dim + z_dim, 4 * 4 * init_channels * 2, bias=False),
            nn.BatchNorm1d(init_channels * 4 * 4 * 2),
            nn.GLU(1),
        )

        # Halve the channel count with each block: n_upsample + 1 widths
        # give n_upsample consecutive (in, out) pairs.
        channels = [init_channels // 2**i for i in range(n_upsample + 1)]
        # Integer division reaches 0 once 2 ** n_upsample > init_channels.
        assert channels[-1] > 0, 'Too many upsampling blocks / Too few channels'
        self.upsampling_blocks = nn.Sequential(*[
            UpsamplingBlock(in_ch, out_ch, 3)
            for in_ch, out_ch in zip(channels[:-1], channels[1:])
        ])