Ejemplo n.º 1
0
    def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs):
        """Build the convolutional trunk, mel front-end, attention head and output layer.

        Args:
            block: Residual block type forwarded to ``self._make_layer``.
            layers: Indexable with four entries; entry ``i`` is passed to
                ``self._make_layer`` for stage ``i + 1``.
            num_filters: Indexable with four channel counts. The first one is
                also the width of ``conv1``/``bn1`` and the initial
                ``self.inplanes``.
            nOut: Output size of the final linear layer ``self.fc``.
            encoder_type: ``'SAP'`` or ``'ASP'``. ``'ASP'`` doubles the input
                width of ``self.fc``.
            n_mels: Number of mel bins produced by the front-end and normalised
                by ``self.instancenorm``.
            log_input: Stored on the instance; not consumed in this method.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If ``encoder_type`` is neither ``'SAP'`` nor ``'ASP'``.
        """
        super(ResNetSE, self).__init__()

        print('Embedding size is %d, encoder %s.' % (nOut, encoder_type))

        stem_channels = num_filters[0]
        self.inplanes = stem_channels
        self.encoder_type = encoder_type
        self.n_mels = n_mels
        self.log_input = log_input

        # Stem: single-channel input -> stem_channels feature maps.
        self.conv1 = nn.Conv2d(1, stem_channels, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.bn1 = nn.BatchNorm2d(stem_channels)

        # Stage 1 uses _make_layer's default stride; stages 2-4 pass stride (2, 2).
        self.layer1 = self._make_layer(block, num_filters[0], layers[0])
        for stage in range(1, 4):
            setattr(
                self,
                f"layer{stage + 1}",
                self._make_layer(block, num_filters[stage], layers[stage], stride=(2, 2)),
            )

        # Waveform -> mel spectrogram front-end and its per-bin normalisation.
        self.instancenorm = nn.InstanceNorm1d(n_mels)
        pre_emphasis = PreEmphasis()
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=512,
            win_length=400,
            hop_length=160,
            window_fn=torch.hamming_window,
            n_mels=n_mels,
        )
        self.torchfb = torch.nn.Sequential(pre_emphasis, mel_spectrogram)

        # NOTE(review): n_mels / 8 presumably mirrors the three stride-2 stages;
        # confirm it matches the real feature-map height when n_mels is not a
        # multiple of 8.
        outmap_size = int(self.n_mels / 8)
        flattened_channels = num_filters[3] * outmap_size

        self.attention = nn.Sequential(
            nn.Conv1d(flattened_channels, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, flattened_channels, kernel_size=1),
            nn.Softmax(dim=2),
        )

        if self.encoder_type not in ("SAP", "ASP"):
            raise ValueError('Undefined encoder')
        if self.encoder_type == "ASP":
            out_dim = flattened_channels * 2
        else:
            out_dim = flattened_channels

        self.fc = nn.Linear(out_dim, nOut)

        # Re-initialise every 2-D conv and 2-D batch norm registered so far.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
Ejemplo n.º 2
0
    def __init__(self,
                 channels=512,
                 embd_dim=192,
                 n_mels=40,
                 log_input=True,
                 **kwargs):
        """Build the 1-D convolutional stack, pooling head and mel front-end.

        Args:
            channels: Width of ``layer1`` through ``layer4``.
            embd_dim: Output size of ``self.linear`` and feature count of
                ``self.bn2``.
            n_mels: Number of mel bins produced by the front-end; also the
                input width of ``layer1`` and of ``self.instancenorm``.
            log_input: Stored on the instance; not consumed in this method.
            **kwargs: Accepted and ignored.
        """
        super().__init__()

        self.layer1 = Conv1dReluBn(n_mels, channels, kernel_size=5, padding=2)

        # layer2..layer4 differ only in dilation (2, 3, 4); padding is set to
        # the same value as the dilation for each of them.
        for index, rate in enumerate((2, 3, 4), start=2):
            stage = SE_Res2Block(channels,
                                 kernel_size=3,
                                 stride=1,
                                 padding=rate,
                                 dilation=rate,
                                 scale=8)
            setattr(self, f"layer{index}", stage)

        cat_channels = channels * 3
        pooled_channels = cat_channels * 2
        self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
        self.pooling = AttentiveStatsPool(cat_channels, 128)
        self.bn1 = nn.BatchNorm1d(pooled_channels)
        self.linear = nn.Linear(pooled_channels, embd_dim)
        self.bn2 = nn.BatchNorm1d(embd_dim)

        # Waveform -> mel spectrogram front-end.
        pre_emphasis = PreEmphasis()
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=512,
            win_length=400,
            hop_length=160,
            window_fn=torch.hamming_window,
            n_mels=n_mels,
        )
        self.torchfb = torch.nn.Sequential(pre_emphasis, mel_spectrogram)
        self.log_input = log_input
        self.instancenorm = nn.InstanceNorm1d(n_mels)
Ejemplo n.º 3
0
    def __init__(self,
                 block,
                 C,
                 model_scale,
                 nOut,
                 n_mels,
                 log_input,
                 encoder_type,
                 context=False,
                 summed=False,
                 out_bn=False,
                 **kwargs):
        """Build the Res2Net layers, MFCC front-end, attention head and output layers.

        Args:
            block: Callable used for ``layer1``..``layer3``; invoked as
                ``block(C, C, kernel_size=3, dilation=d, scale=model_scale)``.
            C: Channel width of ``conv1`` and of the three ``block`` layers.
            model_scale: Stored as ``self.scale`` and forwarded to ``block``
                as ``scale``.
            nOut: Output size of ``self.fc6`` and feature count of ``self.bn6``.
            n_mels: Stored as ``self.n_mfcc``; used as the number of MFCC
                coefficients, the input width of ``conv1`` and the feature
                count of ``self.instancenorm``. The mel filterbank inside the
                MFCC transform is fixed at 80 bins regardless of this value.
            log_input: Forwarded to the MFCC transform as ``log_mels``.
            encoder_type: ``'ECA'`` (attention emits 1536 channels) or
                ``'ASP'`` (attention emits 1 channel).
            context: When true the attention input is 3 * 1536 channels wide
                instead of 1536.
            summed: Stored on the instance; not consumed in this method.
            out_bn: Stored on the instance; not consumed in this method.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If ``encoder_type`` is neither ``'ECA'`` nor ``'ASP'``.
        """
        # nn.Module requires the parent constructor to run before any
        # attribute is assigned on the child, so it must come first.
        super(Res2Net, self).__init__()

        # Width produced by layer4; the attention head is sized from the same
        # number (presumably it consumes layer4's output - confirm in forward()).
        bottleneck_channels = 1536

        # Validate the configuration before any layer is allocated.
        if encoder_type == 'ECA':
            attn_output = bottleneck_channels
        elif encoder_type == 'ASP':
            attn_output = 1
        else:
            raise ValueError('Undefined encoder')

        self.context = context
        self.summed = summed
        self.n_mfcc = n_mels
        self.log_input = log_input
        self.encoder_type = encoder_type
        self.out_bn = out_bn
        self.scale = model_scale

        self.conv1 = nn.Conv1d(self.n_mfcc,
                               C,
                               kernel_size=5,
                               stride=1,
                               padding=2)
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(C)

        self.layer1 = block(C, C, kernel_size=3, dilation=2, scale=self.scale)
        self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=self.scale)
        self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=self.scale)
        self.layer4 = nn.Conv1d(3 * C, bottleneck_channels, kernel_size=1)

        self.instancenorm = nn.InstanceNorm1d(self.n_mfcc)
        self.torchmfcc = torch.nn.Sequential(
            PreEmphasis(),
            torchaudio.transforms.MFCC(sample_rate=16000,
                                       n_mfcc=self.n_mfcc,
                                       log_mels=self.log_input,
                                       dct_type=2,
                                       melkwargs={
                                           'n_mels': 80,
                                           'n_fft': 512,
                                           'win_length': 400,
                                           'hop_length': 160,
                                           'f_min': 20,
                                           'f_max': 7600,
                                           'window_fn': torch.hamming_window
                                       }),
        )

        if self.context:
            attn_input = bottleneck_channels * 3
        else:
            attn_input = bottleneck_channels

        self.attention = nn.Sequential(
            nn.Conv1d(attn_input, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, attn_output, kernel_size=1),
            nn.Softmax(dim=2),
        )

        # NOTE(review): 3072 equals 2 * 1536; presumably two pooled statistics
        # are concatenated before bn5 - confirm against forward().
        self.bn5 = nn.BatchNorm1d(3072)

        self.fc6 = nn.Linear(3072, nOut)
        self.bn6 = nn.BatchNorm1d(nOut)
Ejemplo n.º 4
0
    def __init__(self,
                 nOut,
                 n_mels=40,
                 log_input=True,
                 encoder_type='SAP',
                 **kwargs):
        """Build the mel front-end, TDNN stack, dense layers and attention head.

        Args:
            nOut: Output size of the final linear layer ``self.output``.
            n_mels: Number of mel bins produced by the front-end; also the
                input width of ``tdnn1`` and of ``self.instancenorm``.
            log_input: Stored on the instance; not consumed in this method.
            encoder_type: ``'SAP'`` or ``'ASP'``. ``'ASP'`` doubles the width
                used for ``dense2``/``dense3`` and the attention head.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If ``encoder_type`` is neither ``'SAP'`` nor ``'ASP'``.
        """
        super(X_vector, self).__init__()

        self.log_input = log_input
        self.n_mels = n_mels
        self.encoder_type = encoder_type
        self.instancenorm = nn.InstanceNorm1d(n_mels)

        hidden = 512
        self.stdim = hidden
        self.statsdim = 1500

        # Waveform -> mel spectrogram front-end.
        pre_emphasis = PreEmphasis()
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=512,
            win_length=400,
            hop_length=160,
            window_fn=torch.hamming_window,
            n_mels=n_mels,
        )
        self.torchfb = torch.nn.Sequential(pre_emphasis, mel_spectrogram)

        if self.encoder_type not in ("SAP", "ASP"):
            raise ValueError('Undefined encoder')
        if self.encoder_type == "ASP":
            out_dim = self.statsdim * 2
        else:
            out_dim = self.statsdim

        # (input_dim, context_size, dilation) for tdnn1..tdnn3; every layer
        # emits `hidden` features and uses dropout 0.5.
        tdnn_specs = (
            (n_mels, 5, 1),
            (hidden, 5, 1),
            (hidden, 7, 2),
        )
        for index, (in_dim, context_size, dilation) in enumerate(tdnn_specs, start=1):
            tdnn = TDNN(input_dim=in_dim,
                        output_dim=hidden,
                        context_size=context_size,
                        dilation=dilation,
                        dropout_p=0.5)
            setattr(self, f"tdnn{index}", tdnn)

        self.dense1 = nn.Linear(hidden, hidden)
        self.dense2 = nn.Linear(hidden, out_dim)
        self.dense3 = nn.Linear(out_dim, hidden)  # x-vector
        self.dense4 = nn.Linear(hidden, hidden)  # x-vector
        self.output = nn.Linear(hidden, nOut)

        self.nonlinearity = nn.ReLU()

        self.attention = nn.Sequential(
            nn.Conv1d(out_dim, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, out_dim, kernel_size=1),
            nn.Softmax(dim=2),
        )