# Imports required by the constructors below. PreEmphasis, TDNN, Conv1dReluBn,
# SE_Res2Block, AttentiveStatsPool and the residual block classes are defined
# elsewhere in the repo.
import torch
import torch.nn as nn
import torchaudio


def __init__(self, block, layers, num_filters, nOut, encoder_type='SAP', n_mels=40, log_input=True, **kwargs):
    super(ResNetSE, self).__init__()

    print('Embedding size is %d, encoder %s.' % (nOut, encoder_type))

    self.inplanes = num_filters[0]
    self.encoder_type = encoder_type
    self.n_mels = n_mels
    self.log_input = log_input

    # Stem: single 3x3 conv on the (1, n_mels, time) spectrogram "image"
    self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
    self.relu = nn.ReLU(inplace=True)
    self.bn1 = nn.BatchNorm2d(num_filters[0])

    # Four residual stages; stages 2-4 stride by 2 in both frequency and time
    self.layer1 = self._make_layer(block, num_filters[0], layers[0])
    self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
    self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
    self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2))

    self.instancenorm = nn.InstanceNorm1d(n_mels)

    # Waveform -> pre-emphasis -> mel spectrogram (25 ms window, 10 ms hop at 16 kHz)
    self.torchfb = torch.nn.Sequential(
        PreEmphasis(),
        torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
                                             hop_length=160, window_fn=torch.hamming_window,
                                             n_mels=n_mels),
    )

    # The three strided stages downsample the frequency axis by a factor of 8
    outmap_size = int(self.n_mels / 8)

    # Frame-level attention used by the pooling layer
    self.attention = nn.Sequential(
        nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
        nn.ReLU(),
        nn.BatchNorm1d(128),
        nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
        nn.Softmax(dim=2),
    )

    if self.encoder_type == "SAP":      # self-attentive pooling: weighted mean
        out_dim = num_filters[3] * outmap_size
    elif self.encoder_type == "ASP":    # attentive statistics pooling: weighted mean + std
        out_dim = num_filters[3] * outmap_size * 2
    else:
        raise ValueError('Undefined encoder')

    self.fc = nn.Linear(out_dim, nOut)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
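# The PreEmphasis module used by each front end here is defined elsewhere in the repo.
# A minimal sketch of the usual formulation it presumably follows, y[t] = x[t] - 0.97 * x[t-1],
# implemented as a fixed 1-D convolution; the coefficient, the reflect padding, and the
# class name below are assumptions, not confirmed by this excerpt:
import torch.nn.functional as F

class PreEmphasisSketch(torch.nn.Module):
    def __init__(self, coef=0.97):
        super().__init__()
        # fixed, non-trainable kernel [-coef, 1] applied along the time axis
        self.register_buffer('flipped_filter',
                             torch.FloatTensor([-coef, 1.0]).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        x = x.unsqueeze(1)               # (batch, time) -> (batch, 1, time)
        x = F.pad(x, (1, 0), 'reflect')  # pad one sample so output length matches input
        return F.conv1d(x, self.flipped_filter).squeeze(1)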
def __init__(self, channels=512, embd_dim=192, n_mels=40, log_input=True, **kwargs):
    super().__init__()

    # Frame-level trunk: Conv1d+ReLU+BN stem, then three SE-Res2 blocks with
    # increasing dilation (2, 3, 4) for a growing temporal receptive field
    self.layer1 = Conv1dReluBn(n_mels, channels, kernel_size=5, padding=2)
    self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
    self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
    self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)

    # Outputs of the three SE-Res2 blocks are concatenated before pooling
    cat_channels = channels * 3
    self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)

    # Attentive statistics pooling doubles the channel dimension (mean + std)
    self.pooling = AttentiveStatsPool(cat_channels, 128)
    self.bn1 = nn.BatchNorm1d(cat_channels * 2)
    self.linear = nn.Linear(cat_channels * 2, embd_dim)
    self.bn2 = nn.BatchNorm1d(embd_dim)

    # Waveform -> pre-emphasis -> mel spectrogram front end
    self.torchfb = torch.nn.Sequential(
        PreEmphasis(),
        torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
                                             hop_length=160, window_fn=torch.hamming_window,
                                             n_mels=n_mels),
    )
    self.log_input = log_input
    self.instancenorm = nn.InstanceNorm1d(n_mels)
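# AttentiveStatsPool is defined elsewhere in the repo. A minimal sketch of the standard
# attentive-statistics-pooling formulation it presumably implements; the tanh bottleneck,
# the variance epsilon, and the class name are assumptions:
class AttentiveStatsPoolSketch(nn.Module):
    def __init__(self, in_dim, bottleneck_dim):
        super().__init__()
        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # attention bottleneck
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # per-channel scores

    def forward(self, x):
        # x: (batch, channels, time); softmax over time gives attention weights
        alpha = torch.softmax(self.linear2(torch.tanh(self.linear1(x))), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        var = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
        std = torch.sqrt(var.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)  # (batch, 2 * channels): mean + std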
def __init__(self, block, C, model_scale, nOut, n_mels, log_input, encoder_type,
             context=False, summed=False, out_bn=False, **kwargs):
    super(Res2Net, self).__init__()

    self.context = context
    self.summed = summed
    self.n_mfcc = n_mels
    self.log_input = log_input
    self.encoder_type = encoder_type
    self.out_bn = out_bn
    self.scale = model_scale

    # Frame-level trunk: stem conv, then three Res2 blocks with dilations 2, 3, 4
    self.conv1 = nn.Conv1d(self.n_mfcc, C, kernel_size=5, stride=1, padding=2)
    self.relu = nn.ReLU()
    self.bn1 = nn.BatchNorm1d(C)
    self.layer1 = block(C, C, kernel_size=3, dilation=2, scale=self.scale)
    self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=self.scale)
    self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=self.scale)
    # Concatenated block outputs (3 * C) are projected to 1536 channels
    self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1)

    self.instancenorm = nn.InstanceNorm1d(self.n_mfcc)

    # Waveform -> pre-emphasis -> MFCC front end (80 mel bands, n_mfcc coefficients kept)
    self.torchmfcc = torch.nn.Sequential(
        PreEmphasis(),
        torchaudio.transforms.MFCC(sample_rate=16000, n_mfcc=self.n_mfcc,
                                   log_mels=self.log_input, dct_type=2,
                                   melkwargs={'n_mels': 80, 'n_fft': 512, 'win_length': 400,
                                              'hop_length': 160, 'f_min': 20, 'f_max': 7600,
                                              'window_fn': torch.hamming_window}),
    )

    # With context, attention sees each frame concatenated with the
    # utterance-level mean and std, hence 3 * 1536 input channels
    if self.context:
        attn_input = 1536 * 3
    else:
        attn_input = 1536

    if self.encoder_type == 'ECA':    # channel-dependent attention weights
        attn_output = 1536
    elif self.encoder_type == 'ASP':  # a single attention weight per frame
        attn_output = 1
    else:
        raise ValueError('Undefined encoder')

    self.attention = nn.Sequential(
        nn.Conv1d(attn_input, 128, kernel_size=1),
        nn.ReLU(),
        nn.BatchNorm1d(128),
        nn.Conv1d(128, attn_output, kernel_size=1),
        nn.Softmax(dim=2),
    )

    # Pooled mean + std (2 * 1536 = 3072) -> speaker embedding
    self.bn5 = nn.BatchNorm1d(3072)
    self.fc6 = nn.Linear(3072, nOut)
    self.bn6 = nn.BatchNorm1d(nOut)
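# Usage sketch for the Res2Net constructor above. `Bottle2neck` is a hypothetical name
# for the Res2 block class passed as `block`; the actual class and typical hyperparameters
# live elsewhere in the repo:
# model = Res2Net(Bottle2neck, C=1024, model_scale=8, nOut=192, n_mels=40,
#                 log_input=True, encoder_type='ECA', context=True)
# mfcc = model.torchmfcc(waveform)  # (batch, n_mfcc, frames) from 16 kHz audio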
def __init__(self, nOut, n_mels=40, log_input=True, encoder_type='SAP', **kwargs):
    super(X_vector, self).__init__()

    self.log_input = log_input
    self.n_mels = n_mels
    self.encoder_type = encoder_type
    self.instancenorm = nn.InstanceNorm1d(n_mels)
    self.stdim = 512
    self.statsdim = 1500

    # Waveform -> pre-emphasis -> mel spectrogram front end
    self.torchfb = torch.nn.Sequential(
        PreEmphasis(),
        torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400,
                                             hop_length=160, window_fn=torch.hamming_window,
                                             n_mels=n_mels),
    )

    if self.encoder_type == "SAP":      # self-attentive pooling: weighted mean
        out_dim = self.statsdim
    elif self.encoder_type == "ASP":    # attentive statistics pooling: weighted mean + std
        out_dim = self.statsdim * 2
    else:
        raise ValueError('Undefined encoder')

    # Frame-level TDNN stack with widening temporal context
    self.tdnn1 = TDNN(input_dim=n_mels, output_dim=self.stdim, context_size=5, dilation=1, dropout_p=0.5)
    self.tdnn2 = TDNN(input_dim=self.stdim, output_dim=self.stdim, context_size=5, dilation=1, dropout_p=0.5)
    self.tdnn3 = TDNN(input_dim=self.stdim, output_dim=self.stdim, context_size=7, dilation=2, dropout_p=0.5)

    self.dense1 = nn.Linear(self.stdim, self.stdim)
    self.dense2 = nn.Linear(self.stdim, out_dim)
    self.dense3 = nn.Linear(out_dim, self.stdim)     # x-vector
    self.dense4 = nn.Linear(self.stdim, self.stdim)  # x-vector
    self.output = nn.Linear(self.stdim, nOut)
    self.nonlinearity = nn.ReLU()

    self.attention = nn.Sequential(
        nn.Conv1d(out_dim, 128, kernel_size=1),
        nn.ReLU(),
        nn.BatchNorm1d(128),
        nn.Conv1d(128, out_dim, kernel_size=1),
        nn.Softmax(dim=2),
    )
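# TDNN is defined elsewhere in the repo. A minimal sketch, assuming the common x-vector
# formulation of a time-delay layer as a dilated 1-D convolution over frame features;
# the dropout placement, ReLU, and class name are assumptions:
class TDNNSketch(nn.Module):
    def __init__(self, input_dim, output_dim, context_size, dilation, dropout_p=0.0):
        super().__init__()
        # each output frame sees context_size input frames spaced `dilation` apart
        self.kernel = nn.Conv1d(input_dim, output_dim, kernel_size=context_size, dilation=dilation)
        self.nonlinearity = nn.ReLU()
        self.drop = nn.Dropout(p=dropout_p)

    def forward(self, x):
        # x: (batch, time, input_dim) -> (batch, time', output_dim)
        x = self.kernel(x.transpose(1, 2))
        return self.drop(self.nonlinearity(x)).transpose(1, 2)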