def __init__(self, in_dim, out_dim, args, mean_std=None):
    super(Model, self).__init__()

    ##### required part, no need to change #####
    # mean and std of input and output
    in_m, in_s, out_m, out_s = self.prepare_mean_std(
        in_dim, out_dim, args, mean_std)
    self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
    self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
    self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
    self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

    # flags for debugging (by default False)
    self.model_debug = False
    self.flag_validation = False
    #####

    ####
    # on input waveform and output target
    ####
    # load protocol and prepare the target data for network training
    protocol_file = prj_conf.optional_argument[0]
    self.protocol_parser = protocol_parse(protocol_file)

    # working sampling rate
    # torchaudio may be used to change the sampling rate
    self.m_target_sr = 16000

    ####
    # optional configs (not used)
    ####
    # re-sampling (optional)
    self.m_resampler = torchaudio.transforms.Resample(
        prj_conf.wav_samp_rate, self.m_target_sr)

    # vad (optional)
    self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)

    # flag for balanced classes (temporary use)
    self.v_flag = 1

    ####
    # front-end configuration
    # multiple front-end configurations may be used
    # by default, use a single front-end
    ####
    # frame shift (number of waveform points)
    self.frame_hops = [160]
    # frame length
    self.frame_lens = [320]
    # FFT length
    self.fft_n = [512]

    # LFCC dim (base component)
    self.lfcc_dim = [20]
    self.lfcc_with_delta = True

    # window type
    self.win = torch.hann_window
    # floor in log-spectrum-amplitude calculation (not used)
    self.amp_floor = 0.00001

    # number of frames to be kept for each trial
    # 750 frames are quite long for ASVspoof2019 LA with frame_shift = 10ms
    self.v_truncate_lens = [10 * 16 * 750 // x for x in self.frame_hops]

    # number of sub-models (by default, a single model)
    self.v_submodels = len(self.frame_lens)

    # dimension of the embedding vectors fed into the oc-softmax layer
    self.v_emd_dim = 256
    # output classes (1 for one-class softmax)
    self.v_out_class = 1

    ####
    # create network
    ####
    # backend
    self.m_model = []
    # frontend
    self.m_frontend = []
    # softmax layer for the backend
    self.m_a_softmax = []

    for idx, (trunc_len, fft_n, lfcc_dim) in enumerate(
            zip(self.v_truncate_lens, self.fft_n, self.lfcc_dim)):
        fft_n_bins = fft_n // 2 + 1
        if self.lfcc_with_delta:
            # static + delta + delta-delta components
            lfcc_dim = lfcc_dim * 3

        self.m_model.append(nii_resnet.ResNet(self.v_emd_dim))
        self.m_frontend.append(
            nii_front_end.LFCC(self.frame_lens[idx],
                               self.frame_hops[idx],
                               self.fft_n[idx],
                               self.m_target_sr,
                               self.lfcc_dim[idx],
                               with_energy=True))
        self.m_a_softmax.append(
            nii_oc_softmax.OCAngleLayer(self.v_emd_dim))

    self.m_model = torch_nn.ModuleList(self.m_model)
    self.m_frontend = torch_nn.ModuleList(self.m_frontend)
    self.m_a_softmax = torch_nn.ModuleList(self.m_a_softmax)

    # done
    return
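# ----------------------------------------------------------------------
# Sanity check of the derived sizes above (illustrative only, not part of
# the model): a minimal sketch in plain Python, assuming only the
# constants defined in the __init__ above.
frame_hops = [160]                   # 160 points = 10 ms at 16 kHz
trunc_lens = [10 * 16 * 750 // x for x in frame_hops]
assert trunc_lens == [750]           # 750 frames kept per trial at 10 ms shift

lfcc_dim, lfcc_with_delta = 20, True
feat_dim = lfcc_dim * 3 if lfcc_with_delta else lfcc_dim
assert feat_dim == 60                # static + delta + delta-delta LFCC
# ----------------------------------------------------------------------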
def __init__(self, in_dim, out_dim, args, mean_std=None):
    super(Model, self).__init__()

    ##### required part, no need to change #####
    # mean and std of input and output
    in_m, in_s, out_m, out_s = self.prepare_mean_std(
        in_dim, out_dim, args, mean_std)
    self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
    self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
    self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
    self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

    # flags for debugging (by default False, unused here)
    #self.model_debug = False
    #self.validation = False
    #####

    ####
    # on input waveform and output target
    ####
    # load protocol and prepare the target data for network training
    protocol_file = prj_conf.optional_argument[0]
    self.protocol_parser = protocol_parse(protocol_file)

    # working sampling rate
    # torchaudio may be used to change the sampling rate
    self.m_target_sr = 16000

    ####
    # optional configs (not used)
    ####
    # re-sampling (optional)
    #self.m_resampler = torchaudio.transforms.Resample(
    #    prj_conf.wav_samp_rate, self.m_target_sr)

    # vad (optional)
    #self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)

    # flag for balanced classes (temporary use)
    #self.v_flag = 1

    ####
    # front-end configuration
    # multiple front-end configurations may be used
    # by default, use a single front-end
    ####
    # frame shift (number of waveform points)
    self.frame_hops = [160]
    # frame length
    self.frame_lens = [320]
    # FFT length
    self.fft_n = [512]

    # dimension of the trainable linear filter-bank output
    #self.spec_with_delta = False
    self.spec_fb_dim = 60

    # window type
    self.win = torch.hann_window
    # floor in log-spectrum-amplitude calculation (not used)
    self.amp_floor = 0.00001

    # number of frames to be kept for each trial
    # no truncation
    self.v_truncate_lens = [None for x in self.frame_hops]

    # number of sub-models (by default, a single model)
    self.v_submodels = len(self.frame_lens)

    # dimension of embedding vectors
    self.v_emd_dim = 64
    # output classes
    self.v_out_class = 1

    ####
    # create network
    ####
    # 1st part of the classifier
    self.m_transform = []
    # sequence model applied before pooling
    self.m_before_pooling = []
    # 2nd part of the classifier
    self.m_output_act = []
    # front-end
    self.m_frontend = []
    # final part for training
    self.m_angle = []

    # it can handle models with multiple front-end configurations
    # by default, only a single front-end
    for idx, (trunc_len, fft_n) in enumerate(
            zip(self.v_truncate_lens, self.fft_n)):
        fft_n_bins = fft_n // 2 + 1

        self.m_transform.append(
            torch_nn.Sequential(
                TrainableLinearFb(fft_n, self.m_target_sr,
                                  self.spec_fb_dim),

                torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),
                torch_nn.BatchNorm2d(48, affine=False),

                torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(48, affine=False),
                torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(64, affine=False),
                torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Dropout(0.7)
            )
        )

        self.m_before_pooling.append(
            torch_nn.Sequential(
                nii_nn.BLSTMLayer((self.spec_fb_dim // 16) * 32,
                                  (self.spec_fb_dim // 16) * 32),
                nii_nn.BLSTMLayer((self.spec_fb_dim // 16) * 32,
                                  (self.spec_fb_dim // 16) * 32)
            )
        )

        self.m_output_act.append(
            torch_nn.Linear((self.spec_fb_dim // 16) * 32, self.v_emd_dim)
        )

        self.m_angle.append(
            nii_ocsoftmax.OCAngleLayer(self.v_emd_dim)
        )

        self.m_frontend.append(
            nii_front_end.Spectrogram(self.frame_lens[idx],
                                      self.frame_hops[idx],
                                      self.fft_n[idx],
                                      self.m_target_sr)
        )

    self.m_frontend = torch_nn.ModuleList(self.m_frontend)
    self.m_transform = torch_nn.ModuleList(self.m_transform)
    self.m_output_act = torch_nn.ModuleList(self.m_output_act)
    self.m_angle = torch_nn.ModuleList(self.m_angle)
    self.m_before_pooling = torch_nn.ModuleList(self.m_before_pooling)

    # done
    return
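# ----------------------------------------------------------------------
# Why the channel counts above halve after every activation: the LCNN
# blocks use max-feature-map (MFM), which splits the channel axis into
# two halves and keeps the element-wise maximum. The sketch below is a
# minimal stand-alone version, assuming nii_nn.MaxFeatureMap2D follows
# the standard Light-CNN definition (an assumption, not the project's
# actual code). It also explains the BLSTM width above:
# (spec_fb_dim // 16) * 32 = (60 // 16) * 32 = 96, since the four
# stride-2 poolings shrink the frequency axis by 16 and MFM leaves
# 32 channels.
import torch


class MaxFeatureMap2DSketch(torch.nn.Module):
    """Max-feature-map over channel pairs: (B, 2C, T, F) -> (B, C, T, F)."""
    def __init__(self, max_dim=1):
        super().__init__()
        self.max_dim = max_dim

    def forward(self, x):
        # split the channel axis into two halves, take the element-wise max
        shape = list(x.size())
        shape[self.max_dim] = shape[self.max_dim] // 2
        shape.insert(self.max_dim, 2)
        return x.view(*shape).max(self.max_dim)[0]


# e.g. the output of Conv2d(1, 64, ...) shrinks from 64 to 32 channels
_y = MaxFeatureMap2DSketch()(torch.randn(1, 64, 10, 60))
assert _y.shape == (1, 32, 10, 60)
# ----------------------------------------------------------------------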
def __init__(self, in_dim, out_dim, args, mean_std=None):
    super(Model, self).__init__()

    ##### required part, no need to change #####
    # mean and std of input and output
    in_m, in_s, out_m, out_s = self.prepare_mean_std(
        in_dim, out_dim, args, mean_std)
    self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
    self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
    self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
    self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

    # flags for debugging (by default False, unused here)
    #self.model_debug = False
    #self.validation = False
    #####

    ####
    # on input waveform and output target
    ####
    # load protocol and prepare the target data for network training
    protocol_file = prj_conf.optional_argument[0]
    self.protocol_parser = protocol_parse(protocol_file)

    # working sampling rate; torchaudio is used to change sampling rate
    self.m_target_sr = 16000

    ####
    # optional configs (not used)
    ####
    # re-sampling (optional)
    #self.m_resampler = torchaudio.transforms.Resample(
    #    prj_conf.wav_samp_rate, self.m_target_sr)

    # vad (optional)
    #self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)

    # flag for balanced classes (temporary use)
    #self.v_flag = 1

    ####
    # front-end configuration
    # multiple front-end configurations may be used
    # by default, use a single front-end
    ####
    # frame shift (number of waveform points)
    self.frame_hops = [160]
    # frame length
    self.frame_lens = [320]
    # FFT length
    self.fft_n = [512]

    # LFB dim (base component)
    self.lfb_dim = [60]
    self.lfb_with_delta = False

    # window type
    self.win = torch.hann_window
    # floor in log-spectrum-amplitude calculation
    self.amp_floor = 0.00001

    # number of frames to be kept for each trial
    # manually keep the first 750 frames of each trial (10 ms shift)
    self.v_truncate_lens = [10 * 16 * 750 // x for x in self.frame_hops]

    # number of sub-models
    self.v_submodels = len(self.frame_lens)

    # dimension of embedding vectors
    self.v_emd_dim = 256
    # output classes
    self.v_out_class = 1

    ####
    # create network
    ####
    self.m_transform = []
    self.m_output_act = []
    self.m_frontend = []
    self.m_a_softmax = []

    for idx, (trunc_len, fft_n, lfb_dim) in enumerate(
            zip(self.v_truncate_lens, self.fft_n, self.lfb_dim)):
        fft_n_bins = fft_n // 2 + 1
        if self.lfb_with_delta:
            # static + delta + delta-delta components
            lfb_dim = lfb_dim * 3

        self.m_transform.append(
            torch_nn.Sequential(
                torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),
                torch_nn.BatchNorm2d(48, affine=False),

                torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(48, affine=False),
                torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(64, affine=False),
                torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2])))

        self.m_output_act.append(
            torch_nn.Sequential(
                torch_nn.Dropout(0.7),
                torch_nn.Linear((trunc_len // 16) * (lfb_dim // 16) * 32,
                                512),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.Linear(256, self.v_emd_dim)))

        self.m_frontend.append(
            nii_front_end.LFB(self.frame_lens[idx],
                              self.frame_hops[idx],
                              self.fft_n[idx],
                              self.m_target_sr,
                              self.lfb_dim[idx],
                              with_energy=False,
                              with_emphasis=True,
                              with_delta=self.lfb_with_delta))

        self.m_a_softmax.append(
            nii_oc_softmax.OCAngleLayer(self.v_emd_dim))

    self.m_transform = torch_nn.ModuleList(self.m_transform)
    self.m_output_act = torch_nn.ModuleList(self.m_output_act)
    self.m_frontend = torch_nn.ModuleList(self.m_frontend)
    self.m_a_softmax = torch_nn.ModuleList(self.m_a_softmax)

    # done
    return
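# ----------------------------------------------------------------------
# Sanity check of the classifier input size above (illustrative only):
# the first Linear layer takes (trunc_len // 16) * (lfb_dim // 16) * 32
# features because the four stride-2 max-pooling stages shrink the time
# and frequency axes by 2**4 = 16 each, and the final max-feature-map
# leaves 32 channels; the MFM after the 512-unit layer halves its output
# to 256, hence Linear(256, v_emd_dim).
trunc_len, lfb_dim, channels = 750, 60, 32
flat_dim = (trunc_len // 16) * (lfb_dim // 16) * channels
assert flat_dim == 46 * 3 * 32 == 4416   # input size of the first Linear
assert 512 // 2 == 256                   # input size of the final Linear
# ----------------------------------------------------------------------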