def __init__(self, input_dim, output_dim, up_sample,
             blstm_s = 64, cnn_kernel_s = 3):
    """Create the layers of the conditioning module.

    CondModule(input_dim, output_dim, up_sample,
               blstm_s=64, cnn_kernel_s=3)

    Three layers are created, in this order: a Bi-LSTM layer, a
    length-preserving conv1d layer, and an up-sampling layer.

    Args
    ----
      input_dim: int, last dimension of the input tensor, which
          should be (batchsize, len1, input_dim)
      output_dim: int, last dimension of the output tensor, which
          will be (batchsize, len2, output_dim)
      up_sample: int, up-sampling rate, len2 = len1 * up_sample
      blstm_s: int, layer size of the Bi-LSTM layer
      cnn_kernel_s: int, kernel size of the conv1d
    """
    super(CondModule, self).__init__()

    # keep the configuration on the instance
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.up_sample = up_sample
    self.blstm_s = blstm_s
    self.cnn_kernel_s = cnn_kernel_s

    # Bi-LSTM: input_dim -> blstm_s
    self.l_blstm = nii_nn.BLSTMLayer(input_dim, blstm_s)

    # conv1d: blstm_s -> output_dim, with kernel size cnn_kernel_s
    # NOTE(review): the third positional argument (1) is presumably the
    # dilation size of Conv1dKeepLength -- confirm against nii_nn
    self.l_conv1d = nii_nn.Conv1dKeepLength(
        blstm_s, output_dim, 1, cnn_kernel_s)

    # up-sampling by a factor of up_sample on output_dim features
    # NOTE(review): the meaning of the trailing True flag is defined by
    # nii_nn.UpSampleLayer -- confirm against nii_nn
    self.l_upsamp = nii_nn.UpSampleLayer(output_dim, up_sample, True)
def __init__(self, in_dim, out_dim, args, mean_std=None):
    """Set up data statistics, front-end settings and sub-networks.

    The method does three things:
      1. stores the input/output mean and std as non-trainable
         parameters,
      2. loads the protocol file and sets the front-end configuration
         (frame shift, frame length, FFT size, filter-bank dimension),
      3. for each front-end configuration, creates a spectrogram
         front-end, a convolutional transform with max-feature-map
         activations, two Bi-LSTM layers, a linear output layer and
         an OC-softmax angle layer.

    Args
    ----
      in_dim: forwarded to self.prepare_mean_std
      out_dim: forwarded to self.prepare_mean_std
      args: forwarded to self.prepare_mean_std
      mean_std: forwarded to self.prepare_mean_std (default None)
    """
    super(Model, self).__init__()

    ##### required part, no need to change #####

    # mean std of input and output
    in_m, in_s, out_m, out_s = self.prepare_mean_std(
        in_dim, out_dim, args, mean_std)
    self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
    self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
    self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
    self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

    # a flag for debugging (by default False)
    #self.model_debug = False
    #self.validation = False
    #####

    ####
    # on input waveform and output target
    ####
    # Load protocol and prepare the target data for network training
    protocol_file = prj_conf.optional_argument[0]
    self.protocol_parser = protocol_parse(protocol_file)

    # Working sampling rate
    #  torchaudio may be used to change sampling rate
    self.m_target_sr = 16000

    ####
    # optional configs (not used)
    ####
    # re-sampling (optional)
    #self.m_resampler = torchaudio.transforms.Resample(
    #    prj_conf.wav_samp_rate, self.m_target_sr)

    # vad (optional)
    #self.m_vad = torchaudio.transforms.Vad(sample_rate = self.m_target_sr)

    # flag for balanced class (temporary use)
    #self.v_flag = 1

    ####
    # front-end configuration
    #  multiple front-end configurations may be used
    #  by default, use a single front-end
    ####
    # frame shift (number of waveform points)
    self.frame_hops = [160]
    # frame length
    self.frame_lens = [320]
    # FFT length
    self.fft_n = [512]

    # spectrogram / filter-bank settings
    self.spec_with_delta = False
    self.spec_fb_dim = 60

    # window type
    self.win = torch.hann_window
    # floor in log-spectrum-amplitude calculating (not used)
    self.amp_floor = 0.00001

    # number of frames to be kept for each trial
    # no truncation
    self.v_truncate_lens = [None] * len(self.frame_hops)

    # number of sub-models (by default, a single model)
    self.v_submodels = len(self.frame_lens)

    # dimension of embedding vectors
    self.v_emd_dim = 64

    # output classes
    self.v_out_class = 1

    ####
    # create network
    ####
    # Feature dimension used by the Bi-LSTM and output layers.
    # NOTE(review): spec_fb_dim // 16 presumably reflects the four 2x2
    # max-pooling layers below, and 32 the channel count left after
    # the last MaxFeatureMap2D -- confirm against the forward pass
    lstm_dim = (self.spec_fb_dim // 16) * 32

    # Sub-networks are collected in local lists and registered on the
    # module once, as ModuleLists, after the loop.
    # 1st part of the classifier
    transforms = []
    # recurrent layers applied before pooling
    before_pooling = []
    # 2nd part of the classifier
    output_acts = []
    # front-end
    frontends = []
    # final part on training
    angles = []

    # it can handle models with multiple front-end configuration
    # by default, only a single front-end
    #
    # The construction order inside the loop (transform, Bi-LSTM,
    # linear, angle, front-end) is kept fixed because it determines
    # the order in which random initial weights are drawn.
    for idx, (_, fft_n) in enumerate(
            zip(self.v_truncate_lens, self.fft_n)):

        transforms.append(
            torch_nn.Sequential(
                TrainableLinearFb(
                    fft_n, self.m_target_sr, self.spec_fb_dim),

                torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),

                torch_nn.MaxPool2d([2, 2], [2, 2]),
                torch_nn.BatchNorm2d(48, affine=False),

                torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(48, affine=False),
                torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),

                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(64, affine=False),
                torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),

                torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.BatchNorm2d(32, affine=False),
                torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                nii_nn.MaxFeatureMap2D(),
                torch_nn.MaxPool2d([2, 2], [2, 2]),

                torch_nn.Dropout(0.7)
            )
        )

        before_pooling.append(
            torch_nn.Sequential(
                nii_nn.BLSTMLayer(lstm_dim, lstm_dim),
                nii_nn.BLSTMLayer(lstm_dim, lstm_dim)
            )
        )

        output_acts.append(
            torch_nn.Linear(lstm_dim, self.v_emd_dim)
        )

        angles.append(
            nii_ocsoftmax.OCAngleLayer(self.v_emd_dim)
        )

        frontends.append(
            nii_front_end.Spectrogram(self.frame_lens[idx],
                                      self.frame_hops[idx],
                                      self.fft_n[idx],
                                      self.m_target_sr)
        )

    # Register the sub-networks; the assignment order below fixes the
    # order of the sub-modules (and of their parameters) in the model.
    self.m_frontend = torch_nn.ModuleList(frontends)
    self.m_transform = torch_nn.ModuleList(transforms)
    self.m_output_act = torch_nn.ModuleList(output_acts)
    self.m_angle = torch_nn.ModuleList(angles)
    self.m_before_pooling = torch_nn.ModuleList(before_pooling)
    # done