def __init__(self, rnn_layers=2, rnn_units=128, win_len=400, win_inc=100, fft_len=512, win_type='hanning', mode='TCS'):
    """Build the complex-spectrum FSMN enhancement network.

    Assembles ConvSTFT/ConviSTFT analysis-synthesis front-ends, a complex
    linear projection followed by batch norm, ten stacked DFSMN layers
    (fsmn1..fsmn10) and a DeepFilter head.
    """
    super(Net, self).__init__()
    # STFT analysis/synthesis configuration.
    self.win_len = win_len
    self.win_inc = win_inc
    self.fft_len = fft_len
    self.win_type = win_type
    self.rnn_units = rnn_units
    self.input_dim = win_len
    self.output_dim = win_len
    self.hidden_layers = rnn_layers
    self.fix = fix = True
    self.stft = ConvSTFT(self.win_len, self.win_inc, fft_len, self.win_type, 'complex', fix=fix)
    self.istft = ConviSTFT(self.win_len, self.win_inc, fft_len, self.win_type, 'complex', fix=fix)

    feat_dim = self.fft_len // 2
    self.clp = ComplexLinearProjection(feat_dim)
    self.bn = nn.BatchNorm1d(feat_dim, affine=False)

    # One row per DFSMN layer: (out_dim, then the four positional DFSMN
    # options used by the original code). Only the last layer doubles its
    # output width. setattr registers each as a submodule exactly like a
    # direct attribute assignment would, in the same order.
    fsmn_specs = [
        (feat_dim, 3, 3, 1, 1),
        (feat_dim, 7, 3, 1, 1),
        (feat_dim, 7, 3, 1, 1),
        (feat_dim, 7, 3, 1, 1),
        (feat_dim, 7, 3, 1, 1),
        (feat_dim, 7, 3, 1, 1),
        (feat_dim, 7, 3, 0, 1),
        (feat_dim, 7, 3, 0, 1),
        (feat_dim, 7, 3, 0, 1),
        (feat_dim * 2, 3, 3, 0, 1),
    ]
    for layer_no, spec in enumerate(fsmn_specs, start=1):
        setattr(self, 'fsmn%d' % layer_no, DFSMN(feat_dim, rnn_units, *spec))

    self.df = DeepFilter(1, 2)
    show_params(self)
def __init__(self, spk_layers=14, sep_stack_size=10, sep_stack_num=4, latent_dim=512, numspks=2, overallspks=3, with_filter=True):
    """Assemble the WaveSplit model.

    Builds the speaker stack, the separation stack, an embedding k-means
    clusterer and the speaker loss. With ``with_filter`` a learned Conv1d
    encoder/decoder pair (kernel 16, stride 8) wraps the stacks; without it
    both stacks consume the raw 1-channel waveform directly.
    """
    super(WaveSplit, self).__init__()
    self.with_filter = with_filter

    if with_filter:
        # Learned waveform front-end; weights drawn from a normal init.
        self.encoder = nn.Conv1d(1, latent_dim, kernel_size=16, stride=8, bias=False)
        nn.init.normal_(self.encoder.weight)
        self.decoder = nn.ConvTranspose1d(latent_dim, 1, kernel_size=16, stride=8, bias=False)
        nn.init.normal_(self.decoder.weight)
        stack_in, sep_out = latent_dim, latent_dim
    else:
        # No front-end: stacks see the raw waveform, separation emits waveform.
        stack_in, sep_out = 1, 1

    self.speaker = SpeakerStack(in_dim=stack_in, out_dim=latent_dim, num_layers=spk_layers, numspks=numspks, latent_dim=latent_dim)
    self.separation = SeparationStack(in_dim=stack_in, out_dim=sep_out, num_stack=sep_stack_num, stack_size=sep_stack_size, latent_dim=latent_dim)

    self.kmean = KMeans(1, numspks, latent_dim, iter_nums=80)
    self.numspks = numspks
    self.loss_func = SpeakerLoss(latent_dim, numspks=numspks, overallspks=overallspks)
    self.latent_dim = latent_dim
    show_params(self)
def __init__(self, rnn_layers=2, rnn_units=128, win_len=400, win_inc=100, fft_len=512, win_type='hanning', mode='TCS'):
    """Build the real-spectrum FSMN enhancement network.

    Sets up ConvSTFT/ConviSTFT front-ends (feature type 'real') and twelve
    stacked DFSMN layers (fsmn1..fsmn12), all with the same input/output
    width of fft_len // 2.
    """
    super(Net, self).__init__()
    # STFT analysis/synthesis configuration.
    self.win_len = win_len
    self.win_inc = win_inc
    self.fft_len = fft_len
    self.win_type = win_type
    self.rnn_units = rnn_units
    self.input_dim = win_len
    self.output_dim = win_len
    self.hidden_layers = rnn_layers
    self.fix = fix = True
    self.stft = ConvSTFT(self.win_len, self.win_inc, fft_len, self.win_type, 'real', fix=fix)
    self.istft = ConviSTFT(self.win_len, self.win_inc, fft_len, self.win_type, 'real', fix=fix)

    feat_dim = self.fft_len // 2

    # The four positional DFSMN options for each of the twelve layers, in
    # order. setattr registers each layer as a submodule exactly like a
    # direct attribute assignment, preserving registration order.
    fsmn_specs = [
        (3, 3, 1, 1),
        (7, 3, 1, 1),
        (7, 3, 1, 1),
        (7, 3, 1, 1),
        (7, 3, 1, 1),
        (7, 3, 1, 1),
        (7, 3, 0, 1),
        (7, 3, 0, 1),
        (7, 3, 0, 1),
        (7, 3, 0, 1),
        (7, 3, 0, 1),
        (3, 3, 0, 1),
    ]
    for layer_no, spec in enumerate(fsmn_specs, start=1):
        setattr(self, 'fsmn%d' % layer_no, DFSMN(feat_dim, rnn_units, feat_dim, *spec))

    show_params(self)
def __init__(self, input_dim=257, output_dim=257, hidden_layers=2, hidden_units=512, left_context=1, right_context=1, kernel_size=6, kernel_num=9, dropout=0.2):
    """Build the CLDNN mask estimator.

    Pipeline: context-expanded linear input layer -> multi-layer GRU ->
    2-D conv + max-pool -> sigmoid output layer producing a
    (left_context + 1 + right_context) * output_dim mask per frame.

    BUG FIX: the original body called ``super(CLDNN, self).__init__()``
    twice; the redundant second call is removed. It also assigned the
    constructor argument to a misspelled ``self.kernel_sum``; the correctly
    named ``self.kernel_num`` is added, and the old attribute is kept as an
    alias so any existing callers keep working.
    """
    super(CLDNN, self).__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_layers = hidden_layers
    self.hidden_units = hidden_units
    self.left_context = left_context
    self.right_context = right_context
    self.kernel_size = kernel_size
    self.kernel_num = kernel_num
    self.kernel_sum = kernel_num  # backward-compat alias for the old typo

    # Number of stacked frames fed to (and produced by) the network.
    context = left_context + 1 + right_context

    self.input_layer = nn.Sequential(
        nn.Linear(context * input_dim, hidden_units),
        nn.Tanh())
    self.rnn_layer = nn.GRU(
        input_size=hidden_units,
        hidden_size=hidden_units,
        num_layers=hidden_layers,
        dropout=dropout,
    )
    # NOTE(review): bare `Conv2d` (no nn. prefix) — presumably a project
    # conv variant imported elsewhere in this file; left untouched.
    self.conv2d_layer = nn.Sequential(
        Conv2d(in_channels=1,
               out_channels=kernel_num,
               kernel_size=(kernel_size, kernel_size)),
        nn.Tanh(),
        nn.MaxPool2d(3, stride=1, padding=(1, 1)))
    self.output_layer = nn.Sequential(
        nn.Linear(hidden_units * kernel_num, context * self.output_dim),
        nn.Sigmoid())
    # Sum-reduced (not mean) MSE — training code presumably normalizes.
    self.loss_func = nn.MSELoss(reduction='sum')
    #self.loss_func = nn.MSELoss()
    show_model(self)
    show_params(self)
def __init__(
        self,
        rnn_layers=2,
        rnn_units=128,
        win_len=400,
        win_inc=100,
        fft_len=512,
        win_type='hanning',
        masking_mode='E',
        use_clstm=False,
        use_cbn=False,
        kernel_size=5,
        # NOTE(review): mutable default argument; harmless here because it is
        # only read ([2]+kernel_num builds a new list), never mutated.
        kernel_num=[16,32,64,128,256,256]
    ):
    '''
    Build the DCCRN (Deep Complex Convolutional Recurrent Network).

    rnn_layers: the number of lstm layers in the crn,
    rnn_units: for clstm, rnn_units = real+imag
    masking_mode: mask-application scheme used later in forward
    use_clstm: use the complex LSTM stack instead of a plain nn.LSTM
    use_cbn: use ComplexBatchNorm in place of nn.BatchNorm2d
    kernel_num: per-encoder-stage output channel counts
    '''
    super(DCCRN, self).__init__()
    # for fft
    self.win_len = win_len
    self.win_inc = win_inc
    self.fft_len = fft_len
    self.win_type = win_type
    input_dim = win_len
    output_dim = win_len
    self.rnn_units = rnn_units
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_layers = rnn_layers
    self.kernel_size = kernel_size
    #self.kernel_num = [2, 8, 16, 32, 128, 128, 128]
    #self.kernel_num = [2, 16, 32, 64, 128, 256, 256]
    # Prepend 2: the network input has two channels (real + imaginary).
    self.kernel_num = [2]+kernel_num
    self.masking_mode = masking_mode
    self.use_clstm = use_clstm
    #bidirectional=True
    bidirectional=False
    fac = 2 if bidirectional else 1  # output-width factor for the (B)LSTM
    fix=True
    self.fix = fix
    # Fixed (non-trainable) convolutional STFT / inverse STFT front-ends.
    self.stft = ConvSTFT(self.win_len, self.win_inc, fft_len, self.win_type, 'complex', fix=fix)
    self.istft = ConviSTFT(self.win_len, self.win_inc, fft_len, self.win_type, 'complex', fix=fix)
    self.encoder = nn.ModuleList()
    self.decoder = nn.ModuleList()
    # Encoder: one complex conv stage per channel transition, stride 2 along
    # frequency, so each stage halves the frequency resolution.
    for idx in range(len(self.kernel_num)-1):
        self.encoder.append(
            nn.Sequential(
                #nn.ConstantPad2d([0, 0, 0, 0], 0),
                ComplexConv2d(
                    self.kernel_num[idx],
                    self.kernel_num[idx+1],
                    kernel_size=(self.kernel_size, 2),
                    stride=(2, 1),
                    padding=(2, 1)
                ),
                nn.BatchNorm2d(self.kernel_num[idx+1]) if not use_cbn else ComplexBatchNorm(self.kernel_num[idx+1]),
                nn.PReLU()
            )
        )
    # Frequency bins left after all stride-2 encoder stages — TODO confirm
    # against the encoder output shape in forward().
    hidden_dim = self.fft_len//(2**(len(self.kernel_num)))
    if self.use_clstm:
        # Stack of complex LSTMs; the last layer projects back to the
        # flattened encoder feature size.
        rnns = []
        for idx in range(rnn_layers):
            rnns.append(
                NavieComplexLSTM(
                    input_size=hidden_dim*self.kernel_num[-1] if idx == 0 else self.rnn_units,
                    hidden_size=self.rnn_units,
                    bidirectional=bidirectional,
                    batch_first=False,
                    projection_dim=hidden_dim*self.kernel_num[-1] if idx == rnn_layers-1 else None,
                )
            )
        self.enhance = nn.Sequential(*rnns)
    else:
        self.enhance = nn.LSTM(
            input_size=hidden_dim*self.kernel_num[-1],
            hidden_size=self.rnn_units,
            num_layers=2,
            dropout=0.0,
            bidirectional=bidirectional,
            batch_first=False
        )
        # NOTE(review): 'tranform' is a typo for 'transform', kept because
        # forward() elsewhere in the file references this attribute name.
        self.tranform = nn.Linear(self.rnn_units * fac, hidden_dim*self.kernel_num[-1])
    # Decoder mirrors the encoder in reverse; input channels are doubled —
    # presumably for concatenated encoder skip connections (confirm in
    # forward()). The final stage (idx == 1) has no norm/activation.
    for idx in range(len(self.kernel_num)-1, 0, -1):
        if idx != 1:
            self.decoder.append(
                nn.Sequential(
                    ComplexConvTranspose2d(
                        self.kernel_num[idx]*2,
                        self.kernel_num[idx-1],
                        kernel_size=(self.kernel_size, 2),
                        stride=(2, 1),
                        padding=(2,0),
                        output_padding=(1,0)
                    ),
                    nn.BatchNorm2d(self.kernel_num[idx-1]) if not use_cbn else ComplexBatchNorm(self.kernel_num[idx-1]),
                    #nn.ELU()
                    nn.PReLU()
                )
            )
        else:
            self.decoder.append(
                nn.Sequential(
                    ComplexConvTranspose2d(
                        self.kernel_num[idx]*2,
                        self.kernel_num[idx-1],
                        kernel_size=(self.kernel_size, 2),
                        stride=(2, 1),
                        padding=(2,0),
                        output_padding=(1,0)
                    ),
                )
            )
    show_model(self)
    show_params(self)
    self.flatten_parameters()
def __init__(self, win_len=400, win_inc=100, fft_len=512, win_type='hanning', num_blocks=3, channel_amp=9, channel_phase=8, rnn_nums=300):
    """Build the two-stream PHASEN enhancement network.

    An amplitude stream and a phase stream are processed by ``num_blocks``
    TSB blocks; the amplitude stream is then flattened through a
    bidirectional GRU and fully-connected layers into a sigmoid mask.

    BUG FIXES relative to the original body:
      * ``self.num_blocks`` was hard-coded to 3, silently ignoring the
        ``num_blocks`` argument — it now honors the parameter.
      * ``phase_conv2`` used ``nn.Conv1d`` with ``kernel_size=[1, 1]``: a
        2-element kernel builds a 4-D weight that ``F.conv1d`` rejects at
        runtime. A 2-D 1x1 kernel on the (N, C, F, T) phase maps means
        ``nn.Conv2d`` was intended, matching ``amp_conv2`` above it.
      * The final linear layer's hard-coded 514 output is generalized to
        ``2 * self.feat_dim`` (identical for the default fft_len=512, but
        correct for other FFT sizes).
    """
    super(PHASEN, self).__init__()
    self.num_blocks = num_blocks
    self.feat_dim = fft_len // 2 + 1
    self.win_len = win_len
    self.win_inc = win_inc
    self.fft_len = fft_len
    self.win_type = win_type
    fix = True
    # Fixed (non-trainable) convolutional STFT / inverse STFT front-ends.
    self.stft = ConvSTFT(self.win_len, self.win_inc, self.fft_len,
                         self.win_type, feature_type='complex', fix=fix)
    self.istft = ConviSTFT(self.win_len, self.win_inc, self.fft_len,
                           self.win_type, feature_type='complex', fix=fix)
    # Amplitude stream entry: separable 7x1 / 1x7 convs on the 2-channel
    # (real, imag) spectrogram.
    self.amp_conv1 = nn.Sequential(
        nn.Conv2d(2, channel_amp, kernel_size=[7, 1], padding=(3, 0)),
        nn.BatchNorm2d(channel_amp),
        nn.ReLU(),
        nn.Conv2d(channel_amp, channel_amp, kernel_size=[1, 7], padding=(0, 3)),
        nn.BatchNorm2d(channel_amp),
        nn.ReLU(),
    )
    # Phase stream entry: no norm/activation, per the PHASEN design.
    self.phase_conv1 = nn.Sequential(
        nn.Conv2d(2, channel_phase, kernel_size=[3, 5], padding=(1, 2)),
        nn.Conv2d(channel_phase, channel_phase, kernel_size=[3, 25], padding=(1, 12)),
    )
    # Two-stream blocks with amplitude<->phase information exchange.
    self.tsbs = nn.ModuleList()
    for idx in range(self.num_blocks):
        self.tsbs.append(
            TSB(input_dim=self.feat_dim,
                channel_amp=channel_amp,
                channel_phase=channel_phase))
    self.amp_conv2 = nn.Sequential(
        nn.Conv2d(channel_amp, 8, kernel_size=[1, 1]),
        nn.BatchNorm2d(8),
        nn.ReLU(),
    )
    # 1x1 conv collapsing the phase stream to 2 channels (real, imag).
    self.phase_conv2 = nn.Sequential(
        nn.Conv2d(channel_phase, 2, kernel_size=[1, 1]))
    self.rnn = nn.GRU(self.feat_dim * 8, rnn_nums, bidirectional=True)
    self.fcs = nn.Sequential(nn.Linear(rnn_nums * 2, 600),
                             nn.ReLU(),
                             nn.Linear(600, 600),
                             nn.ReLU(),
                             nn.Linear(600, 2 * self.feat_dim),
                             nn.Sigmoid())
    show_params(self)