# seq2seq.__init__ (RNN variant): builds the encoder/decoder pair plus a
# separation front-end, either Conv-TasNet or the mask-based SS model.
def __init__(self, config, input_emb_size, mix_speech_len, tgt_vocab_size, use_cuda,
             pretrain=None, score_fn=''):
    super(seq2seq, self).__init__()
    # Optionally warm-start both embedding tables from a pretrained checkpoint.
    if pretrain is not None:
        src_embedding = pretrain['src_emb']
        tgt_embedding = pretrain['tgt_emb']
    else:
        src_embedding = None
        tgt_embedding = None
    self.encoder = models.rnn_encoder(config, input_emb_size, None, embedding=src_embedding)
    if not config.shared_vocab:
        self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=tgt_embedding,
                                          score_fn=score_fn)
    else:
        # Shared vocabulary: the decoder reuses the encoder's embedding table.
        self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=self.encoder.embedding,
                                          score_fn=score_fn)
    self.use_cuda = use_cuda
    self.tgt_vocab_size = tgt_vocab_size
    self.config = config
    self.criterion = models.criterion(tgt_vocab_size, use_cuda, config.loss)
    self.loss_for_ss = nn.MSELoss()
    self.log_softmax = nn.LogSoftmax()  # an explicit dim= would silence PyTorch's deprecation warning
    self.wav_loss = models.WaveLoss(dBscale=1, nfft=config.FRAME_LENGTH, hop_size=config.FRAME_SHIFT)
    speech_fre = input_emb_size
    num_labels = tgt_vocab_size
    if config.use_tas:
        self.ss_model = models.ConvTasNet()
    else:
        self.ss_model = models.SS(config, speech_fre, mix_speech_len, num_labels)
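# --- Illustration (not part of the original file) ---------------------------
# What `shared_vocab` buys above: when enabled, the decoder is handed the
# encoder's embedding object itself, so source and target index one weight
# matrix and any update affects both sides. A minimal demonstration with plain
# nn.Embedding (sizes are arbitrary assumptions), runnable if uncommented:
#
#     import torch.nn as nn
#     enc_emb = nn.Embedding(100, 16)   # stands in for self.encoder.embedding
#     dec_emb = enc_emb                 # shared_vocab=True: the same object
#     assert dec_emb.weight is enc_emb.weight
# -----------------------------------------------------------------------------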
# Minimal seq2seq.__init__ (apparently an earlier revision): no waveform loss,
# no loss-type argument to the criterion, and the separator is always the
# mask-based SS model.
def __init__(self, config, input_emb_size, mix_speech_len, tgt_vocab_size, use_cuda,
             pretrain=None, score_fn=None):
    super(seq2seq, self).__init__()
    if pretrain is not None:
        src_embedding = pretrain['src_emb']
        tgt_embedding = pretrain['tgt_emb']
    else:
        src_embedding = None
        tgt_embedding = None
    self.encoder = models.rnn_encoder(config, input_emb_size, None, embedding=src_embedding)
    if not config.shared_vocab:
        self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=tgt_embedding,
                                          score_fn=score_fn)
    else:
        self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=self.encoder.embedding,
                                          score_fn=score_fn)
    self.use_cuda = use_cuda
    self.tgt_vocab_size = tgt_vocab_size
    self.config = config
    self.criterion = models.criterion(tgt_vocab_size, use_cuda)
    self.loss_for_ss = nn.MSELoss()
    self.log_softmax = nn.LogSoftmax()
    speech_fre = input_emb_size
    num_labels = tgt_vocab_size
    self.ss_model = models.SS(config, speech_fre, mix_speech_len, num_labels)
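# --- Illustration (not from the repo) ----------------------------------------
# The separation branch is trained with plain MSE (`self.loss_for_ss` above).
# A stand-alone sketch of that loss on dummy spectrogram tensors; the shapes
# (batch=2, frames=626, bins=257) are arbitrary assumptions:
#
#     import torch
#     import torch.nn as nn
#     loss_for_ss = nn.MSELoss()
#     predicted = torch.rand(2, 626, 257)  # predicted magnitude spectrogram
#     target = torch.rand(2, 626, 257)     # reference spectrogram
#     print(loss_for_ss(predicted, target).item())
# ------------------------------------------------------------------------------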
# seq2seq_music.__init__: music-separation variant. Note that this variant
# builds no encoder/decoder of its own; the pretrained embeddings are unpacked
# but never used here.
def __init__(self, config, input_emb_size, mix_speech_len, tgt_vocab_size, use_cuda,
             pretrain=None, score_fn=''):
    super(seq2seq_music, self).__init__()
    if pretrain is not None:
        src_embedding = pretrain['src_emb']
        tgt_embedding = pretrain['tgt_emb']
    else:
        src_embedding = None
        tgt_embedding = None
    self.use_cuda = use_cuda
    self.tgt_vocab_size = tgt_vocab_size
    self.config = config
    self.criterion = models.criterion(tgt_vocab_size, use_cuda, config.loss)
    self.loss_for_ss = nn.MSELoss()
    self.log_softmax = nn.LogSoftmax()
    self.wav_loss = models.WaveLoss(dBscale=1, nfft=config.FRAME_LENGTH, hop_size=config.FRAME_SHIFT)
    speech_fre = input_emb_size
    num_labels = tgt_vocab_size
    if config.use_tas:
        if self.config.use_dprnn:
            self.ss_model = models.FaSNet_base(config)
            # LSTM over the speakers' step.
            self.spk_lstm = nn.LSTMCell(self.ss_model.B + self.ss_model.N, self.ss_model.B)
        else:
            self.ss_model = models.ConvTasNet_music(config)
            if self.config.two_stage:
                # Second-stage refinement: freeze everything but the second
                # separator. Caution: self.encoder / self.decoder are never
                # assigned in this __init__, so they must be attached elsewhere
                # before two_stage is enabled.
                self.second_ss_model = models.ConvTasNet_2nd(config)
                for p in self.encoder.parameters():
                    p.requires_grad = False
                for p in self.decoder.parameters():
                    p.requires_grad = False
                for p in self.ss_model.parameters():
                    p.requires_grad = False
            # LSTM over the speakers' step.
            self.spk_lstm = nn.LSTMCell(self.ss_model.B + self.ss_model.N, self.ss_model.B)
    else:
        # self.ss_model = models.SS_att(config, speech_fre, mix_speech_len, num_labels)
        self.ss_model = models.SS(config, speech_fre, mix_speech_len, num_labels)
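# --- Illustration (not from the repo) ----------------------------------------
# The "LSTM over the speakers' step" above runs one nn.LSTMCell update per
# extracted speaker, carrying separator state from one speaker to the next.
# A minimal sketch, runnable if uncommented; B=64 and N=128 are arbitrary
# stand-ins for the separator's bottleneck/encoder sizes, not the repo's values:
#
#     import torch
#     import torch.nn as nn
#     B, N, n_speakers = 64, 128, 2
#     spk_lstm = nn.LSTMCell(B + N, B)
#     h, c = torch.zeros(1, B), torch.zeros(1, B)
#     for _ in range(n_speakers):
#         step_input = torch.randn(1, B + N)  # features for this speaker step
#         h, c = spk_lstm(step_input, (h, c))
# ------------------------------------------------------------------------------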
# Transformer variant of seq2seq.__init__: the RNN encoder/decoder are swapped
# for TransEncoder/TransDecoder; the commented-out lines below preserve the
# replaced RNN construction.
def __init__(self, config, input_emb_size, mix_speech_len, tgt_vocab_size, use_cuda,
             pretrain=None, score_fn=''):
    super(seq2seq, self).__init__()
    if pretrain is not None:
        src_embedding = pretrain['src_emb']
        tgt_embedding = pretrain['tgt_emb']
    else:
        src_embedding = None
        tgt_embedding = None
    # self.encoder = models.rnn_encoder(config, input_emb_size, None, embedding=src_embedding)
    self.encoder = models.TransEncoder(config, input_emb_size)
    self.decoder = models.TransDecoder(config, sos_id=0, eos_id=tgt_vocab_size - 1,
                                       n_tgt_vocab=tgt_vocab_size)
    # if config.shared_vocab == False:
    #     self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=tgt_embedding, score_fn=score_fn)
    # else:
    #     self.decoder = models.rnn_decoder(config, tgt_vocab_size, embedding=self.encoder.embedding,
    #                                       score_fn=score_fn)
    self.use_cuda = use_cuda
    self.tgt_vocab_size = tgt_vocab_size
    self.config = config
    self.criterion = models.criterion(tgt_vocab_size, use_cuda, config.loss)
    self.loss_for_ss = nn.MSELoss()
    self.log_softmax = nn.LogSoftmax()
    self.wav_loss = models.WaveLoss(dBscale=1, nfft=config.FRAME_LENGTH, hop_size=config.FRAME_SHIFT)
    speech_fre = input_emb_size
    num_labels = tgt_vocab_size
    if config.use_tas:
        if self.config.use_dprnn:
            self.ss_model = models.FaSNet_base(config)
            if self.config.two_stage:
                # Second-stage refinement: freeze the recognizer and the
                # first-stage separator; only second_ss_model stays trainable.
                self.second_ss_model = models.FaSNet_base_2nd(config)
                for p in self.encoder.parameters():
                    p.requires_grad = False
                for p in self.decoder.parameters():
                    p.requires_grad = False
                for p in self.ss_model.parameters():
                    p.requires_grad = False
        else:
            self.ss_model = models.ConvTasNet(config)
            if self.config.two_stage:
                self.second_ss_model = models.ConvTasNet_2nd(config)
                for p in self.encoder.parameters():
                    p.requires_grad = False
                for p in self.decoder.parameters():
                    p.requires_grad = False
                for p in self.ss_model.parameters():
                    p.requires_grad = False
    else:
        # self.ss_model = models.SS_att(config, speech_fre, mix_speech_len, num_labels)
        self.ss_model = models.SS(config, speech_fre, mix_speech_len, num_labels)
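# --- Illustration (not from the repo) ----------------------------------------
# The two_stage branches above repeat the same three freeze loops. A small
# helper (hypothetical, not in models/) would keep that logic in one place:
#
#     import torch.nn as nn
#
#     def freeze(*modules: nn.Module) -> None:
#         """Disable gradients for every parameter of the given modules."""
#         for m in modules:
#             for p in m.parameters():
#                 p.requires_grad = False
#
#     # inside __init__:  freeze(self.encoder, self.decoder, self.ss_model)
# ------------------------------------------------------------------------------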