def __init__(self, pre_params, n_quant):
    """Set up the MFCC front end and the buffers owned by this module.

    Args:
        pre_params: mapping of keyword arguments forwarded to
            ``mfcc.ProcessWav``. It must not contain ``name``, which is
            always passed as ``'mfcc'``.
        n_quant: side length of the identity matrix stored in the
            ``quant_onehot`` buffer.
    """
    super(PreProcess, self).__init__()

    wav_proc = mfcc.ProcessWav(**pre_params, name='mfcc')
    self.mfcc = wav_proc
    # Mirror the processor's `rf` attribute on this module
    # (presumably its receptive field -- confirm in mfcc.ProcessWav).
    self.rf = wav_proc.rf
    self.n_quant = n_quant

    # Row i of the identity matrix is the one-hot vector for index i.
    onehot_table = torch.eye(n_quant)
    self.register_buffer('quant_onehot', onehot_table)

    # Zero-element buffer whose only purpose is to let callers query the
    # device the model currently lives on.
    device_probe = torch.empty(0)
    self.register_buffer('dummy_buf', device_probe)
def __init__(self, hps):
    """Assemble the MFCC front end, the WaveNet and the training loss.

    Args:
        hps: hyperparameter object. This constructor reads
            ``sample_rate``, ``mfcc_win_sz``, ``mfcc_hop_sz``, ``n_mels``,
            ``n_mfcc`` and ``n_win_batch`` from it, and also hands the
            whole object to ``wn.WaveNet``.
    """
    super(MfccInverter, self).__init__()
    self.bn_type = 'none'

    # The same window/hop pair configures both the MFCC processor and the
    # virtual convolution that describes its geometry.
    window, hop = hps.mfcc_win_sz, hps.mfcc_hop_sz

    self.mfcc = mfcc.ProcessWav(
        sample_rate=hps.sample_rate,
        win_sz=window,
        hop_sz=hop,
        n_mels=hps.n_mels,
        n_mfcc=hps.n_mfcc,
    )

    # Root of the virtual-convolution chain (no parent); the WaveNet is
    # attached below it.
    root_vc = vconv.VirtualConv(
        filter_info=window,
        stride=hop,
        parent=None,
        name='MFCC',
    )

    self.wavenet = wn.WaveNet(hps, parent_vc=root_vc)
    self.objective = wn.RecLoss()
    self._init_geometry(hps.n_win_batch)
def _initialize(self):
    """Populate the instance from ``self.init_args`` and build its helpers.

    Runs the parent initializer, copies every entry of ``self.init_args``
    straight into the instance dictionary, then creates the jitter object,
    the MFCC processor and the virtual convolution describing the MFCC
    window geometry.
    """
    super(Slice, self).__init__()

    # Set before the bulk update so an entry in init_args can override it.
    self.target_device = None

    # Written directly to the instance dict, bypassing __setattr__.
    # The attributes read below (jitter_prob, sample_rate, mfcc_win_sz,
    # mfcc_hop_sz, n_mels, n_mfcc) are presumably supplied by init_args.
    vars(self).update(self.init_args)

    self.jitter = jitter.Jitter(self.jitter_prob)

    window, hop = self.mfcc_win_sz, self.mfcc_hop_sz
    self.mfcc_proc = mfcc.ProcessWav(
        sample_rate=self.sample_rate,
        win_sz=window,
        hop_sz=hop,
        n_mels=self.n_mels,
        n_mfcc=self.n_mfcc,
    )
    self.mfcc_vc = vconv.VirtualConv(
        filter_info=window,
        stride=hop,
        parent=None,
        name='MFCC',
    )
def __init__(self, n_mid, sample_rate_ms, win_length_ms, hop_length_ms,
             n_mels, n_mfcc):
    """Build the MFCC preprocessor and the convolutional stack.

    Args:
        n_mid: channel count used by every layer after the first.
        sample_rate_ms, win_length_ms, hop_length_ms, n_mels, n_mfcc:
            forwarded positionally, in this order, to ``mfcc.ProcessWav``.
    """
    super(Encoder, self).__init__()

    self.pre = mfcc.ProcessWav(sample_rate_ms, win_length_ms, hop_length_ms,
                               n_mels, n_mfcc)
    # The first layer's input width is whatever the preprocessor emits.
    n_in = self.pre.n_out

    # Layers are created in the same order they are applied.
    layers = [
        ConvReLURes(n_in, n_mid, 3, do_res=False),
        ConvReLURes(n_mid, n_mid, 3),
        # The only layer constructed with stride=2.
        ConvReLURes(n_mid, n_mid, 4, stride=2, do_res=False),
        ConvReLURes(n_mid, n_mid, 3),
        ConvReLURes(n_mid, n_mid, 3),
    ]
    # Four trailing layers with filter size 1.
    layers.extend(ConvReLURes(n_mid, n_mid, 1) for _ in range(4))

    self.net = nn.Sequential(*layers)
def convert(catalog, pfx, n_quant, sample_rate=16000, win_sz=400, hop_sz=160,
            n_mels=80, n_mfcc=13):
    """Convert a catalog of sound files into packed binary files plus an index.

    Three files are written, all named from ``pfx``:

    * ``pfx + '.dat'`` -- the mu-encoded samples of every file, concatenated.
    * ``pfx + '.mel'`` -- the output of ``mfcc_proc.func`` for every file,
      transposed to (T, C), flattened and concatenated.
    * ``pfx + '.ind'`` -- a pickled dict describing the two files above.

    Args:
        catalog: sized iterable of ``(voice_id, snd_path)`` pairs.
        pfx: path prefix for the three output files.
        n_quant: number of mu-law quantization levels; also selects the
            integer dtype used to store the samples.
        sample_rate: rate the audio is loaded at, and the first argument
            to ``mfcc.ProcessWav``.
        win_sz, hop_sz, n_mels, n_mfcc: remaining ``mfcc.ProcessWav``
            arguments; ``win_sz`` and ``hop_sz`` are also recorded in the
            index.
    """
    mfcc_proc = mfcc.ProcessWav(sample_rate, win_sz, hop_sz, n_mels, n_mfcc)

    # Use the narrowest integer type that can hold the quantized values.
    if n_quant <= 2**8:
        snd_dtype = np.uint8
    elif n_quant <= 2**15:
        snd_dtype = np.int16
    else:
        snd_dtype = np.int32

    snd_file = pfx + '.dat'
    ind_file = pfx + '.ind'
    mel_file = pfx + '.mel'

    ind = {'voice_id': [], 'n_snd_elem': [], 'n_mel_elem': [], 'snd_path': []}
    n_snd_elem = 0
    n_mel_elem = 0
    n_mel_chan = None
    n_total = len(catalog)

    with open(snd_file, 'wb') as snd_fh, open(mel_file, 'wb') as mel_fh:
        for (voice_id, snd_path) in catalog:
            # `sr` is passed by keyword: recent librosa releases no longer
            # accept it positionally.
            snd, _ = librosa.load(snd_path, sr=sample_rate)
            snd_mu = util.mu_encode_np(snd, n_quant).astype(snd_dtype)

            # mel: C, T  (n_mels, n_timesteps)
            # reshape to T, C and flatten
            mel = mfcc_proc.func(snd)
            if n_mel_chan is None:
                n_mel_chan = mel.shape[0]
            mel = mel.transpose((1, 0)).flatten()

            snd_fh.write(snd_mu.data)
            mel_fh.write(mel.data)

            ind['voice_id'].append(voice_id)
            ind['n_snd_elem'].append(snd.size)
            ind['n_mel_elem'].append(mel.size)
            ind['snd_path'].append(snd_path)

            n_done = len(ind['voice_id'])
            if n_done % 100 == 0:
                # `file=stderr` belongs to print(), not str.format(); the
                # original passed it to format(), which ignored it and sent
                # the progress message to stdout.
                print('Converted {} files of {}.'.format(n_done, n_total),
                      file=stderr)
                stderr.flush()

            n_snd_elem += snd.size
            n_mel_elem += mel.size

    with open(ind_file, 'wb') as ind_fh:
        index = {
            'window_size': win_sz,
            'hop_size': hop_sz,
            'n_snd_elem': n_snd_elem,
            'n_mel_elem': n_mel_elem,
            'n_mel_chan': n_mel_chan,
            'snd_dtype': snd_dtype,
            'n_quant': n_quant
        }
        # NOTE(review): `ind` also has 'n_snd_elem' and 'n_mel_elem' keys, so
        # this update replaces the scalar totals above with the per-file
        # lists. Left unchanged because readers of the .ind file may rely on
        # the current layout -- confirm before altering.
        index.update(ind)
        pickle.dump(index, ind_fh)