def __call__(self, aco_tensor):
    """Interpolate the FV and log-F0 columns of an ahocoder feature matrix.

    The last two columns of ``aco_tensor`` are replaced by their
    interpolated versions and one extra column (the flag returned by
    ``apt.interpolation`` for the log-F0 track) is appended at the end.

    Args:
        aco_tensor: [T, cc_order + 2] dimensional tensor, where T are
            frames. Column ``-2`` is the voiced frequency and column
            ``-1`` is the log-F0.

    Returns:
        Tensor with one more column than the input: the leading columns,
        interpolated FV, interpolated log-F0 (normalized through
        ``self.normalizer`` when ``self.stats`` is set) and the flag
        column, which is never passed through the normalizer.

    Raises:
        ValueError: if ``apt`` is None (ahoproc_tools not available).
    """
    if apt is None:
        raise ValueError('Please install ahoproc_tools to '
                         'process ahocoder data')

    # Voiced frequency lives in the second-to-last column. The flag that
    # comes back with it is discarded; only the log-F0 one is kept below.
    fv_column = aco_tensor[:, -2].contiguous().view(-1, 1)
    fv_filled, _ = apt.interpolation(fv_column.numpy(), self.fv_k)
    fv_tensor = torch.FloatTensor(fv_filled)
    if self.normalize:
        fv_tensor = fv_tensor / 1000

    # Log-F0 lives in the last column.
    lf0_column = aco_tensor[:, -1].contiguous().view(-1, 1)
    lf0_filled, flags = apt.interpolation(lf0_column.numpy(), self.lf0_k)
    if np.any(lf0_filled <= self.lf0_k):
        # Totally unvoiced segment: fall back to a constant minimum F0
        # (log of 60) and zero out every flag.
        lf0_filled = np.log(60) * np.ones(lf0_filled.shape)
        flags = np.zeros(flags.shape)
    lf0_tensor = torch.FloatTensor(lf0_filled)
    flags_tensor = torch.FloatTensor(np.array(flags, dtype=np.float32))

    # Rebuild the feature matrix, then append the flag as the +1 column
    # after the optional normalization.
    features = torch.cat((aco_tensor[:, :-2], fv_tensor, lf0_tensor), dim=1)
    if self.stats is not None:
        features = self.normalizer(features)
    return torch.cat((features, flags_tensor), dim=1)
def __call__(self, pkg, cached_file=None):
    """Attach frame-level prosodic features to ``pkg``.

    The features are log-F0, the flag returned by ``interpolation`` (uv),
    RMS energy and zero-crossing rate, stacked in that order along dim 0.
    When ``self.der_order > 0`` the delta features of orders
    ``1..self.der_order`` are appended along dim 0 as well.

    Args:
        pkg: input package; it is passed through ``format_package`` and
            must provide ``'chunk'`` (the waveform). When ``cached_file``
            is given it must also provide ``'chunk_beg_i'`` and
            ``'chunk_end_i'``.
        cached_file: optional path to pre-computed features. When given,
            the features are loaded with ``torch.load`` and sliced along
            dim 1 with the chunk boundaries divided by ``self.hop``
            instead of being computed.

    Returns:
        The package, with the features stored under ``pkg[self.name]`` and
        ``pkg['dec_resolution']`` overwritten with ``self.hop``.
    """
    pkg = format_package(pkg)
    wav = pkg['chunk'].data.numpy()
    max_frames = wav.shape[0] // self.hop
    if cached_file is not None:
        # load pre-computed data
        # NOTE(review): torch.load unpickles its input; only point this at
        # cache files produced by a trusted pipeline.
        proso = torch.load(cached_file)
        beg_i = pkg['chunk_beg_i'] // self.hop
        end_i = pkg['chunk_end_i'] // self.hop
        proso = proso[:, beg_i:end_i]
        pkg[self.name] = proso
    else:
        # first compute logF0 and voiced/unvoiced flag
        f0 = pysptk.swipe(wav.astype(np.float64),
                          fs=self.sr, hopsize=self.hop,
                          min=self.f0_min,
                          max=self.f0_max,
                          otype='f0')
        if len(f0) < max_frames:
            # Extend a short F0 track by repeating its trailing values so
            # it can be stacked with the other features.
            pad = max_frames - len(f0)
            f0 = np.concatenate((f0, f0[-pad:]), axis=0)
        # The small offset keeps log() finite where f0 is zero.
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = interpolation(lf0, -1)
        lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        if torch.sum(uv) == 0:
            # if frame is completely unvoiced, make lf0 min val
            lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
        # secondly obtain zcr
        zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                 frame_length=self.win,
                                                 hop_length=self.hop)
        zcr = torch.tensor(zcr.astype(np.float32))
        zcr = zcr[:, :max_frames]
        # finally obtain energy
        # librosa renamed feature.rmse to feature.rms and removed the old
        # name in 0.7; prefer the new one and fall back for old installs.
        rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
        egy = rms_fn(y=wav, frame_length=self.win,
                     hop_length=self.hop,
                     pad_mode='constant')
        egy = torch.tensor(egy.astype(np.float32))
        egy = egy[:, :max_frames]
        proso = torch.cat((lf0, uv, egy, zcr), dim=0)

        if self.der_order > 0:
            # Stack the static features with their deltas of each order.
            deltas = [proso]
            for n in range(1, self.der_order + 1):
                deltas.append(librosa.feature.delta(proso.numpy(), order=n))
            proso = torch.from_numpy(np.concatenate(deltas))

        pkg[self.name] = proso
    # Overwrite resolution to hop length
    pkg['dec_resolution'] = self.hop
    return pkg
def main(opts):
    """Convert ARFF feature files to ``.npy`` and optionally dump statistics.

    For every path in ``opts.arff_files`` the ARFF content is loaded, the
    ``F0_sma`` attribute is converted to log-F0 (non-positive values are
    replaced by the ``-1e10`` sentinel), the first two fields and the last
    field of each data point are dropped, the last remaining column is
    run through ``interpolation`` and the transposed array is saved next
    to the input file (same path without its extension; ``np.save`` adds
    ``.npy``).

    When ``opts.out_stats`` is not None, the per-column mean and standard
    deviation over the frames of ALL processed files are pickled to that
    path as ``{'mean': ..., 'std': ...}``.

    Args:
        opts: options object exposing ``arff_files`` (sized iterable of
            paths) and ``out_stats`` (output path or None).
    """
    # The accumulator must live outside the per-file loop: re-creating it
    # for each file made the statistics cover only the last file and left
    # it undefined when no file was given.
    X = []
    for afile in tqdm.tqdm(opts.arff_files, total=len(opts.arff_files)):
        with open(afile) as af:
            data = arff.load(af)
        attrs = [at[0] for at in data['attributes']]
        f0_idx = attrs.index('F0_sma')
        array = []
        for dpoint in data['data']:
            f0_val = dpoint[f0_idx]
            # Log-compress positive F0; flag everything else with the
            # sentinel that interpolation() is told to look for below.
            dpoint[f0_idx] = np.log(f0_val) if f0_val > 0 else -1e10
            # ignore name, timestamp and class
            array.append(dpoint[2:-1])
        array = np.array(array, dtype=np.float32)
        # NOTE(review): assumes F0_sma is the last column left after the
        # slicing above -- confirm against the ARFF layout.
        lf0, _ = interpolation(array[:, -1], -1e10)
        array[:, -1] = lf0
        if opts.out_stats is not None:
            X.append(array)
        npfile = os.path.splitext(afile)[0]
        np.save(npfile, array.T)
    # Nothing to summarize when no file was processed.
    if opts.out_stats is not None and X:
        X = np.concatenate(X, axis=0)
        mn = np.mean(X, axis=0)
        sd = np.std(X, axis=0)
        with open(opts.out_stats, 'wb') as out_f:
            pickle.dump({'mean': mn, 'std': sd}, out_f)
def __call__(self, pkg, cached_file=None):
    """Attach frame-level prosodic features to ``pkg['prosody']``.

    The features are log-F0, the flag returned by ``interpolation`` (uv),
    RMS energy and zero-crossing rate, stacked in that order along dim 0.

    Args:
        pkg: input package; it is passed through ``format_package`` and
            must provide ``'chunk'`` (the waveform). When ``cached_file``
            is given it must also provide ``'chunk_beg_i'`` and
            ``'chunk_end_i'``.
        cached_file: optional path to pre-computed features. When given,
            the features are loaded with ``torch.load`` and sliced along
            dim 1 with the chunk boundaries divided by ``self.hop``
            instead of being computed.

    Returns:
        The package, with the features stored under ``pkg['prosody']``.

    Raises:
        AssertionError: if the minimum computed log-F0 is not positive.
    """
    pkg = format_package(pkg)
    wav = pkg['chunk'].data.numpy()
    max_frames = wav.shape[0] // self.hop
    if cached_file is not None:
        # load pre-computed data
        # NOTE(review): torch.load unpickles its input; only point this at
        # cache files produced by a trusted pipeline.
        proso = torch.load(cached_file)
        beg_i = pkg['chunk_beg_i'] // self.hop
        end_i = pkg['chunk_end_i'] // self.hop
        proso = proso[:, beg_i:end_i]
        pkg['prosody'] = proso
    else:
        # first compute logF0 and voiced/unvoiced flag
        f0 = pysptk.swipe(wav.astype(np.float64),
                          fs=self.sr, hopsize=self.hop,
                          min=self.f0_min,
                          max=self.f0_max,
                          otype='f0')
        if len(f0) < max_frames:
            # Extend a short F0 track by repeating its trailing values;
            # otherwise the torch.cat below fails on mismatched frame
            # counts between lf0/uv and zcr/egy.
            pad = max_frames - len(f0)
            f0 = np.concatenate((f0, f0[-pad:]), axis=0)
        # The small offset keeps log() finite where f0 is zero.
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = interpolation(lf0, -1)
        lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        if torch.sum(uv) == 0:
            # if frame is completely unvoiced, make lf0 min val
            lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
        # Sanity check on the log-F0 track before stacking.
        assert lf0.min() > 0, lf0.data.numpy()
        # secondly obtain zcr
        zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                 frame_length=self.win,
                                                 hop_length=self.hop)
        zcr = torch.tensor(zcr.astype(np.float32))
        zcr = zcr[:, :max_frames]
        # finally obtain energy
        # librosa renamed feature.rmse to feature.rms and removed the old
        # name in 0.7; prefer the new one and fall back for old installs.
        rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
        egy = rms_fn(y=wav, frame_length=self.win,
                     hop_length=self.hop,
                     pad_mode='constant')
        egy = torch.tensor(egy.astype(np.float32))
        egy = egy[:, :max_frames]
        proso = torch.cat((lf0, uv, egy, zcr), dim=0)
        pkg['prosody'] = proso
    return pkg
def __call__(self, tensor):
    """Extract a dictionary of acoustic features from an audio tensor.

    Args:
        tensor (Tensor): Tensor of audio of size (samples x 1)

    Returns:
        dict with the keys ``'lf0'``, ``'uv'``, ``'mel_spec'``, ``'mfcc'``,
        ``'mag'``, ``'pha'``, ``'egy'``, ``'zcr'``, ``'wav'`` and
        ``'cwav'``. The per-frame entries are truncated to
        ``samples // self.hop_length`` frames. ``'cwav'`` is the input
        waveform; ``'wav'`` is a copy that each of the optional
        ``chopper`` / ``additive`` / ``clipping`` attributes, when present,
        distorts with probability 0.5.
    """
    # pysptk and interpolate are a MUST in this transform
    import pysptk
    from ahoproc_tools.interpolate import interpolation
    t_npy = tensor.cpu().squeeze(1).numpy()
    seqlen = t_npy.shape[0]
    T = seqlen // self.hop_length
    # compute LF0 and UV
    f0 = pysptk.swipe(t_npy.astype(np.float64),
                      fs=self.sr, hopsize=self.hop_length,
                      min=60, max=240,
                      otype="f0")[:T]
    # The small offset keeps log() finite where f0 is zero.
    lf0 = np.log(f0 + 1e-10)
    lf0, uv = interpolation(lf0, -1)
    if np.any(lf0 == np.log(1e-10)):
        # all lf0 goes to minf0 as a PAD symbol
        lf0 = np.ones(lf0.shape) * np.log(60)
        # all frames are unvoiced
        uv = np.zeros(uv.shape)
    ret = {
        'lf0': torch.FloatTensor(lf0).view(-1, 1),
        'uv': torch.FloatTensor(uv.astype(np.float32)).view(-1, 1),
    }
    tot_frames = T
    # MelSpectrum and MFCCs
    mel = self.mel(tensor).transpose(0, 1).squeeze(2)
    # do compression?
    if self.dynamic_norm_spec:
        mel = torch.log1p(mel * 10000) / torch.log(torch.FloatTensor([10]))
    ret['mel_spec'] = mel[:tot_frames]
    mfcc = librosa.feature.mfcc(y=t_npy, sr=self.sr,
                                n_fft=self.n_fft,
                                hop_length=self.hop_length,
                                n_mfcc=self.mfcc_order).T
    mfcc = mfcc[:tot_frames]
    ret['mfcc'] = torch.FloatTensor(mfcc)
    # Spectrogram abs magnitude [dB]
    spec = librosa.stft(t_npy, n_fft=self.n_fft,
                        hop_length=self.hop_length,
                        win_length=self.win_length,
                        window=self.window)
    # amplitude_to_db works on magnitudes; taking abs() explicitly yields
    # the same values while avoiding librosa's complex-input warning.
    spec_db = librosa.amplitude_to_db(np.abs(spec)).T
    spec_ang = np.angle(spec).T
    spec_db = spec_db[:tot_frames]
    spec_ang = spec_ang[:tot_frames]
    ret['mag'] = torch.FloatTensor(spec_db)
    ret['pha'] = torch.FloatTensor(spec_ang)
    # ZCR, E and lF0
    # librosa renamed feature.rmse to feature.rms and removed the old name
    # in 0.7; prefer the new one and fall back for old installs.
    rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    egy = rms_fn(y=t_npy, frame_length=self.win_length,
                 hop_length=self.hop_length,
                 pad_mode='constant').T
    egy = egy[:tot_frames]
    zcr = librosa.feature.zero_crossing_rate(y=t_npy,
                                             frame_length=self.win_length,
                                             hop_length=self.hop_length).T
    zcr = zcr[:tot_frames]
    ret['egy'] = torch.FloatTensor(egy)
    ret['zcr'] = torch.FloatTensor(zcr)
    # Build the (possibly distorted) waveform from a copy so the clean
    # input stays untouched. Each distortion is optional and is applied
    # with probability 0.5, drawn independently and in this order.
    ntensor = tensor.clone()
    if hasattr(self, 'chopper'):
        do_chop = random.random() > 0.5
        if do_chop:
            ntensor = self.chopper(ntensor, self.sr)
    if hasattr(self, 'additive'):
        do_add = random.random() > 0.5
        if do_add:
            ntensor = self.additive(ntensor.numpy(), self.sr)
    if hasattr(self, 'clipping'):
        do_clip = random.random() > 0.5
        if do_clip:
            ntensor = self.clipping(ntensor.numpy())
    ret['wav'] = ntensor.view((-1, 1))
    ret['cwav'] = tensor.view((-1, 1))
    return ret