def predict_acoustic(device, labels, acoustic_model, acoustic_in_scaler,
                     acoustic_out_scaler, binary_dict, continuous_dict,
                     subphone_features="coarse_coding", pitch_indices=None,
                     log_f0_conditioning=True):
    """Predict acoustic features from labels with a trained acoustic model.

    Args:
        device: target passed to ``Tensor.to`` for the model input.
        labels: labels handed to ``fe.linguistic_features``.
        acoustic_model: callable invoked as ``model(x, [num_frames])``.
        acoustic_in_scaler: object whose ``transform`` normalizes the inputs.
        acoustic_out_scaler: object whose ``inverse_transform`` de-normalizes
            the model output.
        binary_dict: binary question dictionary for feature extraction.
        continuous_dict: continuous question dictionary for feature extraction.
        subphone_features: sub-phone feature type for feature extraction.
        pitch_indices: iterable of column indices holding pitch features.
            Required when ``log_f0_conditioning`` is true.
        log_f0_conditioning: if true, pitch columns are replaced by an
            interpolated trajectory computed by ``_midi_to_hz``.

    Returns:
        ndarray: de-normalized model output.

    Raises:
        ValueError: if ``log_f0_conditioning`` is true and ``pitch_indices``
            is None.
    """
    # Fail fast with the same error predict_timelag uses, instead of the
    # opaque "'NoneType' object is not iterable" raised by the loop below.
    if log_f0_conditioning and pitch_indices is None:
        raise ValueError("Pitch feature indices must be specified!")

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True, subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)

    # Predict acoustic features: (T, D) -> (1, T, D)
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_acoustic = acoustic_model(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
    return pred_acoustic
def collect_features(self, wav_path, label_path):
    """Extract WORLD-based acoustic features with silence frames removed.

    The streams are stacked column-wise in the order mgc, lf0, vuv, bap,
    where mgc, lf0 and bap have been passed through ``apply_delta_windows``.
    ``frame_period``, ``order`` and ``windows`` are not defined in this
    block; they are read from the enclosing scope.

    Args:
        wav_path: path of the wav file read with ``wavfile.read``.
        label_path: path of the HTS label file used to trim and to drop
            silence frames.

    Returns:
        ndarray: float32 feature matrix.
    """
    sample_rate, waveform = wavfile.read(wav_path)
    waveform = waveform.astype(np.float64)

    # WORLD analysis: F0 (DIO + StoneMask), spectral envelope, aperiodicity
    coarse_f0, times = pyworld.dio(waveform, sample_rate, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(waveform, coarse_f0, times, sample_rate)
    envelope = pyworld.cheaptrick(waveform, refined_f0, times, sample_rate)
    band_ap = pyworld.code_aperiodicity(
        pyworld.d4c(waveform, refined_f0, times, sample_rate), sample_rate)
    mel_cep = pysptk.sp2mc(
        envelope, order=order, alpha=pysptk.util.mcepalpha(sample_rate))

    # log-F0 on voiced frames only; the V/UV flag is taken before the
    # unvoiced gaps are filled by interpolation
    f0_col = refined_f0[:, None]
    voiced = np.nonzero(f0_col)
    log_f0 = f0_col.copy()
    log_f0[voiced] = np.log(f0_col[voiced])
    voicing = (log_f0 != 0).astype(np.float32)
    log_f0 = interp1d(log_f0, kind="slinear")

    streams = (
        apply_delta_windows(mel_cep, windows),
        apply_delta_windows(log_f0, windows),
        voicing,
        apply_delta_windows(band_ap, windows),
    )
    stacked = np.hstack(streams)

    # Cut silence frames by HTS alignment
    alignment = hts.load(label_path)
    stacked = stacked[:alignment.num_frames()]
    stacked = np.delete(stacked, alignment.silence_frame_indices(), axis=0)
    return stacked.astype(np.float32)
def predict_duration(device, labels, duration_model, duration_in_scaler,
                     duration_out_scaler, lag, binary_dict, continuous_dict,
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict durations from labels with a trained duration model.

    Args:
        device: target passed to ``Tensor.to`` for the model input.
        labels: labels handed to ``fe.linguistic_features``.
        duration_model: callable invoked as ``model(x, [length])``.
        duration_in_scaler: object whose ``transform`` normalizes the inputs.
        duration_out_scaler: object whose ``inverse_transform`` de-normalizes
            the model output.
        lag: unused by this function; kept for interface compatibility.
        binary_dict: binary question dictionary for feature extraction.
        continuous_dict: continuous question dictionary for feature extraction.
        pitch_indices: iterable of column indices holding pitch features.
            Required when ``log_f0_conditioning`` is true.
        log_f0_conditioning: if true, pitch columns are replaced by an
            interpolated trajectory computed by ``_midi_to_hz``.

    Returns:
        ndarray: rounded durations, each at least 1.

    Raises:
        ValueError: if ``log_f0_conditioning`` is true and ``pitch_indices``
            is None.
    """
    # Fail fast with the same error predict_timelag uses, instead of the
    # opaque "'NoneType' object is not iterable" raised by the loop below.
    if log_f0_conditioning and pitch_indices is None:
        raise ValueError("Pitch feature indices must be specified!")

    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)

    # Apply model: (T, D) -> (1, T, D)
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_durations = duration_model(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)

    # Round first, then clamp. Clamping before rounding let predictions in
    # (0, 0.5] round down to a zero-length duration.
    pred_durations = np.round(pred_durations)
    pred_durations[pred_durations <= 0] = 1

    return pred_durations
def collect_features(self, wav_path):
    """Extract F0, log-F0, V/UV, aperiodicity, mel-cepstrum and spectrogram.

    The returned matrix stacks the streams column-wise in the order
    f0 (1), interpolated lf0 (1), vuv (1), bap, mgc, spec.

    Args:
        wav_path: path of the audio file, loaded with ``librosa.load`` at
            ``self.target_sr`` as mono float64.

    Returns:
        ndarray: float32 feature matrix with one row per analysis frame.
    """
    # x: raw audio, (sample_length,)
    x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)

    # f0: F0, (frame_length,)
    # Use the sampling rate actually returned by librosa for every WORLD
    # call so DIO and the later analysis steps always agree.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=self.hop_sz_in_ms)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    # lf0: log(f0) on voiced frames, zeros elsewhere
    lf0 = f0.copy()
    voiced = np.nonzero(f0)
    lf0[voiced] = np.log(f0[voiced])

    # vuv: voiced/unvoiced flag, (frame_length,)
    # Must be taken BEFORE interpolation: interp1d fills the unvoiced gaps,
    # after which `lf0 != 0` holds almost everywhere and the flag is useless.
    vuv = (lf0 != 0).astype(np.float32)

    # lf0 -> continuous log-F0
    lf0 = interp1d(lf0, kind="slinear")

    # spec: spectral envelope, (frame_length, dim)
    # bap: coded aperiodicity
    # mgc: mel-cepstrum of order 59
    spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

    # Stack features: f0, lf0, vuv, bap, mgc, spec
    features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None], bap, mgc, spec))
    return features.astype(np.float32)
def _process_feature(out_dir, index, wav_path, label_path):
    """Extract acoustic features for one utterance and save them to disk.

    Writes ``arctic_<index>.npy`` into ``out_dir``, appends the id to
    ``dataset_ids`` and re-dumps that list to ``dataset_ids.pkl`` in the
    parent directory of ``out_dir``. ``dataset_ids``, ``frame_period``,
    ``order`` and ``windows`` are not defined in this block; they are read
    from the enclosing scope.

    Args:
        out_dir: directory receiving the ``.npy`` feature file.
        index: integer used to build the output file name.
        wav_path: path of the wav file.
        label_path: path of the HTS label file.

    Returns:
        tuple: ``(acoustic_filename, n_frames, voiced_frames)`` where
        ``n_frames`` is the frame count before trimming and
        ``voiced_frames`` is the row count left after silence removal.

    Raises:
        AssertionError: if the wav or label directory contains no file with
            the expected extension.
    """
    # Sanity-check both directories up front. os.listdir() returns entries
    # in arbitrary order, so inspect every entry instead of only the first;
    # `or "."` keeps bare file names (empty dirname) working.
    wav_files = os.listdir(os.path.dirname(wav_path) or ".")
    assert any(name.endswith('.wav') for name in wav_files), "no wav files found!"
    lab_files = os.listdir(os.path.dirname(label_path) or ".")
    assert any(name.endswith('.lab') for name in lab_files), "no lab files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    # WORLD analysis
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order, alpha=pysptk.util.mcepalpha(fs))

    # log-F0 on voiced frames; V/UV is taken before interpolation
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic features to disk
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32), allow_pickle=False)

    # Record the id (file name without ".npy") and persist the running list
    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'), 'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example
    return (acoustic_filename, n_frames, voiced_frames)
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
                 binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
                 subphone_features="coarse_coding", log_f0_conditioning=True,
                 pitch_idx=None, num_windows=3, post_filter=True,
                 sample_rate=48000, frame_period=5, relative_f0=True):
    """Synthesize a waveform from predicted acoustic features with WORLD.

    Args:
        labels: labels handed to ``fe.linguistic_features`` (only used when
            ``relative_f0`` is true).
        acoustic_features: predicted multi-stream acoustic features.
        acoustic_out_scaler: output scaler; its ``var_`` is used for MLPG.
        binary_dict: binary question dictionary for feature extraction.
        continuous_dict: continuous question dictionary for feature extraction.
        stream_sizes: sizes of the (mgc, f0, vuv, bap) streams.
        has_dynamic_features: per-stream flags; MLPG runs if any is true.
        subphone_features: sub-phone feature type for feature extraction.
        log_f0_conditioning: unused by this function.
        pitch_idx: column index of the score pitch. Required when
            ``relative_f0`` is true.
        num_windows: number of delta windows passed to ``get_windows``.
        post_filter: whether to apply ``merlin_post_filter`` to the mgc.
        sample_rate: sampling rate used for vocoder parameters and synthesis.
        frame_period: frame period passed to ``pyworld.synthesize``.
        relative_f0: if true, the F0 stream is a log-F0 offset from the
            score pitch; otherwise it is treated as log-F0 itself.

    Returns:
        ndarray: waveform returned by ``pyworld.synthesize``.

    Raises:
        ValueError: if ``relative_f0`` is true and ``pitch_idx`` is None.
    """
    if relative_f0 and pitch_idx is None:
        raise ValueError("pitch_idx must be specified when relative_f0 is True!")

    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64), sample_rate, fftlen)

    # F0
    if relative_f0:
        diff_lf0 = target_f0
        # The pitch sequence has to be extracted from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True, subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        f0 = diff_lf0 + lf0_score
    else:
        # The stream is log-F0. Copy so the caller's array is not modified.
        f0 = target_f0.copy()

    # Both branches hold log-F0 here: zero the unvoiced frames and convert
    # the voiced ones to Hz. The non-relative branch used to skip this and
    # handed log-F0 straight to WORLD.
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        sample_rate, frame_period)

    return generated_waveform
def collect_features(self, path):
    """Load an HTS label file and return its linguistic features.

    When ``self.log_f0_conditioning`` is set, every column listed in
    ``self.pitch_idx`` is overwritten with the interpolated output of
    ``_midi_to_hz(..., True)``.

    Args:
        path: path of the HTS label file.

    Returns:
        ndarray: float32 linguistic feature matrix.
    """
    feats = fe.linguistic_features(
        hts.load(path),
        self.binary_dict,
        self.continuous_dict,
        add_frame_features=self.add_frame_features,
        subphone_features=self.subphone_features,
    )

    if not self.log_f0_conditioning:
        return feats.astype(np.float32)

    for col in self.pitch_idx:
        pitch_track = _midi_to_hz(feats, col, True)
        feats[:, col] = interp1d(pitch_track, kind="slinear")

    return feats.astype(np.float32)
def predict_timelag(device, labels, timelag_model, timelag_in_scaler,
                    timelag_out_scaler, binary_dict, continuous_dict,
                    pitch_indices=None, log_f0_conditioning=True,
                    allowed_range=None):
    """Predict note-level time-lags from labels.

    Args:
        device: target passed to ``Tensor.to`` for the model input.
        labels: label object; its ``round_()`` is called, which modifies it
            in place.
        timelag_model: callable invoked as ``model(x, [length])``.
        timelag_in_scaler: object whose ``transform`` normalizes the inputs.
        timelag_out_scaler: object whose ``inverse_transform`` de-normalizes
            the model output.
        binary_dict: binary question dictionary for feature extraction.
        continuous_dict: continuous question dictionary for feature extraction.
        pitch_indices: iterable of column indices holding pitch features.
            Required when ``log_f0_conditioning`` is true.
        log_f0_conditioning: if true, pitch columns are replaced by an
            interpolated trajectory computed by ``_midi_to_hz``.
        allowed_range: ``[min, max]`` clipping range in frames. Defaults to
            ``[-30, 30]``.

    Returns:
        ndarray: clipped time-lags multiplied by 50000 (frames -> 100 ns
        units; the constant corresponds to a 5 ms frame shift).

    Raises:
        ValueError: if ``log_f0_conditioning`` is true and ``pitch_indices``
            is None.
    """
    # Validate before touching `labels` so an invalid call does not leave
    # the caller's labels rounded.
    if log_f0_conditioning and pitch_indices is None:
        raise ValueError("Pitch feature indices must be specified!")
    # None sentinel instead of a shared mutable list default.
    if allowed_range is None:
        allowed_range = [-30, 30]

    # Round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)

    # Run model; .float() matches the other predict_* functions and guards
    # against a scaler that returns float64.
    x = torch.from_numpy(timelag_linguistic_features).float().unsqueeze(0).to(device)
    y = timelag_model(x, [x.shape[1]]).squeeze(0).cpu()

    # De-normalization and rounding
    lag = np.round(timelag_out_scaler.inverse_transform(y.data.numpy()))

    # Clip to the allowed range
    lag = np.clip(lag, allowed_range[0], allowed_range[1])

    # frames -> 100 ns
    lag *= 50000

    return lag
def get_acoustic_feature(lab_path, wav_path, sampling_rate, hop_size_in_ms,
                         mcep_order, windows):
    """Extract WORLD-based acoustic features with silence frames removed.

    The streams are stacked column-wise in the order mgc, lf0, vuv, bap,
    where mgc, lf0 and bap have been passed through ``apply_delta_windows``.

    Args:
        lab_path: path of the HTS label file.
        wav_path: path of the wav file.
        sampling_rate: target sampling rate; the audio is resampled when the
            file's rate differs.
        hop_size_in_ms: frame period passed to ``pyworld.dio``.
        mcep_order: mel-cepstrum order passed to ``pysptk.sp2mc``.
        windows: delta windows passed to ``apply_delta_windows``.

    Returns:
        ndarray or None: float32 feature matrix, or None when the labels
        cover more frames than were extracted from the audio.
    """
    fs, audio = wavfile.read(wav_path)
    audio = audio.astype(np.float64)

    if fs != sampling_rate:
        # Resample on a signal scaled by 1 / 2**15, then restore the
        # original amplitude scale so both paths feed WORLD the same range.
        # The rates are passed by keyword: they are keyword-only in
        # librosa >= 0.10.
        normalized = (audio / 2**15).astype(np.float32)
        normalized = librosa.resample(
            normalized, orig_sr=fs, target_sr=sampling_rate)
        audio = (normalized * 2**15).astype(np.float64)

    # Extract F0 and refine it
    f0, timeaxis = pyworld.dio(audio, sampling_rate, frame_period=hop_size_in_ms)
    f0 = pyworld.stonemask(audio, f0, timeaxis, sampling_rate)

    # Voiced/unvoiced flag
    vuv = (f0 > 0)[:, None].astype(np.float32)

    # Log-F0 on voiced frames
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])

    # Interpolate F0 in the log domain
    lf0 = interp1d(lf0, kind='slinear')[:, None]

    # Mel-cepstrum
    spectrogram = pyworld.cheaptrick(audio, f0, timeaxis, sampling_rate)
    mgc = pysptk.sp2mc(spectrogram, order=mcep_order,
                       alpha=pysptk.util.mcepalpha(sampling_rate))

    # Aperiodicity
    aperiodicity = pyworld.d4c(audio, f0, timeaxis, sampling_rate)
    bap = pyworld.code_aperiodicity(aperiodicity, sampling_rate)

    # Dynamic features
    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    feature = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(lab_path)
    feature = feature[:labels.num_frames()]
    if labels.num_frames() > len(feature):
        # The audio is shorter than the alignment: report "no features".
        return None
    indices = labels.silence_frame_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
def _extract_static_feats(wav, sr):
    """Extract static mgc / lf0 / vuv / bap features from a waveform.

    Uses a 5 ms frame period and a mel-cepstrum order of 59.

    Args:
        wav: waveform passed to the WORLD analysis functions.
        sr: sampling rate of ``wav``.

    Returns:
        tuple: ``(feats, stream_sizes)`` where ``feats`` is the float32
        column-wise stack of the four streams and ``stream_sizes`` lists
        the column count of each stream in the same order.
    """
    raw_f0, times = pyworld.dio(wav, sr, frame_period=5)
    envelope = pyworld.cheaptrick(wav, raw_f0, times, sr)
    ap = pyworld.d4c(wav, raw_f0, times, sr)
    mel_cep = pysptk.sp2mc(envelope, order=59, alpha=pysptk.util.mcepalpha(sr))

    # log-F0 on voiced frames; V/UV is taken before the gaps are filled
    f0_col = raw_f0[:, None]
    voiced = np.nonzero(f0_col)
    log_f0 = f0_col.copy()
    log_f0[voiced] = np.log(f0_col[voiced])
    voicing = (log_f0 != 0).astype(np.float32)
    log_f0 = interp1d(log_f0, kind="slinear")

    band_ap = pyworld.code_aperiodicity(ap, sr)

    streams = (mel_cep, log_f0, voicing, band_ap)
    feats = np.hstack(streams).astype(np.float32)
    stream_sizes = [stream.shape[1] for stream in streams]
    return feats, stream_sizes
def collect_features(self, wav_path, label_path):
    """Extract WORLD-based acoustic features with silence frames removed.

    The wav file is read with ``wavio``; only the first channel is used for
    multi-channel data. The streams are stacked column-wise in the order
    mgc, lf0, vuv, bap, where mgc, lf0 and bap have been passed through
    ``apply_delta_windows``. ``frame_period``, ``order`` and ``windows``
    are not defined in this block; they are read from the enclosing scope.

    Args:
        wav_path: path of the wav file.
        label_path: path of the HTS label file.

    Returns:
        ndarray: float32 feature matrix.
    """
    wav = wavio.read(wav_path)
    fs, x = wav.rate, wav.data
    print(fs, wav_path)

    # Keep only the first channel
    if x.ndim > 1:
        x = x[:, 0]
    x = x.astype(np.float64)

    # WORLD analysis
    coarse_f0, times = pyworld.dio(x, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(x, coarse_f0, times, fs)
    envelope = pyworld.cheaptrick(x, refined_f0, times, fs)
    band_ap = pyworld.code_aperiodicity(
        pyworld.d4c(x, refined_f0, times, fs), fs)
    mel_cep = pysptk.sp2mc(envelope, order=order, alpha=pysptk.util.mcepalpha(fs))

    # log-F0 on voiced frames; V/UV is taken before the gaps are filled
    f0_col = refined_f0[:, None]
    voiced = np.nonzero(f0_col)
    log_f0 = f0_col.copy()
    log_f0[voiced] = np.log(f0_col[voiced])
    voicing = (log_f0 != 0).astype(np.float32)
    log_f0 = interp1d(log_f0, kind="slinear")

    stacked = np.hstack((
        apply_delta_windows(mel_cep, windows),
        apply_delta_windows(log_f0, windows),
        voicing,
        apply_delta_windows(band_ap, windows),
    ))

    # Cut silence frames by HTS alignment
    alignment = hts.load(label_path)
    stacked = stacked[:alignment.num_frames()]
    silence = alignment.silence_frame_indices()
    if len(silence) > 0:
        stacked = np.delete(stacked, silence, axis=0)

    return stacked.astype(np.float32)
def predict_acoustic(device, labels, acoustic_model, acoustic_config,
                     acoustic_in_scaler, acoustic_out_scaler, binary_dict,
                     continuous_dict, subphone_features="coarse_coding",
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict acoustic features from labels, applying MLPG when configured.

    Args:
        device: target passed to ``Tensor.to`` for the model input.
        labels: labels handed to ``fe.linguistic_features``.
        acoustic_model: model exposing ``prediction_type()`` and
            ``inference(x, lengths)``.
        acoustic_config: config providing ``has_dynamic_features``,
            ``num_windows`` and ``stream_sizes``.
        acoustic_in_scaler: input scaler; inputs are additionally clipped to
            its ``feature_range`` when it is a ``MinMaxScaler``.
        acoustic_out_scaler: output scaler; ``inverse_transform`` and
            ``var_`` are used.
        binary_dict: binary question dictionary for feature extraction.
        continuous_dict: continuous question dictionary for feature extraction.
        subphone_features: sub-phone feature type for feature extraction.
        pitch_indices: iterable of column indices holding pitch features.
            Required when ``log_f0_conditioning`` is true.
        log_f0_conditioning: if true, pitch columns are replaced by an
            interpolated trajectory computed by ``_midi_to_hz``.

    Returns:
        ndarray: de-normalized acoustic features; static features only when
        any stream has dynamic features.

    Raises:
        ValueError: if ``log_f0_conditioning`` is true and ``pitch_indices``
            is None.
    """
    # Fail fast with the same error predict_timelag uses, instead of the
    # opaque "'NoneType' object is not iterable" raised by the loop below.
    if log_f0_conditioning and pitch_indices is None:
        raise ValueError("Pitch feature indices must be specified!")

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=True, subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # Clip to the feature range
        linguistic_features = np.clip(
            linguistic_features,
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1])

    # Predict acoustic features: (T, D) -> (1, T, D)
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        # (B, T, D_out); both branches below need the selected component
        max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
            log_pi, log_sigma, mu)
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2
                * acoustic_out_scaler.var_)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
def gen_waveform(labels, acoustic_features, binary_dict, continuous_dict,
                 stream_sizes, has_dynamic_features,
                 subphone_features="coarse_coding", log_f0_conditioning=True,
                 pitch_idx=None, num_windows=3, post_filter=True,
                 sample_rate=48000, frame_period=5, relative_f0=True):
    """Synthesize a waveform with WORLD and return the vocoder parameters.

    No MLPG is run here; ``acoustic_features`` is split using the static
    stream sizes, so it is expected to contain static features already.

    Args:
        labels: labels handed to ``fe.linguistic_features`` (only used when
            ``relative_f0`` is true).
        acoustic_features: multi-stream acoustic features (mgc, f0, vuv, bap).
        binary_dict: binary question dictionary for feature extraction.
        continuous_dict: continuous question dictionary for feature extraction.
        stream_sizes: stream sizes including dynamic features.
        has_dynamic_features: per-stream flags used to derive the static
            stream sizes.
        subphone_features: sub-phone feature type for feature extraction.
        log_f0_conditioning: unused by this function.
        pitch_idx: column index of the score pitch. Required when
            ``relative_f0`` is true.
        num_windows: number of delta windows passed to ``get_windows``.
        post_filter: whether to apply ``merlin_post_filter`` to the mgc.
        sample_rate: sampling rate used for vocoder parameters and synthesis.
        frame_period: frame period passed to ``pyworld.synthesize``.
        relative_f0: if true, the F0 stream is a log-F0 offset from the
            score pitch; otherwise it is treated as log-F0 itself.

    Returns:
        tuple: ``(f0, sp, bap, generated_waveform)`` where ``sp`` is the
        down-scaled spectral envelope coded to 60 dimensions.

    Raises:
        ValueError: if ``relative_f0`` is true and ``pitch_idx`` is None.
    """
    if relative_f0 and pitch_idx is None:
        raise ValueError("pitch_idx must be specified when relative_f0 is True!")

    windows = get_windows(num_windows)

    # Only the stream sizes are converted to their static counterparts
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64), sample_rate, fftlen)

    # Fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out-of-range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    # F0
    if relative_f0:
        diff_lf0 = target_f0
        # The pitch sequence has to be extracted from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True, subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        f0 = diff_lf0 + lf0_score
    else:
        # Copy: the masking below writes in place and must not modify the
        # caller's acoustic_features.
        f0 = target_f0.copy()

    # Zero the unvoiced frames and convert voiced log-F0 to Hz
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        sample_rate, frame_period)

    # Reduce the volume (to prevent clipping). This only affects the
    # returned `sp`; the waveform above has already been synthesized.
    # TODO: tune this multiplication constant properly
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform
def collect_features(self, wav_path, label_path):
    """Extract acoustic features and the time-aligned waveform.

    The F0 search range is derived from the score: two semitones below the
    lowest and above the highest note. The feature streams are stacked in
    the order mgc, f0 target, vuv, bap, where the f0 target is either the
    interpolated log-F0 or, with ``self.relative_f0``, its clipped
    difference from the score log-F0.

    Args:
        wav_path: path of the wav file.
        label_path: path of the HTS label file.

    Returns:
        tuple: ``(features, wave)`` with float32 features and the float32
        waveform scaled by ``1 / 2**15`` and cut or zero-padded to the
        length implied by the number of feature frames.
    """
    import warnings

    labels = hts.load(label_path)
    l_features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=True, subphone_features="coarse_coding")

    f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
    notes = l_features[:, self.pitch_idx]
    notes = notes[notes > 0]
    # Allow one tone above/below the score range
    min_f0 = librosa.midi_to_hz(min(notes) - 2)
    max_f0 = librosa.midi_to_hz(max(notes) + 2)
    assert max_f0 > min_f0

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=self.frame_period,
            f0_floor=min_f0, f0_ceil=max_f0)
    else:
        # Use the instance's frame period, as the harvest branch does; the
        # bare name `frame_period` is not defined in this method.
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=self.frame_period,
            f0_floor=min_f0, f0_ceil=max_f0)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=self.f0_floor)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                       alpha=pysptk.util.mcepalpha(fs))

    # F0 of speech
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    # Adjust lengths
    mgc = mgc[:labels.num_frames()]
    lf0 = lf0[:labels.num_frames()]
    vuv = vuv[:labels.num_frames()]
    bap = bap[:labels.num_frames()]

    if self.relative_f0:
        # F0 derived from the musical score
        f0_score = f0_score[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        # Relative F0, limited to one octave up/down
        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))
        f0_target = diff_lf0
    else:
        f0_target = lf0

    mgc = apply_delta_windows(mgc, self.windows)
    f0_target = apply_delta_windows(f0_target, self.windows)
    bap = apply_delta_windows(bap, self.windows)

    features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)

    # Align waveform and features
    wave = x.astype(np.float32) / 2**15
    T = int(features.shape[0] * (fs * self.frame_period / 1000))
    if len(wave) < T:
        if T - len(wave) > 100:
            # Warn instead of dropping into an interactive debugger, which
            # would hang non-interactive feature-extraction jobs.
            warnings.warn(
                "Waveform of {} is {} samples shorter than its features "
                "({} < {}); zero-padding.".format(
                    wav_path, T - len(wave), len(wave), T))
        wave = np.pad(wave, (0, T - len(wave)))
    assert wave.shape[0] >= T
    wave = wave[:T]

    return features, wave
def collect_features(self, wav_path, label_path):
    """Extract acoustic features with F0 relative to the musical score.

    The F0 search range is derived from the score pitch: 100 Hz above the
    highest note and 20 Hz below the lowest one, but not below
    ``self.f0_floor``. The streams are stacked in the order mgc, relative
    log-F0, vuv, bap.

    Args:
        wav_path: path of the wav file.
        label_path: path of the HTS label file.

    Returns:
        ndarray: float32 feature matrix.
    """
    labels = hts.load(label_path)
    l_features = fe.linguistic_features(
        labels, self.binary_dict, self.continuous_dict,
        add_frame_features=True, subphone_features="coarse_coding")

    f0_score = midi_to_hz(l_features, self.pitch_idx, False)
    # TODO: better to set the margin carefully
    max_f0 = int(max(f0_score)) + 100
    min_f0 = int(max(self.f0_floor, min(f0_score[f0_score > 0]) - 20))
    assert max_f0 > min_f0

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=self.frame_period,
            f0_floor=min_f0, f0_ceil=max_f0)
    else:
        # Use the instance's frame period, as the harvest branch does; the
        # bare name `frame_period` is not defined in this method.
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=self.frame_period,
            f0_floor=min_f0, f0_ceil=max_f0)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=self.f0_floor)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                       alpha=pysptk.util.mcepalpha(fs))

    # F0 of speech
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    # F0 derived from the musical score
    f0_score = f0_score[:, None]
    lf0_score = f0_score.copy()
    nonzero_indices = np.nonzero(f0_score)
    lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
    lf0_score = interp1d(lf0_score, kind="slinear")

    # Adjust lengths
    mgc = mgc[:labels.num_frames()]
    lf0 = lf0[:labels.num_frames()]
    vuv = vuv[:labels.num_frames()]
    bap = bap[:labels.num_frames()]

    # Relative F0, limited to one octave up/down
    diff_lf0 = lf0 - lf0_score
    diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

    mgc = apply_delta_windows(mgc, self.windows)
    diff_lf0 = apply_delta_windows(diff_lf0, self.windows)
    bap = apply_delta_windows(bap, self.windows)

    features = np.hstack((mgc, diff_lf0, vuv, bap))

    return features.astype(np.float32)
def test_interp1d():
    """Check that ``interp1d`` leaves no zero in a contour with one gap."""
    contour = np.random.rand(100, 1).astype(np.float32)

    # Punch a single zero (unvoiced frame) into the middle of the contour
    middle = len(contour) // 2
    contour[middle] = 0
    assert np.any(contour == 0)

    filled = interp1d(contour)
    assert np.all(filled != 0)
def gen_spsvs_static_features(
    labels,
    acoustic_features,
    binary_dict,
    numeric_dict,
    stream_sizes,
    has_dynamic_features,
    subphone_features="coarse_coding",
    pitch_idx=None,
    num_windows=3,
    frame_period=5,
    relative_f0=True,
    vibrato_scale=1.0,
    vuv_threshold=0.3,
    force_fix_vuv=True,
):
    """Generate static features from predicted acoustic features

    Args:
        labels (HTSLabelFile): HTS labels
        acoustic_features (ndarray): predicted acoustic features
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        stream_sizes (list): stream sizes
        has_dynamic_features (list): whether each stream has dynamic features
        subphone_features (str): subphone feature type
        pitch_idx (int): index of pitch features. Required when
            ``relative_f0`` is true.
        num_windows (int): number of windows
        frame_period (float): frame period
        relative_f0 (bool): whether to use relative f0
        vibrato_scale (float): vibrato scale
        vuv_threshold (float): vuv threshold
        force_fix_vuv (bool): whether to use post-processing to fix VUV.

    Returns:
        tuple: tuple of mgc, lf0, vuv and bap.

    Raises:
        ValueError: if ``relative_f0`` is true and ``pitch_idx`` is None.
        RuntimeError: if the number of streams is not 4, 5 or 6.
    """
    # The score pitch column is needed to rebuild absolute F0; report a
    # missing index up front rather than failing inside _midi_to_hz.
    if relative_f0 and pitch_idx is None:
        raise ValueError("pitch_idx must be specified when relative_f0 is True!")

    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, num_windows
        )
    else:
        static_stream_sizes = stream_sizes

    # Copy here to avoid inplace operations on input acoustic features
    acoustic_features = acoustic_features.copy()

    # Split multi-stream features
    streams = split_streams(acoustic_features, static_stream_sizes)

    if len(streams) == 4:
        mgc, target_f0, vuv, bap = streams
        vib, vib_flags = None, None
    elif len(streams) == 5:
        # Assuming diff-based vibrato parameters
        mgc, target_f0, vuv, bap, vib = streams
        vib_flags = None
    elif len(streams) == 6:
        # Assuming sine-based vibrato parameters
        mgc, target_f0, vuv, bap, vib, vib_flags = streams
    else:
        raise RuntimeError("Not supported streams")

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    # Correct V/UV based on special phone flags
    if force_fix_vuv:
        vuv = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # F0: both branches produce log-F0, which is then masked by V/UV and
    # converted to Hz
    if relative_f0:
        diff_lf0 = target_f0
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        f0 = diff_lf0 + lf0_score
    else:
        f0 = target_f0
    f0[vuv < vuv_threshold] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    if vib is not None:
        if vib_flags is not None:
            # Generate sine-based vibrato
            vib_flags = vib_flags.flatten()
            m_a, m_f = vib[:, 0], vib[:, 1]

            # Fill zeros for non-vibrato frames
            m_a[vib_flags < 0.5] = 0
            m_f[vib_flags < 0.5] = 0

            # Gen vibrato
            sr_f0 = int(1 / (frame_period * 0.001))
            f0 = gen_sine_vibrato(f0.flatten(), sr_f0, m_a, m_f, vibrato_scale)
        else:
            # Generate diff-based vibrato
            f0 = f0.flatten() + vibrato_scale * vib.flatten()

    # NOTE: Back to log-domain for convenience
    lf0 = f0.copy()
    lf0[np.nonzero(lf0)] = np.log(f0[np.nonzero(lf0)])
    # NOTE: interpolation is necessary
    lf0 = interp1d(lf0, kind="slinear")

    # Make sure both streams are returned as column vectors
    lf0 = lf0[:, None] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, None] if len(vuv.shape) == 1 else vuv

    return mgc, lf0, vuv, bap
def collect_features(self, wav_path, label_path):
    """Extract acoustic features (optionally with vibrato) and the waveform.

    The feature streams are stacked in the order mgc, f0 target, vuv, bap,
    followed by the vibrato parameters and vibrato flags when the selected
    ``self.vibrato_mode`` produces them.

    Args:
        wav_path: path of the wav file.
        label_path: path of the HTS label file.

    Returns:
        tuple: ``(features, wave)`` with float32 features and the float32
        waveform scaled by ``1 / 2**15`` and cut or zero-padded to the
        length implied by the number of feature frames.

    Raises:
        RuntimeError: on a sample-rate mismatch with ``self.sample_rate``,
            an unknown vibrato mode, or an unsupported combination of
            vibrato streams.
    """
    import warnings

    labels = hts.load(label_path)
    l_features = fe.linguistic_features(
        labels,
        self.binary_dict,
        self.continuous_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
    notes = l_features[:, self.pitch_idx]
    notes = notes[notes > 0]

    # Allow 200 cent upper/lower to properly handle F0 estimation of
    # preparation, vibrato and overshoot.
    # NOTE: set the minimum f0 to 63.5 Hz (125 - 3*20.5)
    # https://acoustics.jp/qanda/answer/50.html
    # NOTE: sinsy allows 30-150 cent frequency range for vibrato (as of 2010)
    # https://staff.aist.go.jp/m.goto/PAPER/SIGMUS201007oura.pdf
    min_f0 = max(63.5, librosa.midi_to_hz(min(notes) - 2))
    max_f0 = librosa.midi_to_hz(max(notes) + 2)
    assert max_f0 > min_f0

    # Workaround segfault issues of WORLD's CheapTrick
    min_f0 = min(min_f0, 500)

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if fs != self.sample_rate:
        raise RuntimeError(
            "Sample rate mismatch! {} != {}".format(fs, self.sample_rate)
        )

    if self.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
        )
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
        )
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    # Workaround for https://github.com/r9y9/nnsvs/issues/7
    f0 = np.maximum(f0, 0)

    # Correct V/UV (and F0) based on the musical score information:
    # treat frames where musical notes are not assigned as unvoiced
    if self.correct_vuv:
        # Use a smoothed mask so that we don't mask out overshoot or
        # something that could happen at the start/end of notes.
        # 0.5 sec. window (could be tuned for better results)
        win_length = int(0.5 / (self.frame_period * 0.001))
        mask = np.convolve(f0_score, np.ones(win_length) / win_length, "same")
        if len(f0) > len(mask):
            mask = np.pad(mask, (0, len(f0) - len(mask)), "constant")
        elif len(f0) < len(mask):
            mask = mask[: len(f0)]
        f0 = f0 * np.sign(mask)

    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)

    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if self.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)

    # F0 -> continuous F0
    lf0 = interp1d(lf0, kind="slinear")

    # Vibrato parameter extraction
    sr_f0 = int(1 / (self.frame_period * 0.001))
    if self.vibrato_mode == "sine":
        win_length = 64
        n_fft = 256
        threshold = 0.12

        if self.use_harvest:
            # NOTE: harvest is not supported here since the current
            # implemented algorithm relies on v/uv flags to find vibrato
            # sections. We use DIO since it provides more accurate v/uv
            # detection in my experience.
            _f0, _timeaxis = pyworld.dio(
                x,
                fs,
                frame_period=self.frame_period,
                f0_floor=min_f0,
                f0_ceil=max_f0,
            )
            _f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
            f0_smooth = extract_smoothed_f0(_f0, sr_f0, cutoff=8)
        else:
            f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=8)

        f0_smooth_cent = hz_to_cent_based_c4(f0_smooth)
        vibrato_likelihood = extract_vibrato_likelihood(
            f0_smooth_cent, sr_f0, win_length=win_length, n_fft=n_fft
        )
        vib_flags, m_a, m_f = extract_vibrato_parameters(
            f0_smooth_cent, vibrato_likelihood, sr_f0, threshold=threshold
        )
        m_a = interp1d(m_a, kind="linear")
        m_f = interp1d(m_f, kind="linear")
        vib = np.stack([m_a, m_f], axis=1)
        vib_flags = vib_flags[:, np.newaxis]
    elif self.vibrato_mode == "diff":
        # NOTE: vibrato is known to have 3 ~ 8 Hz range (in general)
        # remove higher frequency than 3 to separate vibrato from the
        # original F0
        f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=3)
        vib = (f0 - f0_smooth)[:, np.newaxis]
        vib_flags = None
    elif self.vibrato_mode == "none":
        vib, vib_flags = None, None
    else:
        raise RuntimeError("Unknown vibrato mode: {}".format(self.vibrato_mode))

    mgc = pysptk.sp2mc(
        spectrogram, order=self.mgc_order, alpha=pysptk.util.mcepalpha(fs)
    )

    # Post-processing for aperiodicity
    # ref: https://github.com/MTG/WGANSing/blob/mtg/vocoder.py
    if self.interp_unvoiced_aperiodicity:
        is_voiced = (vuv > 0).reshape(-1)
        # All unvoiced: nothing to interpolate from, so leave it untouched
        if np.any(is_voiced):
            for k in range(aperiodicity.shape[1]):
                aperiodicity[~is_voiced, k] = np.interp(
                    np.where(~is_voiced)[0],
                    np.where(is_voiced)[0],
                    aperiodicity[is_voiced, k],
                )
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    # Parameter trajectory smoothing
    if self.trajectory_smoothing:
        # Sampling rate of the feature trajectories, derived from the
        # configured frame period instead of a hard-coded 5 ms
        # (yields 200 for 5 ms, as before).
        modfs = int(round(1000.0 / self.frame_period))
        for d in range(mgc.shape[1]):
            mgc[:, d] = lowpass_filter(
                mgc[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
            )
        for d in range(bap.shape[1]):
            bap[:, d] = lowpass_filter(
                bap[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
            )

    # Adjust lengths
    mgc = mgc[: labels.num_frames()]
    lf0 = lf0[: labels.num_frames()]
    vuv = vuv[: labels.num_frames()]
    bap = bap[: labels.num_frames()]
    vib = vib[: labels.num_frames()] if vib is not None else None
    vib_flags = vib_flags[: labels.num_frames()] if vib_flags is not None else None

    if self.relative_f0:
        # F0 derived from the musical score
        f0_score = f0_score[:, None]
        if len(f0_score) > len(f0):
            print(
                "Warning! likely to have mistakes in alignment in {}".format(
                    label_path
                )
            )
            print(f0_score.shape, f0.shape)
            f0_score = f0_score[: len(f0)]

        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")
        # Relative F0, limited to one octave up/down
        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

        f0_target = diff_lf0
    else:
        f0_target = lf0

    mgc = apply_delta_windows(mgc, self.windows)
    f0_target = apply_delta_windows(f0_target, self.windows)
    bap = apply_delta_windows(bap, self.windows)
    vib = apply_delta_windows(vib, self.windows) if vib is not None else None

    if vib is None and vib_flags is None:
        features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)
    elif vib is not None and vib_flags is None:
        features = np.hstack((mgc, f0_target, vuv, bap, vib)).astype(np.float32)
    elif vib is not None and vib_flags is not None:
        features = np.hstack((mgc, f0_target, vuv, bap, vib, vib_flags)).astype(
            np.float32
        )
    else:
        raise RuntimeError("Unknown combination of features")

    # Align waveform and features
    wave = x.astype(np.float32) / 2 ** 15
    T = int(features.shape[0] * (fs * self.frame_period / 1000))
    if len(wave) < T:
        if T - len(wave) > int(fs * 0.005):
            # Warn instead of dropping into an interactive debugger, which
            # would hang non-interactive feature-extraction jobs.
            warnings.warn(
                "Waveform of {} is {} samples shorter than its features "
                "({} < {}); zero-padding.".format(
                    wav_path, T - len(wave), len(wave), T
                )
            )
        wave = np.pad(wave, (0, T - len(wave)))
    assert wave.shape[0] >= T
    wave = wave[:T]

    return features, wave
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Note that ``labels`` is modified in place: its start/end times are rounded
    via ``labels.round_()`` before note-level labels are extracted.

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features. Required when
            ``log_f0_conditioning`` is True. When None, no feature is treated
            as pitch-related by the input clipping step.
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag (``[-20, 20]`` if None)
        allowed_range_rest (list): allowed range of time-lag for rest
            (``[-40, 40]`` if None)
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions

    Raises:
        ValueError: if ``log_f0_conditioning`` is True but ``pitch_indices``
            is None.
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]

    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        # pitch_indices may legitimately be None when log-f0 conditioning is
        # disabled; in that case every feature is clipped.
        pitch_index_set = set(pitch_indices) if pitch_indices is not None else set()
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_index_set
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Run model
    # .float() matches the acoustic/duration predictors and guards against a
    # scaler that promotes the features to float64.
    x = torch.from_numpy(timelag_linguistic_features).float().unsqueeze(0).to(device)

    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            # Floor the variance before it is handed to MLPG
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range (rests use their own range)
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            lower, upper = allowed_range_rest[0], allowed_range_rest[1]
        else:
            lower, upper = allowed_range[0], allowed_range[1]
        pred_timelag[idx] = np.clip(pred_timelag[idx], lower, upper)

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
def predict_acoustic(
    device,
    labels,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict acoustic features from HTS labels

    MLPG is applied to the predicted features if the output features have
    dynamic features.

    Args:
        device (torch.device): device to use
        labels (HTSLabelFile): HTS labels
        acoustic_model (nn.Module): acoustic model
        acoustic_config (AcousticConfig): acoustic configuration
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler): input scaler
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler): output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        subphone_features (str): subphone feature type
        pitch_indices (list): indices of pitch features. Required when
            ``log_f0_conditioning`` is True. When None, no feature is treated
            as pitch-related by the input clipping step.
        log_f0_conditioning (bool): whether to use log f0 conditioning
        force_clip_input_features (bool): whether to force clip input features.
            Only takes effect when the input scaler is a ``MinMaxScaler``.

    Returns:
        ndarray: predicted acoustic features

    Raises:
        ValueError: if ``log_f0_conditioning`` is True but ``pitch_indices``
            is None.
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    if log_f0_conditioning:
        # Fail with an explicit message instead of "NoneType is not iterable"
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if force_clip_input_features and isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        # pitch_indices may be None when log-f0 conditioning is disabled;
        # in that case every feature is clipped.
        pitch_index_set = set(pitch_indices) if pitch_indices is not None else set()
        non_pitch_indices = [
            idx
            for idx in range(linguistic_features.shape[1])
            if idx not in pitch_index_set
        ]
        linguistic_features[:, non_pitch_indices] = np.clip(
            linguistic_features[:, non_pitch_indices],
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1],
        )

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    # Add a leading batch axis: (T, D_in) -> (1, T, D_in)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * acoustic_out_scaler.var_
            )
            # Floor the variance before it is handed to MLPG
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_acoustic = (
            acoustic_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic,
                acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )

    return pred_acoustic
def predict_duration(
    device,
    labels,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict phoneme durations from HTS labels

    Args:
        device (torch.device): device to run the model on
        labels (nnmnkwii.io.hts.HTSLabelFile): labels
        duration_model (nn.Module): duration model
        duration_config (dict): duration config
        duration_in_scaler (sklearn.preprocessing.MinMaxScaler): duration input scaler
        duration_out_scaler (sklearn.preprocessing.MinMaxScaler): duration output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        pitch_indices (list): indices of pitch features. Required when
            ``log_f0_conditioning`` is True. When None, no feature is treated
            as pitch-related by the input clipping step.
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        force_clip_input_features (bool): whether to clip input features

    Returns:
        np.ndarray or tuple: for non-probabilistic models, the predicted
        durations, rounded and with non-positive values replaced by 1.
        For probabilistic models, a tuple ``(max_mu, max_sigma_sq)`` of the
        denormalized mean and variance; no rounding is applied in that case.

    Raises:
        ValueError: if ``log_f0_conditioning`` is True but ``pitch_indices``
            is None.
        RuntimeError: if a probabilistic model is configured with dynamic
            features.
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    if log_f0_conditioning:
        # Fail with an explicit message instead of "NoneType is not iterable"
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features
    )
    if force_clip_input_features and isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        # pitch_indices may be None when log-f0 conditioning is disabled;
        # in that case every feature is clipped.
        pitch_index_set = set(pitch_indices) if pitch_indices is not None else set()
        non_pitch_indices = [
            idx
            for idx in range(duration_linguistic_features.shape[1])
            if idx not in pitch_index_set
        ]
        duration_linguistic_features[:, non_pitch_indices] = np.clip(
            duration_linguistic_features[:, non_pitch_indices],
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1],
        )

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    # Add a leading batch axis: (T, D_in) -> (1, T, D_in)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            raise RuntimeError(
                "Dynamic features are not supported for duration modeling"
            )
        # Apply denormalization
        max_sigma_sq = (
            max_sigma.squeeze(0).cpu().data.numpy() ** 2 * duration_out_scaler.var_
        )
        # Floor the variance so it never becomes exactly zero
        max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
        max_mu = duration_out_scaler.inverse_transform(
            max_mu.squeeze(0).cpu().data.numpy()
        )

        # Mean and variance are returned as-is, without rounding
        return max_mu, max_sigma_sq

    # (T, D_out)
    pred_durations = (
        duration_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
    )
    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    if np.any(duration_config.has_dynamic_features):
        # (T, D_out) -> (T, static_dim)
        pred_durations = multi_stream_mlpg(
            pred_durations,
            duration_out_scaler.var_,
            get_windows(duration_config.num_windows),
            duration_config.stream_sizes,
            duration_config.has_dynamic_features,
        )

    # Replace non-positive durations by 1 before rounding
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
win_length=win_length, n_fft=n_fft) results, m_a, m_f = extract_vibrato_parameters(f0_smooth_cent, vibrato_likelihood, sr_f0, threshold=threshold) fig, ax = plt.subplots(3, 1, figsize=(16, 12), sharex=True) ax[0].plot(timeaxis, f0, label="Original F0") ax[0].plot(timeaxis, f0_smooth, label="Smoothed F0") ax[0].plot(timeaxis, results * 15, "*", label="Vibrato sections") ax[0].set_ylim(12) ax[0].set_ylabel("Frequency [cent]") ax[0].legend() ax[0].set_title("F0") ax[1].plot(timeaxis, interp1d(m_a)) ax[1].set_title("m_a(t)") ax[1].set_ylabel("Frequency [cent]") ax[2].plot(timeaxis, interp1d(m_f)) ax[2].set_title("m_f(t)") ax[2].set_ylabel("Frequency [Hz]") plt.tight_layout() plt.show() # Let's reconstruct vibrato f0_no_vib = f0.copy() segments = nonzero_segments(f0) for s, e in segments: f0_no_vib[s:e] = lowpass_filter(f0[s:e], sr_f0, cutoff=1) f0_gen = gen_sine_vibrato(f0_no_vib, sr_f0, m_a, m_f)
def predict_timelag(device, labels, timelag_model, timelag_config,
                    timelag_in_scaler, timelag_out_scaler,
                    binary_dict, continuous_dict,
                    pitch_indices=None, log_f0_conditioning=True,
                    allowed_range=None, allowed_range_rest=None):
    """Predict note-level time-lag from HTS labels.

    Note that ``labels`` is modified in place: its start/end times are rounded
    via ``labels.round_()`` before note-level labels are extracted.

    Args:
        device (torch.device): device the input tensor is moved to
        labels: HTS-style labels; must support ``round_()`` and indexing by
            the note indices returned from ``get_note_indices``
        timelag_model: time-lag model exposing ``prediction_type()`` and
            ``inference(x, lengths)``
        timelag_config: config providing ``has_dynamic_features``,
            ``num_windows`` and ``stream_sizes``
        timelag_in_scaler: input feature scaler
        timelag_out_scaler: output feature scaler
        binary_dict (dict): binary feature dictionary
        continuous_dict (dict): continuous feature dictionary
        pitch_indices (list): indices of pitch features; required when
            ``log_f0_conditioning`` is True
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        allowed_range (list): ``[min, max]`` allowed time-lag for notes.
            Defaults to ``[-20, 20]`` when None.
        allowed_range_rest (list): ``[min, max]`` allowed time-lag for rests.
            Defaults to ``[-40, 40]`` when None.

    Returns:
        ndarray: time-lag predictions, rounded, clipped and multiplied by
        50000 (frames -> 100 ns units).

    Raises:
        ValueError: if ``log_f0_conditioning`` is True but ``pitch_indices``
            is None.
    """
    # None sentinels instead of list literals: mutable defaults are shared
    # across calls.
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]

    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features,
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Run model
    # .float() guards against a scaler that promotes the features to float64.
    x = torch.from_numpy(timelag_linguistic_features).float().unsqueeze(0).to(device)

    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        # Both branches below need the most probable mixture component
        max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
            log_pi, log_sigma, mu)
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2
                * timelag_out_scaler.var_)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range (rests use their own range)
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            lower, upper = allowed_range_rest[0], allowed_range_rest[1]
        else:
            lower, upper = allowed_range[0], allowed_range[1]
        pred_timelag[idx] = np.clip(pred_timelag[idx], lower, upper)

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
sp = pyworld.cheaptrick(x, f0, timeaxis, fs, fft_size=fft_len) # Spectrogram ap = pyworld.d4c(x, f0, timeaxis, fs, fft_size=fft_len) # Aperiodicity plt.subplot(3, 1, 1) plt.plot(f0) plt.subplot(3, 1, 2) plt.plot(lf0) plt.subplot(3, 1, 3) librosa.display(sp.T, sr=sr, hop_length=hop_length, y_axis='linear') plt.show() y = pyworld.synthesize(f0, sp, ap, fs, frame_period) play_audio(y) bap = pyworld.code_aperiodicity(aperiodicity, fs) mgc = pysptk.sp2mc(spectrogram, order=self.order, alpha=pysptk.util.mcepalpha(fs)) f0 = f0[:, None] lf0 = f0.copy() nonzero_indices = np.nonzero(f0) lf0[nonzero_indices] = np.log(f0[nonzero_indices]) vuv = (lf0 != 0).astype(np.float32) lf0 = interp1d(lf0, kind="slinear") mgc = apply_delta_windows(mgc, self.windows) lf0 = apply_delta_windows(lf0, self.windows) bap = apply_delta_windows(bap, self.windows)
def predict_duration(device, labels, duration_model, duration_config,
                     duration_in_scaler, duration_out_scaler, lag,
                     binary_dict, continuous_dict,
                     pitch_indices=None, log_f0_conditioning=True):
    """Predict phoneme durations from HTS labels.

    Args:
        device (torch.device): device the input tensor is moved to
        labels: HTS-style labels passed to ``fe.linguistic_features``
        duration_model: duration model exposing ``prediction_type()`` and
            ``inference(x, lengths)``
        duration_config: config providing ``has_dynamic_features``,
            ``num_windows`` and ``stream_sizes``
        duration_in_scaler: input feature scaler
        duration_out_scaler: output feature scaler
        lag: not used by this function; kept so existing callers that pass
            it positionally keep working
        binary_dict (dict): binary feature dictionary
        continuous_dict (dict): continuous feature dictionary
        pitch_indices (list): indices of pitch features; required when
            ``log_f0_conditioning`` is True
        log_f0_conditioning (bool): whether to use log-f0 conditioning

    Returns:
        ndarray: predicted durations, rounded, with non-positive values
        replaced by 1.

    Raises:
        ValueError: if ``log_f0_conditioning`` is True but ``pitch_indices``
            is None.
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        # Fail with an explicit message instead of "NoneType is not iterable"
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features,
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    # Add a leading batch axis: (T, D_in) -> (1, T, D_in)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        # Both branches below need the most probable mixture component
        max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
            log_pi, log_sigma, mu)
        if np.any(duration_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2
                * duration_out_scaler.var_)
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    # Replace non-positive durations by 1 before rounding
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations