import librosa
import numpy as np


def strikes_and_notes(path):
    y, fs = librosa.load(path)
    strikes = librosa.onset.onset_detect(y=y, sr=fs, units='samples')
    played_times = []
    played_notes = []
    for i in range(len(strikes)):
        if i == len(strikes) - 1:
            # Extrapolate the final window from the previous inter-onset interval.
            window = y[strikes[i]:min(len(y), 2 * strikes[i] - strikes[i - 1])]
        else:
            window = y[strikes[i]:strikes[i + 1]]
        f0, voiced_flag, voiced_probs = librosa.pyin(
            window,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=fs,
            fill_na=None)
        voiced = f0[~np.isnan(f0)]
        if len(voiced) > 0:
            played_notes.append(np.median(voiced))
            # Record this strike's onset time (the original appended the whole
            # onset-time array on every detected note).
            played_times.append(librosa.samples_to_time(strikes[i], sr=fs))
    return played_times, played_notes
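# A minimal usage sketch for strikes_and_notes, assuming a mono recording at a
# hypothetical path; librosa.hz_to_note maps each median f0 to a note name.
if __name__ == '__main__':
    times, freqs = strikes_and_notes('example.wav')  # hypothetical file
    for t, f in zip(times, freqs):
        print(f'{t:.3f}s: {librosa.hz_to_note(f)} ({f:.1f} Hz)')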
def process(self):
    try:
        audioFilePath = self.args["file_path"]
        sr, sig = wavread(audioFilePath)
        # wavread returns integer PCM; pyin expects floating-point samples.
        if np.issubdtype(sig.dtype, np.integer):
            sig = sig.astype(np.float32) / np.iinfo(sig.dtype).max
        target_sr = 22050  # librosa's default rate; the original's 22500 looks like a typo
        if sr != target_sr:
            # scipy.signal.resample takes a target sample count, not a rate.
            sig = resample(sig, int(len(sig) * target_sr / sr))
            sr = target_sr
        pitches, voiced_flag, voiced_probs = librosa.pyin(
            sig,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sr
        )  # recommended settings from docs are bad, need to figure out good ones
        # pitches = pitches[pitches != 0]
        yin_min_pitch = np.nanmin(pitches).item()
        yin_max_pitch = np.nanmax(pitches).item()
        yin_mean_pitch = np.nanmean(pitches).item()
        yin_median_pitch = np.nanmedian(pitches).item()
        print("yin_median_pitch", yin_median_pitch)
        print("yin pitches", pitches)
        return {
            'min_pitch_yin': yin_min_pitch,
            'max_pitch_yin': yin_max_pitch,
            'mean_pitch_yin': yin_mean_pitch,
            'median_pitch_yin': yin_median_pitch,
        }
    except Exception as e:
        print(e)
        return {
            'min_pitch_yin': "Measure Pitch Yin Failed",
            'max_pitch_yin': "Measure Pitch Yin Failed",
            'mean_pitch_yin': "Measure Pitch Yin Failed",
            'median_pitch_yin': "Measure Pitch Yin Failed",
        }
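# A small sanity check of the resampling step above, assuming
# scipy.signal.resample: it resamples by target *length*, so converting a
# 1-second 44100 Hz buffer to 22050 Hz means asking for 22050 samples.
import numpy as np
from scipy.signal import resample

buf = np.random.randn(44100)  # 1 s at 44100 Hz
out = resample(buf, int(len(buf) * 22050 / 44100))
assert len(out) == 22050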
def test_pitch_shift_transform_with_pitch_detection():
    """To check semi-tone values, check:
    http://www.homepages.ucl.ac.uk/~sslyjjt/speech/semitone.html"""
    source_frequency = 440
    max_semitone_shift = 4
    expected_frequency_shift = 554
    num_channels = 1
    audio = generate_waveform(sample_rate, num_samples, num_channels,
                              frequency=source_frequency)
    pitch_shift = PitchShift(
        n_samples=num_samples,
        sample_rate=sample_rate,
        pitch_shift_min=max_semitone_shift,
        pitch_shift_max=max_semitone_shift + 1,
    )
    t_audio = pitch_shift(audio)
    librosa_audio = t_audio[0].numpy()
    f0_hz, _, _ = librosa.pyin(librosa_audio, fmin=10, fmax=1000)
    # remove nan values:
    f0_hz = f0_hz[~np.isnan(f0_hz)]
    detected_f0_hz = np.max(f0_hz)
    # The detected frequency should differ from the expected frequency by less than 40 Hz.
    detection_threshold_in_hz = 40
    assert abs(detected_f0_hz - expected_frequency_shift) < detection_threshold_in_hz
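# The expected value above follows from the equal-temperament relation
# f = f0 * 2**(n/12); a quick check (not part of the original test):
expected = 440 * 2 ** (4 / 12)
print(round(expected, 2))  # 554.37, rounded to 554 in the test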
def extract_feature(audio, sr=44100):
    """
    Extract the features below:
        sig:
        rmse:
        silence:
        harmonic:
        pitch:
    audio: audio file path or audio array
    return feature_list: np array of [n_samples, n_features]
    """
    feature_list = []
    y = []
    if isinstance(audio, str):
        y, _ = librosa.load(audio, sr=sr)
    elif isinstance(audio, np.ndarray):
        y = audio

    # 1. sig
    sig_mean = np.mean(abs(y))
    feature_list.append(sig_mean)  # sig_mean
    feature_list.append(np.std(y))  # sig_std

    # 2. rmse
    rmse = librosa.feature.rms(y=y + 0.0001)[0]
    feature_list.append(np.mean(rmse))  # rmse_mean
    feature_list.append(np.std(rmse))  # rmse_std

    # 3. silence: fraction of frames whose RMS energy falls below 40% of the mean
    silence = 0
    for e in rmse:
        if e <= 0.4 * np.mean(rmse):
            silence += 1
    silence /= float(len(rmse))
    feature_list.append(silence)  # silence

    # 4. harmonic
    y_harmonic = librosa.effects.hpss(y)[0]
    feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)

    # 5. pitch (instead of auto_correlation)
    # The center-clipped signal is retained only for the legacy
    # autocorrelation path (commented out below).
    cl = 0.45 * sig_mean
    center_clipped = []
    for s in y:
        if s >= cl:
            center_clipped.append(s - cl)
        elif s <= -cl:
            center_clipped.append(s + cl)
        elif np.abs(s) < cl:
            center_clipped.append(0)
    # auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
    pitch, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'),
                               fmax=librosa.note_to_hz('C7'))
    pitch = [0 if math.isnan(p) else p for p in pitch]
    feature_list.append(np.mean(pitch))
    feature_list.append(np.std(pitch))

    return np.array(feature_list).reshape(1, -1)
def get_f0_series(snd_filename, fmin=VIOLIN_MIN_F, fmax=VIOLIN_MAX_F):
    """Extract f0 history from sound file."""
    y, sr = librosa.load(snd_filename)
    # Use the fmin/fmax arguments (the original ignored them and re-read the globals).
    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)
    return y, f0
def wav2f0(y, sr):
    # Pass sr through (the original accepted sr but let pyin assume 22050 Hz).
    f0, voiced_flag, voiced_probs = librosa.pyin(y,
                                                 fmin=librosa.note_to_hz('C2'),
                                                 fmax=librosa.note_to_hz('C6'),
                                                 sr=sr)
    # f0 = np.nan_to_num(f0)  # get rid of nans
    f0_times = librosa.times_like(f0, sr=sr)
    # D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    return f0, voiced_flag, voiced_probs, f0_times
def track_pitch(x, sr, frame_length, fmin=165, fmax=1500):
    # Peak-normalize before tracking so pyin sees a consistent level.
    x = np.asarray(x, dtype=float)
    x_arr = x / np.max(np.abs(x))
    return librosa.pyin(x_arr, sr=sr, frame_length=int(frame_length),
                        fmin=fmin, fmax=fmax)[0]
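# A quick self-check for track_pitch, assuming a pure 220 Hz tone; the tracked
# median should land near the true frequency (a sketch, not from the source).
import numpy as np

sr = 22050
t = np.arange(sr) / sr
tone = 0.1 * np.sin(2 * np.pi * 220 * t)
f0 = track_pitch(tone, sr, frame_length=2048)
print(np.nanmedian(f0))  # ~220 Hz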
def fundamental_frequency(self, session, cache=True):
    if not cache or self.f0 is None:
        data, rate = librosa.load(io.BytesIO(self.content), sr=48000)
        data = data.astype(np.float64)  # np.float was removed in NumPy 1.24
        f0, _, _ = librosa.pyin(data,
                                fmin=librosa.note_to_hz('C2'),
                                fmax=librosa.note_to_hz('C7'),
                                sr=48000)
        self.f0 = f0.tobytes()
        session.commit()
    return np.frombuffer(self.f0, dtype=np.float64)
def main():
    wavfile_dir = Path(args.ljspeech_dir) / "wavs"
    wavfile_list = list(wavfile_dir.glob('*.wav'))
    target_dir = Path(args.ljspeech_dir)

    # Create target dirs <LJSpeech_base_dir>/energies and <LJSpeech_base_dir>/pitches if necessary
    if not Path(target_dir / "energies").exists():
        print(f"Creating target directory: {target_dir/'energies'}")
        Path(target_dir / "energies").mkdir()
    if not Path(target_dir / "pitches").exists():
        print(f"Creating target directory: {target_dir/'pitches'}")
        Path(target_dir / "pitches").mkdir()

    if tqdm is not None:
        wavfile_list = tqdm(wavfile_list)

    for count, file_ in enumerate(wavfile_list):
        basename = Path(file_).stem
        pitch_path = target_dir / "pitches" / f"{basename}.npy"
        energy_path = target_dir / "energies" / f"{basename}.npy"
        if pitch_path.exists() and energy_path.exists():
            continue

        audio, sr = librosa.load(file_, sr=22050)

        # Calculate f0
        # Please note that fmin and fmax are good approximations for the speaker
        # in LJSpeech and may not generalize to other speakers
        f0, _, _ = librosa.pyin(audio, fmin=80, fmax=800, frame_length=1024,
                                sr=sr, fill_na=0.0)

        # Calculate energy
        stft_amplitude = np.abs(
            librosa.stft(audio, n_fft=1024, hop_length=256, win_length=1024))
        energy = np.linalg.norm(
            stft_amplitude, axis=0)  # axis=0 since librosa.stft -> (freq bins, frames)

        # Check shapes before writing so mismatched files are never saved
        # (the original asserted only after saving).
        assert energy.shape == f0.shape

        # Save to new files
        np.save(pitch_path, f0)
        np.save(energy_path, energy)

        if tqdm is None and count % 1000 == 0:
            print(f"Finished processing {count} wav files...")

    print(f"Finished energy extraction for a total of {len(wavfile_list)} wav files.")
def compute_f0(y, fmin, fmax, frame_length, win_length, hop_length):
    # fill_na=fmin keeps unvoiced frames at a positive value so np.log below stays finite.
    f0, _, _ = librosa.pyin(y, fmin=fmin, fmax=fmax, fill_na=fmin,
                            frame_length=frame_length, win_length=win_length,
                            hop_length=hop_length)
    f0 = f0.astype(np.float32)
    f0 = np.log(f0)
    f0 = np.expand_dims(f0, -1)
    return f0
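# A minimal call sketch for compute_f0 on a synthetic tone; the windowing
# values here are hypothetical, not from the source (pyin's default sr of
# 22050 Hz is assumed since compute_f0 does not take sr).
import numpy as np
import librosa

sr = 22050
y = 0.1 * np.sin(2 * np.pi * 220 * np.arange(sr) / sr)  # 1 s test tone
log_f0 = compute_f0(y, fmin=librosa.note_to_hz('C2'),
                    fmax=librosa.note_to_hz('C7'),
                    frame_length=2048, win_length=1024, hop_length=512)
print(log_f0.shape)  # (n_frames, 1); values are log-Hz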
def pitch_estimation(self, wavpath):
    if os.path.exists(wavpath):
        y, sr = librosa.load(wavpath)
        f0, voiced_flag, voiced_probs = librosa.pyin(
            y, fmin=librosa.note_to_hz('B3'), fmax=librosa.note_to_hz('C5'))
        f0 = f0[~np.isnan(f0)]  # keep voiced frames only
        times = librosa.times_like(f0)
        # Fitting a constant is a least-squares mean of the voiced f0 track,
        # so the (now uniformly respaced) times do not affect the result.
        level = optimize.curve_fit(lambda x, b: b, times, f0)[0]
        pitch = np.around(level[0], decimals=3).astype(float)
        return pitch
def estimate_entire_root(audio: np.array,
                         sr: int,
                         min_note: str = 'C1',
                         max_note: str = 'C7',
                         frame_length: float = 4096,
                         win_length: ty.Optional[float] = None,
                         length_units: LengthUnit = LengthUnit.samples) -> str:
    """Get root note of the entire audio array.

    Parameters
    ----------
    audio : np.array
    sr : int
        Samplerate
    min_note : str, optional
        Middle C is 'C4'
    max_note : str, optional
    frame_length : int, optional
        Samples by default
    win_length : ty.Optional[int]
        Samples by default, None = frame_length/2
    length_units : LengthUnit, optional
        can be samples or ms

    Returns
    -------
    str: note name
    """
    if length_units is LengthUnit.ms:
        # Convert ms -> samples; the original passed the unit arguments in the
        # reverse order of how get_first_null_f0 below calls length_convert.
        frame_length = length_convert(frame_length, sr, LengthUnit.ms,
                                      LengthUnit.samples)
        if win_length is not None:
            win_length = length_convert(win_length, sr, LengthUnit.ms,
                                        LengthUnit.samples)
    f0s, v_flag, v_prob = lr.pyin(
        audio,
        fmin=lr.note_to_hz(min_note),
        fmax=lr.note_to_hz(max_note),
        sr=sr,
        win_length=None if win_length is None else win_length,
        frame_length=frame_length,
    )
    clean = f0s[v_flag]  # voiced frames only (the original double-negated the flag)
    median = ty.cast(float, np.median(clean))
    return hz_to_note(median)
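# A usage sketch for estimate_entire_root, assuming a short A2 sine (110 Hz);
# the values are illustrative, not from the source.
import numpy as np

sr = 22050
tone = 0.1 * np.sin(2 * np.pi * 110 * np.arange(sr) / sr)
print(estimate_entire_root(tone, sr))  # expected: 'A2'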
def analyze_wave(model, wave, path):
    print(len(wave))
    if len(wave) < int(2 * 44100):
        image_path = make_plot_wave(wave, path)
        wave_pred = model_prediction(model, wave)
        if wave_pred == 0:
            flash("Your phonation is breathy")
        if wave_pred == 1:
            flash("Your phonation is balanced")
        if wave_pred == 2:
            flash("Your phonation is pressed")
    else:
        chunk_size = 22050
        # np.ceil of an integer floor division was a no-op; keep floor semantics.
        n_chunks = len(wave) // chunk_size
        class_array = np.zeros(n_chunks, dtype=int)
        chunk_lab = np.zeros(n_chunks, dtype=int)
        for chunk in range(len(chunk_lab)):
            # within this bucket, compare the average pitch to the start and end pitches;
            # if the standard deviation is within a quarter tone of the mean, then good
            wavelet = wave[chunk * chunk_size:(chunk + 1) * chunk_size]
            f0_chunk, voiced_flag, voiced_probs = librosa.pyin(
                y=wavelet, sr=44100,
                fmin=librosa.note_to_hz('C3'),
                fmax=librosa.note_to_hz('C5'))
            avg = np.nanmean(f0_chunk)
            std = np.nanstd(f0_chunk)
            if np.count_nonzero(voiced_flag == False) > 8:
                chunk_lab[chunk] = 0  # not voiced
            else:
                if (avg * (35 / 36) < avg - std) and (avg * (36 / 35) > avg + std):
                    # within a semitone: classify this piece
                    chunk_lab[chunk] = 2  # voiced and stable pitch
                    class_array[chunk] = model_prediction(model, wavelet)
                else:
                    # don't classify
                    chunk_lab[chunk] = 1  # voiced and pitch change
        image_path = make_plot_pitches(wave, chunk_lab, chunk_size, class_array, path)
    return image_path
def estimate_pitch(wav, mel_len, method='pyin', normalize_mean=None,
                   normalize_std=None, n_formants=1):
    if type(normalize_mean) is float or type(normalize_mean) is list:
        normalize_mean = torch.tensor(normalize_mean)
    if type(normalize_std) is float or type(normalize_std) is list:
        normalize_std = torch.tensor(normalize_std)

    if method == 'pyin':
        snd, sr = librosa.load(wav)
        pitch_mel, voiced_flag, voiced_probs = librosa.pyin(
            snd,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            frame_length=1024)
        assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0

        pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel)
        pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0)
        pitch_mel = F.pad(pitch_mel, (0, mel_len - pitch_mel.size(1)))

        if n_formants > 1:
            raise NotImplementedError
    else:
        raise ValueError

    pitch_mel = pitch_mel.float()

    if normalize_mean is not None:
        assert normalize_std is not None
        pitch_mel = normalize_pitch(pitch_mel, normalize_mean, normalize_std)

    return pitch_mel
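# normalize_pitch is referenced above but not defined in this snippet; a
# plausible sketch, assuming mean/std normalization that leaves zeroed
# (unvoiced) frames at zero (hypothetical, not the source implementation):
def normalize_pitch(pitch, mean, std):
    zeros = (pitch == 0.0)
    pitch = (pitch - mean) / std
    pitch[zeros] = 0.0  # keep unvoiced frames neutral
    return pitch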
def __getitem__(self, index):
    sample = self.data[index]

    # Let's keep audio name and all internal directories in rel_audio_path_as_text_id
    # to avoid any collisions
    rel_audio_path = Path(sample["audio_filepath"]).relative_to(
        self.base_data_dir).with_suffix("")
    rel_audio_path_as_text_id = str(rel_audio_path).replace("/", "_")

    # Load audio
    features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
    audio, audio_length = features, torch.tensor(features.shape[0]).long()

    # Load text
    text = torch.tensor(sample["text_tokens"]).long()
    text_length = torch.tensor(len(sample["text_tokens"])).long()

    # Load mel if needed
    log_mel, log_mel_length = None, None
    if LogMel in self.sup_data_types_set:
        mel_path = sample["mel_filepath"]
        if mel_path is not None and Path(mel_path).exists():
            log_mel = torch.load(mel_path)
        else:
            mel_path = self.log_mel_folder / f"{rel_audio_path_as_text_id}.pt"
            if mel_path.exists():
                log_mel = torch.load(mel_path)
            else:
                log_mel = self.get_log_mel(audio)
                torch.save(log_mel, mel_path)
        log_mel = log_mel.squeeze(0)
        log_mel_length = torch.tensor(log_mel.shape[1]).long()

    # Load durations if needed
    durations = None
    if Durations in self.sup_data_types_set:
        durations = self.durs[index]

    # Load alignment prior matrix if needed
    align_prior_matrix = None
    if AlignPriorMatrix in self.sup_data_types_set:
        if self.use_beta_binomial_interpolator:
            mel_len = self.get_log_mel(audio).shape[2]
            align_prior_matrix = torch.from_numpy(
                self.beta_binomial_interpolator(mel_len, text_length.item()))
        else:
            prior_path = self.align_prior_matrix_folder / f"{rel_audio_path_as_text_id}.pt"
            if prior_path.exists():
                align_prior_matrix = torch.load(prior_path)
            else:
                mel_len = self.get_log_mel(audio).shape[2]
                align_prior_matrix = beta_binomial_prior_distribution(text_length, mel_len)
                align_prior_matrix = torch.from_numpy(align_prior_matrix)
                torch.save(align_prior_matrix, prior_path)

    # Load pitch if needed
    pitch, pitch_length = None, None
    if Pitch in self.sup_data_types_set:
        pitch_path = self.pitch_folder / f"{rel_audio_path_as_text_id}.pt"
        if pitch_path.exists():
            pitch = torch.load(pitch_path).float()
        else:
            pitch, _, _ = librosa.pyin(
                audio.numpy(),
                fmin=self.pitch_fmin,
                fmax=self.pitch_fmax,
                frame_length=self.win_length,
                sr=self.sample_rate,
                fill_na=0.0,
            )
            pitch = torch.from_numpy(pitch).float()
            torch.save(pitch, pitch_path)

        if self.pitch_mean is not None and self.pitch_std is not None and self.pitch_norm:
            pitch -= self.pitch_mean
            pitch[pitch == -self.pitch_mean] = 0.0  # Zero out values that were previously zero
            pitch /= self.pitch_std

        pitch_length = torch.tensor(len(pitch)).long()

    # Load energy if needed
    energy, energy_length = None, None
    if Energy in self.sup_data_types_set:
        energy_path = self.energy_folder / f"{rel_audio_path_as_text_id}.pt"
        if energy_path.exists():
            energy = torch.load(energy_path).float()
        else:
            spec = self.get_spec(audio)
            energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
            torch.save(energy, energy_path)
        energy_length = torch.tensor(len(energy)).long()

    # Load speaker id if needed
    speaker_id = None
    if SpeakerID in self.sup_data_types_set:
        speaker_id = torch.tensor(sample["speaker_id"]).long()

    return (
        audio,
        audio_length,
        text,
        text_length,
        log_mel,
        log_mel_length,
        durations,
        align_prior_matrix,
        pitch,
        pitch_length,
        energy,
        energy_length,
        speaker_id,
    )
def __getitem__(self, index):
    spec = None
    sample = self.data[index]

    features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
    audio, audio_length = features, torch.tensor(features.shape[0]).long()

    if isinstance(sample["text_tokens"], str):
        # If tokenize_text is False for Phone dataset
        text = sample["text_tokens"]
        text_length = None
    else:
        text = torch.tensor(sample["text_tokens"]).long()
        text_length = torch.tensor(len(sample["text_tokens"])).long()

    audio_stem = Path(sample["audio_filepath"]).stem

    # Load mel if it exists
    mel_path = sample["mel_filepath"]
    if mel_path and Path(mel_path).exists():
        log_mel = torch.load(mel_path)
    else:
        mel_path = Path(self.supplementary_folder) / f"mel_{audio_stem}.pt"
        if mel_path.exists():
            log_mel = torch.load(mel_path)
        else:
            # disable autocast to get full range of stft values
            with torch.cuda.amp.autocast(enabled=False):
                spec = self.stft(audio)
                # guard is needed for sqrt if grads are passed through
                guard = CONSTANT  # TODO: Enable 0 if not self.use_grads else CONSTANT
                if spec.dtype in [torch.cfloat, torch.cdouble]:
                    spec = torch.view_as_real(spec)
                spec = torch.sqrt(spec.pow(2).sum(-1) + guard)
                mel = torch.matmul(self.fb.to(spec.dtype), spec)
                log_mel = torch.log(torch.clamp(mel, min=torch.finfo(mel.dtype).tiny))
                torch.save(log_mel, mel_path)

    log_mel = log_mel.squeeze(0)
    log_mel_length = torch.tensor(log_mel.shape[1]).long()

    duration_prior = None
    if text_length is not None:
        # Make duration attention prior if it does not exist in the supplementary folder
        prior_path = Path(self.supplementary_folder) / f"pr_tl{text_length}_al_{log_mel_length}.pt"
        if prior_path.exists():
            duration_prior = torch.load(prior_path)
        else:
            duration_prior = beta_binomial_prior_distribution(text_length, log_mel_length)
            duration_prior = torch.from_numpy(duration_prior)
            torch.save(duration_prior, prior_path)

    # Load pitch file (F0s)
    pitch_path = (
        Path(self.supplementary_folder)
        / f"{audio_stem}_pitch_pyin_fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_fl{self.win_length}_hs{self.hop_len}.pt"
    )
    if pitch_path.exists():
        pitch = torch.load(pitch_path)
    else:
        pitch, _, _ = librosa.pyin(
            audio.numpy(),
            fmin=self.pitch_fmin,
            fmax=self.pitch_fmax,
            frame_length=self.win_length,
            sr=self.sample_rate,
            fill_na=0.0,
        )
        pitch = torch.from_numpy(pitch)
        torch.save(pitch, pitch_path)

    # Standardize pitch
    pitch -= self.pitch_avg
    pitch[pitch == -self.pitch_avg] = 0.0  # Zero out values that were previously zero
    pitch /= self.pitch_std

    # Load energy file (L2-norm of the amplitude of each STFT frame of an utterance)
    energy_path = Path(self.supplementary_folder) / f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
    if energy_path.exists():
        energy = torch.load(energy_path)
    else:
        if spec is None:
            spec = self.stft(audio)
        energy = torch.linalg.norm(spec.squeeze(0), axis=0)
        # Save to new file
        torch.save(energy, energy_path)

    return text, text_length, log_mel, log_mel_length, audio, audio_length, duration_prior, pitch, energy
def add_session_data(df_features, labels_df, emotion_dict, audio_vectors_path, sess, columns):
    audio_vectors = pickle.load(open(audio_vectors_path, 'rb'))
    for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains(
            'Ses0{}'.format(sess))].iterrows()):
        try:
            wav_file_name = row['wav_file']
            label = emotion_dict[row['emotion']]
            y = audio_vectors[wav_file_name]

            feature_list = [wav_file_name, label]  # wav_file, label
            sig_mean = np.mean(abs(y))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(y))  # sig_std

            rmse = librosa.feature.rms(y=y + 0.0001)[0]
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std

            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence

            y_harmonic = librosa.effects.hpss(y)[0]
            feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)

            # based on the pitch detection algorithm mentioned here:
            # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001
            cl = 0.45 * sig_mean
            center_clipped = []
            for s in y:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                elif np.abs(s) < cl:
                    center_clipped.append(0)
            p3 = time.time()
            # auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            pitch, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'),
                                       fmax=librosa.note_to_hz('C7'))
            pitch = [0 if math.isnan(p) else p for p in pitch]
            p4 = time.time()
            print("audio size: {}, pitch: {}".format(len(y) / 44100.0, (p4 - p3)))
            feature_list.append(np.mean(pitch))
            feature_list.append(np.std(pitch))
            # feature_list.append(1000 * np.max(auto_corrs) / len(auto_corrs))  # auto_corr_max (scaled by 1000)
            # feature_list.append(np.std(auto_corrs))  # auto_corr_std

            # DataFrame.append was removed in pandas 2.0; concat is the supported path.
            df_features = pd.concat(
                [df_features, pd.DataFrame(feature_list, index=columns).transpose()],
                ignore_index=True)
        except Exception as e:
            print('Some exception occurred: {}'.format(e))
    return df_features
import sounddevice as sd
from scipy.io.wavfile import write
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

fs = 44100  # sample rate
seconds = 4  # duration of recording

print('start recording')
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()  # wait until recording is finished
print('finished recording')
write('output.wav', fs, myrecording)  # save as wav file

y, sr = librosa.load('output.wav')
f0, voiced_flag, voiced_probs = librosa.pyin(y,
                                             fmin=librosa.note_to_hz('E2'),
                                             fmax=librosa.note_to_hz('E4'))
times = librosa.times_like(f0)

D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
fig, ax = plt.subplots()
img = librosa.display.specshow(D, x_axis='time', y_axis='log', ax=ax)
ax.set(title='pYIN fundamental frequency estimation')
fig.colorbar(img, ax=ax, format="%+2.f dB")
ax.plot(times, f0, label='f0', color='cyan', linewidth=3)
ax.legend(loc='upper right')
Audio(data=y, rate=sr)

# %%
# Sonifying pitch estimates
# -------------------------
# As a slightly more advanced example, we can
# use sonification to directly observe the output of a
# fundamental frequency estimator.
#
# We'll do this using `librosa.pyin` for analysis,
# and `mir_eval.sonify.pitch_contour` for synthesis.

# Using fill_na=None retains the best-guess f0 at unvoiced frames
f0, voiced_flag, voiced_probs = librosa.pyin(y, sr=sr,
                                             fmin=librosa.note_to_hz('C2'),
                                             fmax=librosa.note_to_hz('C7'),
                                             fill_na=None)

# To synthesize the f0, we'll need sample times
times = librosa.times_like(f0, sr=sr)

# %%
# mir_eval's synthesizer uses negative f0 values to indicate
# unvoiced regions.
#
# We'll make an array vneg which is 1 for voiced frames, and
# -1 for unvoiced frames.
# This way, `f0 * vneg` will leave voiced estimates unchanged,
# and negate the frequency for unvoiced frames.
vneg = (-1)**(~voiced_flag)
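# %%
# A short continuation sketch, assuming `mir_eval` is installed: synthesize
# the contour so voiced frames sound at their estimated pitch while negated
# (unvoiced) frames stay silent (illustrative only).
import mir_eval.sonify

y_f0 = mir_eval.sonify.pitch_contour(times, f0 * vneg, sr, length=len(y))
Audio(data=y_f0, rate=sr)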
def probabilities(y, note_min, note_max, sr, frame_length, window_length,
                  hop_length, pitch_acc, voiced_acc, onset_acc, spread):
    """
    Estimate prior (observed) probabilities from audio signal

    Parameters
    ----------
    y : 1-D numpy array
        Array containing audio samples
    note_min : string, 'A#4' format
        Lowest note supported by this estimator
    note_max : string, 'A#4' format
        Highest note supported by this estimator
    sr : int
        Sample rate.
    frame_length : int
    window_length : int
    hop_length : int
        Parameters for FFT estimation
    pitch_acc : float, between 0 and 1
        Probability (estimated) that the pitch estimator is correct.
    voiced_acc : float, between 0 and 1
        Estimated accuracy of the "voiced" parameter.
    onset_acc : float, between 0 and 1
        Estimated accuracy of the onset detector.
    spread : float, between 0 and 1
        Probability that the singer/musician had a one-semitone deviation
        due to vibrato or glissando.

    Returns
    -------
    P : 2D numpy array.
        P[j,t] is the prior probability of being in state j at time t.
    """
    fmin = librosa.note_to_hz(note_min)
    fmax = librosa.note_to_hz(note_max)
    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1

    # F0 and voicing
    f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=fmin * 0.9, fmax=fmax * 1.1,
                                                sr=sr, frame_length=frame_length,
                                                win_length=window_length,
                                                hop_length=hop_length)
    tuning = librosa.pitch_tuning(f0)
    # Apply the tuning correction in MIDI space; subtracting the (fractional
    # semitone) tuning offset from Hz, as the original did, mixes units.
    # Unvoiced frames are NaN, so mark them with a sentinel that can never
    # match a note index.
    f0_midi = librosa.hz_to_midi(f0) - tuning
    f0_ = np.where(np.isnan(f0_midi), -1, np.round(f0_midi)).astype(int)

    onsets = librosa.onset.onset_detect(y=y, sr=sr, hop_length=hop_length,
                                        backtrack=True)

    P = np.ones((n_notes * 2 + 1, len(f0)))
    for t in range(len(f0)):
        # State 0 is silence; states 2j+1 are note onsets, states 2j+2 are sustains.
        if voiced_flag[t] == False:
            P[0, t] = voiced_acc
        else:
            P[0, t] = 1 - voiced_acc

        for j in range(n_notes):
            if t in onsets:
                P[(j * 2) + 1, t] = onset_acc
            else:
                P[(j * 2) + 1, t] = 1 - onset_acc

            if j + midi_min == f0_[t]:
                P[(j * 2) + 2, t] = pitch_acc
            elif np.abs(j + midi_min - f0_[t]) == 1:
                P[(j * 2) + 2, t] = pitch_acc * spread
            else:
                P[(j * 2) + 2, t] = 1 - pitch_acc
    return P
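# The matrix returned above matches librosa's (n_states, n_steps) observation
# layout, so a minimal decode sketch could look like this, assuming a uniform
# transition matrix (a real transcriber would replace it with an informed one):
import numpy as np
import librosa


def decode_states(P):
    n_states = P.shape[0]
    transition = np.full((n_states, n_states), 1.0 / n_states)
    obs = P / P.sum(axis=0, keepdims=True)  # make each frame a distribution
    return librosa.sequence.viterbi(obs, transition)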
def make_plot_pitches(wave, chunk_labels, chunk_size, class_array, path, sr=44100):
    plt.clf()
    f0, voiced_flag, voiced_probs = librosa.pyin(y=wave, sr=sr,
                                                 fmin=librosa.note_to_hz('C3'),
                                                 fmax=librosa.note_to_hz('C5'))
    times = librosa.times_like(f0, sr=sr)
    max_f = np.nanmax(f0)
    min_f = np.nanmin(f0)
    ref_values = 261.3 * (2.0**np.linspace(-2.0, 2.0, 2 * 24))
    ref_ticks = librosa.hz_to_note(ref_values)

    plt.plot(times, f0, color='k')
    plt.fill_between(times, 35 / 36 * f0, 36 / 35 * f0, color='lightgrey')

    for chunk in range(len(chunk_labels)):
        alpha = 1.0
        if chunk_labels[chunk] == 0:
            col = 'w'
        if chunk_labels[chunk] == 1:
            col = 'w'  # 'r'
        if chunk_labels[chunk] == 2:
            if class_array[chunk] == 0:
                col = 'gold'
            if class_array[chunk] == 1:
                col = 'lightblue'  # 'tab:blue'
            if class_array[chunk] == 2:
                col = 'red'
            alpha = 0.85
        plt.axvspan(chunk * chunk_size / sr, (chunk + 1) * chunk_size / sr,
                    facecolor=col, alpha=alpha, zorder=-1)

    for value in ref_values[np.logical_and((ref_values < (18 / 17) * max_f),
                                           (ref_values > min_f * (17 / 18)))]:
        plt.axhline(value, color='k', linestyle=':', linewidth=0.5)

    plt.semilogy()
    ax = plt.gca()
    ax.yaxis.set_minor_formatter(matplotlib.ticker.NullFormatter())
    plt.yticks(ref_values, ref_ticks)
    ax.set_xlim(left=-0.01)
    ax.set_ylim(top=(18 / 17) * max_f, bottom=min_f * (17 / 18))
    ax.set_xlabel('time [s]', weight='bold')
    ax.set_xticks(range(0, int(np.ceil(times[-1]) + 1)))

    figc = plt.gcf()
    figc.patch.set_facecolor('gainsboro')  # whitesmoke
    figc.patch.set_alpha(0.20)
    figc.tight_layout()
    figc.subplots_adjust(bottom=0.15)

    print('saving plot!')
    print(path)
    # plt.savefig(os.path.join(path, 'pitch_plot.png'), dpi=300)
    plt.savefig(os.path.join(path, 'pitch_plot.svg'), format='svg', dpi=300)
    return 'pitch_plot.svg'
def predict(sample_file, target_midi):
    # data is in numpy
    train_loader, test_loader, validate_loader = load_dataset()
    model = PitchGRU()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)
    loss = nn.CrossEntropyLoss()
    model.to(device)
    train_and_test = Train_And_Test(model, optimizer, loss, train_loader,
                                    test_loader, validate_loader)
    model = train_and_test.load_model(
        'models/rnn-pitch-estimation-21-81-0.001-32-max-accuracy.pt')

    target_pitch = librosa.midi_to_hz(target_midi)
    print('Target = ', target_pitch)
    print('-----------------------------------------------------------')
    print('Begin predictions - ')

    padded_sample = np.zeros(64000)
    fig, ax = plt.subplots(figsize=(15, 15))
    camera = Camera(fig)

    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    plt.rc('font', size=MEDIUM_SIZE)         # controls default text sizes
    plt.rc('axes', titlesize=BIGGER_SIZE)    # fontsize of the axes title
    plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
    plt.rc('ytick', labelsize=BIGGER_SIZE)   # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    plt.xlim(0, 64000)

    def numfmt(x, pos):
        # custom tick formatter: divide by 16000.0 to show seconds instead of samples
        return '{}'.format(x / 16000.0)

    yfmt = matplotlib.ticker.FuncFormatter(numfmt)
    plt.gca().xaxis.set_major_formatter(yfmt)

    for i in range(31):
        start_ts = i * 0.128
        data, _ = librosa.load(sample_file, sr=16000, offset=start_ts, duration=0.128)
        audio_data_stft = librosa.stft(data, n_fft=(2048 - 1) * 2)
        sample = np.concatenate([
            np.reshape(data, (2048, 1)) * 2,
            np.abs(audio_data_stft),
            np.angle(audio_data_stft)
        ], axis=1)
        sample = torch.from_numpy(sample)
        sample = sample.float()
        sample = sample.to(device)
        sample = sample.view(-1, sample.shape[0], 7)  # batch_size X 2048 X 7

        prediction = -1
        model.eval()
        with torch.no_grad():
            output, hidden = model(sample)  # Output Shape = batch_size, 1, 88 (num pitches)
            output = output[:, int(sample_length / 2), :]
            output = output.view(-1, len(classes))
            prediction = librosa.midi_to_hz(classes[output.argmax(dim=1)])

        pyin_f0, _, _ = librosa.pyin(data, fmin=librosa.note_to_hz('C0'),
                                     fmax=librosa.note_to_hz('C8'), sr=16000,
                                     frame_length=1000, hop_length=1000)
        pyin_f0 = np.nan_to_num(pyin_f0)
        pyin_f0 = np.max(pyin_f0)

        data_tensor = torch.from_numpy(data).view(1, -1)
        torchaudio_f0 = torchaudio.functional.detect_pitch_frequency(
            data_tensor, sample_rate=16000, frame_time=0.008, freq_low=40).item()

        crepe_f0 = torchcrepe.predict(data_tensor, 16000, 2048, 40, 3400, 'full',
                                      batch_size=1, device=device)
        crepe_f0 = np.mean(crepe_f0.numpy())

        print('At start time {:.2f} seconds - GRU Prediction={:.2f}Hz; pYIN={:.2f}Hz; '
              'torchaudio={:.2f}Hz; Torch Crepe={:.2f}Hz'.format(
                  start_ts, prediction, pyin_f0, torchaudio_f0, crepe_f0))

        padded_sample[i * sample_length:(i * sample_length) + sample_length] = data
        plt.plot(padded_sample, color='blue', alpha=0.5)
        ax.text(0.1, 1.01, " ", transform=ax.transAxes, fontsize='x-large')
        ax.text(0.1, 1.01,
                "Target pitch = {:.2f}Hz, \
                \nGRU predicted pitch = {:.2f}Hz, \
                \npYIN = {:.2f}Hz, \
                \nTorch Audio = {:.2f}Hz, \
                \nTorch Crepe = {:.2f}Hz, ".format(
                    target_pitch, prediction, pyin_f0, torchaudio_f0, crepe_f0),
                transform=ax.transAxes, fontsize='xx-large')
        camera.snap()

    animation = camera.animate()
    animation.save('animation-{}.gif'.format(target_midi))
    print('-------------------------------------------------------------')
def __getitem__(self, index):
    sample = self.data[index]
    audio_stem = Path(sample["audio_filepath"]).stem

    features = self.featurizer.process(sample["audio_filepath"], trim=self.trim)
    audio, audio_length = features, torch.tensor(features.shape[0]).long()

    text = torch.tensor(sample["text_tokens"]).long()
    text_length = torch.tensor(len(sample["text_tokens"])).long()

    log_mel, log_mel_length = None, None
    if LogMel in self.sup_data_types_set:
        mel_path = sample["mel_filepath"]
        if mel_path is not None and Path(mel_path).exists():
            log_mel = torch.load(mel_path)
        else:
            mel_path = Path(self.sup_data_path) / f"mel_{audio_stem}.pt"
            if mel_path.exists():
                log_mel = torch.load(mel_path)
            else:
                log_mel = self.get_log_mel(audio)
                torch.save(log_mel, mel_path)
        log_mel = log_mel.squeeze(0)
        log_mel_length = torch.tensor(log_mel.shape[1]).long()

    durations = None
    if Durations in self.sup_data_types_set:
        durations = self.durs[index]

    duration_prior = None
    if DurationPrior in self.sup_data_types_set:
        if self.use_beta_binomial_interpolator:
            mel_len = self.get_log_mel(audio).shape[2]
            duration_prior = torch.from_numpy(
                self.beta_binomial_interpolator(mel_len, text_length.item()))
        else:
            prior_path = Path(self.sup_data_path) / f"pr_{audio_stem}.pt"
            if prior_path.exists():
                duration_prior = torch.load(prior_path)
            else:
                mel_len = self.get_log_mel(audio).shape[2]
                duration_prior = beta_binomial_prior_distribution(text_length, mel_len)
                duration_prior = torch.from_numpy(duration_prior)
                torch.save(duration_prior, prior_path)

    pitch, pitch_length = None, None
    if Pitch in self.sup_data_types_set:
        pitch_name = (f"{audio_stem}_pitch_pyin_"
                      f"fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_"
                      f"fl{self.win_length}_hs{self.hop_len}.pt")
        pitch_path = Path(self.sup_data_path) / pitch_name
        if pitch_path.exists():
            pitch = torch.load(pitch_path).float()
        else:
            pitch, _, _ = librosa.pyin(
                audio.numpy(),
                fmin=self.pitch_fmin,
                fmax=self.pitch_fmax,
                frame_length=self.win_length,
                sr=self.sample_rate,
                fill_na=0.0,
            )
            pitch = torch.from_numpy(pitch).float()
            torch.save(pitch, pitch_path)

        if self.pitch_avg is not None and self.pitch_std is not None and self.pitch_norm:
            pitch -= self.pitch_avg
            pitch[pitch == -self.pitch_avg] = 0.0  # Zero out values that were previously zero
            pitch /= self.pitch_std

        pitch_length = torch.tensor(len(pitch)).long()

    energy, energy_length = None, None
    if Energy in self.sup_data_types_set:
        energy_path = Path(self.sup_data_path) / f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
        if energy_path.exists():
            energy = torch.load(energy_path).float()
        else:
            spec = self.get_spec(audio)
            energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
            torch.save(energy, energy_path)
        energy_length = torch.tensor(len(energy)).long()

    speaker_id = None
    if SpeakerID in self.sup_data_types_set:
        speaker_id = torch.tensor(sample["speaker_id"]).long()

    return (
        audio,
        audio_length,
        text,
        text_length,
        log_mel,
        log_mel_length,
        durations,
        duration_prior,
        pitch,
        pitch_length,
        energy,
        energy_length,
        speaker_id,
    )
def get_first_null_f0(items_handler: ItemsHandler,
                      start_offset: float,
                      min_duration: float,
                      end_offset: ty.Optional[float] = None,
                      min_note: str = 'C1',
                      max_note: str = 'C7',
                      frame_length: float = 2048,
                      win_length: ty.Optional[float] = None,
                      offset_units: LengthUnit = LengthUnit.ms,
                      length_units: LengthUnit = LengthUnit.samples) -> float:
    audio = items_handler.load_audio()[0]
    sr = items_handler.sr
    if length_units != LengthUnit.samples:
        if length_units != LengthUnit.ms:
            raise TypeError('length_units can be only of ms or samples')
        frame_length = length_convert(frame_length, sr, length_units,
                                      LengthUnit.samples)
        if win_length:
            win_length = length_convert(win_length, sr, length_units,
                                        LengthUnit.samples)
    hop_length = int(frame_length // 4)
    start_offset_int = ty.cast(
        int, length_convert(start_offset, sr, offset_units, LengthUnit.samples))
    if start_offset_int:
        audio = audio[start_offset_int:]  # type:ignore
    if end_offset:
        end_offset_int = ty.cast(
            int, length_convert(end_offset, sr, offset_units, LengthUnit.samples))
        audio = audio[:end_offset_int - start_offset_int]  # type:ignore
    min_duration_frms = length_convert(min_duration, sr, offset_units,
                                       LengthUnit.frames, hop_length=hop_length)
    fmin, fmax = lr.note_to_hz(min_note), lr.note_to_hz(max_note)
    f0s, v_flag, v_prob = lr.pyin(
        audio,
        fmin=fmin,
        fmax=fmax,
        sr=sr,
        win_length=None if win_length is None else win_length,
        frame_length=frame_length,
    )
    nulls = np.where(~v_flag)[0]
    val = None
    for candidate in nulls:
        if candidate >= min_duration_frms:
            # Skip isolated unvoiced frames that are immediately followed by a voiced one.
            if candidate + 1 < len(v_flag) and v_flag[candidate + 1]:
                continue
            val = candidate
            break
    # Guard against finding no acceptable unvoiced frame; the original could
    # hit an unbound `val` here when pyin reported no unvoiced frames at all.
    if val is None or val < 5:
        raise PitchError(
            f'Cannot find null f0 at the reasonable frame (>=5): {v_flag}')
    val_normalized = length_convert(val, sr, LengthUnit.frames, offset_units,
                                    hop_length=hop_length)
    return start_offset + val_normalized