Example #1
def strikes_and_notes(path):
    y, fs = librosa.core.load(path, offset=0.0, duration=None)
    t = librosa.times_like(y, sr=fs)

    strikes = librosa.onset.onset_detect(y=y, sr=fs, units='samples')

    played_times = []
    played_notes = []

    for i in range(len(strikes)):
        if i == len(strikes) - 1:
            window = y[strikes[i]:min(len(y), 2 * strikes[i] - strikes[i - 1])]
        else:
            window = y[strikes[i]:strikes[i + 1]]

        f0, voiced_flag, voiced_probs = librosa.pyin(
            window,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            fill_na=None)
        f0_est = np.median(f0[~np.isnan(f0)])

        if not np.isnan(f0_est):
            played_notes.append(f0_est)
            # Append this onset's time (in seconds), not the whole onset array
            played_times.append(librosa.samples_to_time(strikes[i], sr=fs))

    return played_times, played_notes
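A minimal driver for the helper above; "melody.wav" is a placeholder path to any monophonic recording, and the imports are the ones the function already relies on:

import librosa
import numpy as np

played_times, played_notes = strikes_and_notes("melody.wav")
for t, hz in zip(played_times, played_notes):
    print(f"{t:.2f}s  {hz:.1f} Hz  ({librosa.hz_to_note(hz)})")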
Example #2
    def process(self):
        try:
            audioFilePath = self.args["file_path"]
            sr, sig = wavread(audioFilePath)
            sig = sig.astype(np.float32)  # pyin expects floating-point input
            target_sr = 22500
            if sr != target_sr:
                # scipy's resample takes a target sample count, not a rate
                sig = resample(sig, int(len(sig) * target_sr / sr))
                sr = target_sr

            pitches, voiced_flag, voiced_probs = librosa.pyin(
                sig,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=sr
            )  # recommended settings from docs are bad, need to figure out good ones
            #pitches = pitches[pitches != 0]
            yin_min_pitch = np.nanmin(pitches).item()
            yin_max_pitch = np.nanmax(pitches).item()
            yin_mean_pitch = np.nanmean(pitches).item()
            yin_median_pitch = np.nanmedian(pitches).item()
            print("yin_median_pitch")
            print("yin pitches", pitches)
            return {
                'min_pitch_yin': yin_min_pitch,
                'max_pitch_yin': yin_max_pitch,
                'mean_pitch_yin': yin_mean_pitch,
                'median_pitch_yin': yin_median_pitch,
            }
        except Exception as e:
            print(e)
            return {
                'min_pitch_yin': "Measure Pitch Yin Failed",
                'max_pitch_yin': "Measure Pitch Yin Failed",
                'mean_pitch_yin': "Measure Pitch Yin Failed",
                'median_pitch_yin': "Measure Pitch Yin Failed",
            }
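If the only goal is a fixed analysis rate, librosa.load can resample on read and sidestep the manual resample call above; a minimal sketch (keeping the snippet's 22500 Hz target, which may itself be a typo for librosa's 22050 Hz default):

import librosa

sig, sr = librosa.load(audioFilePath, sr=22500)  # decoded and resampled in one step
pitches, voiced_flag, voiced_probs = librosa.pyin(
    sig,
    fmin=librosa.note_to_hz('C2'),
    fmax=librosa.note_to_hz('C7'),
    sr=sr)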
Example #3
def test_pitch_shift_transform_with_pitch_detection():
    """To check semi-tone values, see: http://www.homepages.ucl.ac.uk/~sslyjjt/speech/semitone.html"""

    source_frequency = 440
    max_semitone_shift = 4
    expected_frequency_shift = 554

    num_channels = 1
    audio = generate_waveform(sample_rate,
                              num_samples,
                              num_channels,
                              frequency=source_frequency)
    pitch_shift = PitchShift(
        n_samples=num_samples,
        sample_rate=sample_rate,
        pitch_shift_min=max_semitone_shift,
        pitch_shift_max=max_semitone_shift + 1,
    )

    t_audio = pitch_shift(audio)
    librosa_audio = t_audio[0].numpy()
    f0_hz, _, _ = librosa.pyin(librosa_audio, fmin=10, fmax=1000)

    # remove nan values:
    f0_hz = f0_hz[~np.isnan(f0_hz)]

    detected_f0_hz = np.max(f0_hz)

    detection_threshold_in_hz = 40
    # the detected frequency should differ from the expected frequency by less than 40 Hz
    assert abs(detected_f0_hz -
               expected_frequency_shift) < detection_threshold_in_hz
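The expected value above follows from the equal-temperament relation f = f0 * 2 ** (n / 12):

print(440 * 2 ** (4 / 12))  # ~554.37 Hz, rounded to 554 in the test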
Example #4
def extract_feature(audio, sr=44100):
    """
    extract feature like below:
    sig:
    rmse:
    silence:
    harmonic:
    pitch:

    audio: audio file or audio list
    return feature_list: np of [n_samples, n_features]
    """
    feature_list = []
    y = []
    if isinstance(audio, str):
        y, _ = librosa.load(audio, sr=sr)
    elif isinstance(audio, np.ndarray):
        y = audio
    # 1. sig
    sig_mean = np.mean(abs(y))
    feature_list.append(sig_mean)  # sig_mean
    feature_list.append(np.std(y))  # sig_std

    # 2. rmse
    rmse = librosa.feature.rms(y=y + 0.0001)[0]
    feature_list.append(np.mean(rmse))  # rmse_mean
    feature_list.append(np.std(rmse))  # rmse_std

    # 3. silence
    silence = 0
    for e in rmse:
        if e <= 0.4 * np.mean(rmse):
            silence += 1
    silence /= float(len(rmse))
    feature_list.append(silence)  # silence

    # 4. harmonic
    y_harmonic = librosa.effects.hpss(y)[0]
    feature_list.append(np.mean(y_harmonic) *
                        1000)  # harmonic (scaled by 1000)

    # 5. pitch via pyin (instead of auto_correlation; the center-clipped signal
    #    below is left over from that approach and is not used)
    cl = 0.45 * sig_mean
    center_clipped = []
    for s in y:
        if s >= cl:
            center_clipped.append(s - cl)
        elif s <= -cl:
            center_clipped.append(s + cl)
        elif np.abs(s) < cl:
            center_clipped.append(0)
    # auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
    pitch, _, _ = librosa.pyin(y,
                               fmin=librosa.note_to_hz('C2'),
                               fmax=librosa.note_to_hz('C7'))
    pitch = [0 if math.isnan(p) else p for p in pitch]
    feature_list.append(np.mean(pitch))
    feature_list.append(np.std(pitch))

    return np.array(feature_list).reshape(1, -1)
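A quick smoke test of the extractor; "sample.wav" is a placeholder path, and librosa, numpy, and math are assumed to be imported as the function above requires:

feats = extract_feature("sample.wav", sr=44100)
print(feats.shape)  # (1, 8): sig mean/std, rmse mean/std, silence, harmonic, pitch mean/std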
Example #5
def get_f0_series(snd_filename, fmin=VIOLIN_MIN_F,
                  fmax=VIOLIN_MAX_F):
    """Extract the f0 history from a sound file."""
    y, sr = librosa.load(snd_filename)
    f0, voiced_flag, voiced_probs = \
        librosa.pyin(y, fmin=fmin, fmax=fmax)

    return y, f0
Example #6
def wav2f0(y, sr):
    f0, voiced_flag, voiced_probs = librosa.pyin(y,
                                                 sr=sr,
                                                 fmin=librosa.note_to_hz('C2'),
                                                 fmax=librosa.note_to_hz('C6'))
    # f0 = np.nan_to_num(f0) # get rid of nans
    f0_times = librosa.times_like(f0, sr=sr)
    # D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    return f0, voiced_flag, voiced_probs, f0_times
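A short plotting sketch for the values returned above; it uses librosa's bundled example clip (fetched on first use) and assumes matplotlib is installed:

import librosa
import matplotlib.pyplot as plt

y, sr = librosa.load(librosa.ex('trumpet'))
f0, voiced_flag, voiced_probs, f0_times = wav2f0(y, sr)
plt.plot(f0_times, f0)
plt.xlabel('time [s]')
plt.ylabel('f0 [Hz]')
plt.show()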
Example #7
def track_pitch(x, sr, frame_length, fmin=165, fmax=1500):
    x = array(x)
    x_arr = x / max(abs(x))
    return librosa.pyin(x_arr,
                        sr=sr,
                        frame_length=int(frame_length),
                        fmin=fmin,
                        fmax=fmax)[0]
Example #8
    def fundamental_frequency(self, session, cache=True):
        if not cache or self.f0 is None:
            data, rate = librosa.load(io.BytesIO(self.content), sr=48000)
            data = data.astype(np.float64)
            f0, _, _ = librosa.pyin(data,
                                    fmin=librosa.note_to_hz('C2'),
                                    fmax=librosa.note_to_hz('C7'),
                                    sr=48000)
            self.f0 = f0.tobytes()
            session.commit()
        return np.frombuffer(self.f0, dtype=np.float64)
Example #9
def main():
    wavfile_dir = Path(args.ljspeech_dir) / "wavs"
    wavfile_list = list(wavfile_dir.glob('*.wav'))

    target_dir = Path(args.ljspeech_dir)
    # Create target dir <LJSpeech_base_dir>/energies and <LJSpeech_base_dir>/pitches if necessary
    if not Path(target_dir / "energies").exists():
        print(f"Creating target directory: {target_dir/'energies'}")
        Path(target_dir / "energies").mkdir()
    if not Path(target_dir / "pitches").exists():
        print(f"Creating target directory: {target_dir/'pitches'}")
        Path(target_dir / "pitches").mkdir()

    if tqdm is not None:
        wavfile_list = tqdm(wavfile_list)
    for count, file_ in enumerate(wavfile_list):
        basename = Path(file_).stem
        pitch_path = target_dir / "pitches" / f"{basename}.npy"
        energy_path = target_dir / "energies" / f"{basename}.npy"
        if pitch_path.exists() and energy_path.exists():
            continue
        audio, sr = librosa.load(file_, sr=22050)

        # Calculate f0
        # Please note that fmin and fmax are good approximates for the speaker in LJSpeech and may not generalize to
        # other speakers
        f0, _, _ = librosa.pyin(audio,
                                fmin=80,
                                fmax=800,
                                frame_length=1024,
                                sr=sr,
                                fill_na=0.0)

        # Save to new file
        np.save(pitch_path, f0)

        # Calculate energy
        stft_amplitude = np.abs(
            librosa.stft(audio, n_fft=1024, hop_length=256, win_length=1024))
        energy = np.linalg.norm(
            stft_amplitude,
            axis=0)  # axis=0 since librosa.stft -> (freq bins, frames)

        # Save to new file
        np.save(energy_path, energy)

        assert energy.shape == f0.shape
        if tqdm is None and count % 1000 == 0:
            print(f"Finished processing {count} wav files...")

    print(
        f"Finished pitch and energy extraction for a total of {len(wavfile_list)} wav files."
    )
Example #10
def compute_f0(y, fmin, fmax, frame_length, win_length, hop_length):
    f0, _, _ = librosa.pyin(y,
                            fmin=fmin,
                            fmax=fmax,
                            # fill unvoiced frames with fmin so np.log below stays finite
                            fill_na=fmin,
                            frame_length=frame_length,
                            win_length=win_length,
                            hop_length=hop_length)
    f0 = f0.astype(np.float32)
    f0 = np.log(f0)
    f0 = np.expand_dims(f0, -1)
    return f0
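Here fill_na=fmin keeps unvoiced frames at the lower bound, so np.log never sees NaN or zero. A minimal call with illustrative parameter values (compute_f0 does not take sr, so it relies on pyin's default of 22050 Hz, which matches librosa.load's default):

import librosa
import numpy as np

y, sr = librosa.load(librosa.ex('trumpet'))
log_f0 = compute_f0(y, fmin=65.0, fmax=2093.0,
                    frame_length=2048, win_length=1024, hop_length=512)
print(log_f0.shape)  # (n_frames, 1)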
Example #11
    def pitch_estimation(self, wavpath):
        if not os.path.exists(wavpath):
            raise FileNotFoundError(wavpath)
        y, sr = librosa.load(wavpath)
        f0, voiced_flag, voiced_probs = librosa.pyin(
            y,
            fmin=librosa.note_to_hz('B3'),
            fmax=librosa.note_to_hz('C5'))
        f0 = f0[~np.isnan(f0)]
        times = librosa.times_like(f0)
        # Fit a constant level to the voiced f0 track
        level = optimize.curve_fit(lambda x, b: b, times,
                                   np.nan_to_num(f0))[0]
        pitch = np.around(level[0], decimals=3).astype(float)
        return pitch
Example #12
def estimate_entire_root(audio: np.array,
                         sr: int,
                         min_note: str = 'C1',
                         max_note: str = 'C7',
                         frame_length: float = 4096,
                         win_length: ty.Optional[float] = None,
                         length_units: LengthUnit = LengthUnit.samples) -> str:
    """Get root note of the entire audio array.

    Parameters
    ----------
    audio : np.array
    sr : int
        Samplerate
    min_note : str, optional
        Middle C is 'C4'
    max_note : str, optional
    frame_length : int, optional
        Samples by default
    win_length : ty.Optional[int]
        Samples by default, None = frame_length/2
    length_units : LengthUnit, optional
        can be samples or ms

    Returns
    -------
    str: note name
    """
    if length_units is LengthUnit.ms:
        # convert ms -> samples (argument order: value, sr, from_units, to_units)
        frame_length = length_convert(frame_length, sr, LengthUnit.ms,
                                      LengthUnit.samples)
        # frame_length = sr * frame_length // 1000
        if win_length is not None:
            # win_length = sr * win_length // 1000
            win_length = length_convert(win_length, sr, LengthUnit.ms,
                                        LengthUnit.samples)

    f0s, v_flag, v_prob = lr.pyin(
        audio,
        fmin=lr.note_to_hz(min_note),
        fmax=lr.note_to_hz(max_note),
        sr=sr,
        win_length=None if win_length is None else win_length,
        frame_length=frame_length,
    )

    clean = f0s[v_flag]  # voiced frames only
    # print(list(hz_to_note(f0) for f0 in clean))
    median = ty.cast(float, np.median(clean))
    return hz_to_note(median)
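A minimal call of the root estimator; this sketch assumes lr is librosa and that LengthUnit and hz_to_note come from the same module as the function:

import librosa as lr

y, sr = lr.load(lr.ex('trumpet'))
print(estimate_entire_root(y, sr, min_note='C2', max_note='C7'))  # prints a note name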
Example #13
def analyze_wave(model, wave, path):

    print(len(wave))

    if len(wave) < int(2*44100):

        image_path = make_plot_wave(wave, path)

        wave_pred = model_prediction(model, wave)

        if wave_pred == 0:
            flash("Your phonation is breathy")
        if wave_pred == 1:
            flash("Your phonation is balanced")
        if wave_pred == 2:
            flash("Your phonation is pressed")

    else:

        chunk_size = 22050
        n_chunks = len(wave) // chunk_size  # np.ceil on an already-floored // result was a no-op
        class_array = np.zeros(n_chunks, dtype=int)
        chunk_lab = np.zeros(n_chunks, dtype=int)

        for chunk in range(len(chunk_lab)):
            # within this bucket, compare the average pitch to the start and end pitches
            # if the standard deviation is within a quarter tone of the mean, then good

            wavelet = wave[chunk * chunk_size:(chunk + 1) * chunk_size]

            f0_chunk, voiced_flag, voiced_probs = librosa.pyin(y=wavelet, sr=44100, fmin=librosa.note_to_hz('C3'),fmax=librosa.note_to_hz('C5'))
            avg = np.nanmean(f0_chunk)
            std = np.nanstd(f0_chunk)

            if np.count_nonzero(voiced_flag == False) > 8:
                chunk_lab[chunk] = 0 # not voiced
            else:
                if (avg * (35 / 36) < avg - std) and (avg * (36 / 35) > avg + std):  # std within roughly a quarter tone of the mean
                    # classify this piece
                    chunk_lab[chunk] = 2 # voiced and stable pitch
                    class_array[chunk] = model_prediction(model, wavelet)

                else:
                    # don't classify
                    chunk_lab[chunk] = 1 # voiced and pitch change

        image_path = make_plot_pitches(wave, chunk_lab, chunk_size, class_array, path)

    return image_path
Example #14
def estimate_pitch(wav,
                   mel_len,
                   method='pyin',
                   normalize_mean=None,
                   normalize_std=None,
                   n_formants=1):

    if type(normalize_mean) is float or type(normalize_mean) is list:
        normalize_mean = torch.tensor(normalize_mean)

    if type(normalize_std) is float or type(normalize_std) is list:
        normalize_std = torch.tensor(normalize_std)

    if method == 'pyin':

        snd, sr = librosa.load(wav)
        pitch_mel, voiced_flag, voiced_probs = librosa.pyin(
            snd,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            frame_length=1024)
        assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0

        pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel)
        pitch_mel = torch.from_numpy(pitch_mel).unsqueeze(0)
        pitch_mel = F.pad(pitch_mel, (0, mel_len - pitch_mel.size(1)))

        if n_formants > 1:
            raise NotImplementedError

    else:
        raise ValueError

    pitch_mel = pitch_mel.float()

    if normalize_mean is not None:
        assert normalize_std is not None
        pitch_mel = normalize_pitch(pitch_mel, normalize_mean, normalize_std)

    return pitch_mel
Example #15
    def __getitem__(self, index):
        sample = self.data[index]

        # Let's keep audio name and all internal directories in rel_audio_path_as_text_id to avoid any collisions
        rel_audio_path = Path(sample["audio_filepath"]).relative_to(
            self.base_data_dir).with_suffix("")
        rel_audio_path_as_text_id = str(rel_audio_path).replace("/", "_")

        # Load audio
        features = self.featurizer.process(sample["audio_filepath"],
                                           trim=self.trim)
        audio, audio_length = features, torch.tensor(features.shape[0]).long()

        # Load text
        text = torch.tensor(sample["text_tokens"]).long()
        text_length = torch.tensor(len(sample["text_tokens"])).long()

        # Load mel if needed
        log_mel, log_mel_length = None, None
        if LogMel in self.sup_data_types_set:
            mel_path = sample["mel_filepath"]

            if mel_path is not None and Path(mel_path).exists():
                log_mel = torch.load(mel_path)
            else:
                mel_path = self.log_mel_folder / f"{rel_audio_path_as_text_id}.pt"

                if mel_path.exists():
                    log_mel = torch.load(mel_path)
                else:
                    log_mel = self.get_log_mel(audio)
                    torch.save(log_mel, mel_path)

            log_mel = log_mel.squeeze(0)
            log_mel_length = torch.tensor(log_mel.shape[1]).long()

        # Load durations if needed
        durations = None
        if Durations in self.sup_data_types_set:
            durations = self.durs[index]

        # Load alignment prior matrix if needed
        align_prior_matrix = None
        if AlignPriorMatrix in self.sup_data_types_set:
            if self.use_beta_binomial_interpolator:
                mel_len = self.get_log_mel(audio).shape[2]
                align_prior_matrix = torch.from_numpy(
                    self.beta_binomial_interpolator(mel_len,
                                                    text_length.item()))
            else:
                prior_path = self.align_prior_matrix_folder / f"{rel_audio_path_as_text_id}.pt"

                if prior_path.exists():
                    align_prior_matrix = torch.load(prior_path)
                else:
                    mel_len = self.get_log_mel(audio).shape[2]
                    align_prior_matrix = beta_binomial_prior_distribution(
                        text_length, mel_len)
                    align_prior_matrix = torch.from_numpy(align_prior_matrix)
                    torch.save(align_prior_matrix, prior_path)

        # Load pitch if needed
        pitch, pitch_length = None, None
        if Pitch in self.sup_data_types_set:
            pitch_path = self.pitch_folder / f"{rel_audio_path_as_text_id}.pt"

            if pitch_path.exists():
                pitch = torch.load(pitch_path).float()
            else:
                pitch, _, _ = librosa.pyin(
                    audio.numpy(),
                    fmin=self.pitch_fmin,
                    fmax=self.pitch_fmax,
                    frame_length=self.win_length,
                    sr=self.sample_rate,
                    fill_na=0.0,
                )
                pitch = torch.from_numpy(pitch).float()
                torch.save(pitch, pitch_path)

            if self.pitch_mean is not None and self.pitch_std is not None and self.pitch_norm:
                pitch -= self.pitch_mean
                # Zero out values that were previously zero
                pitch[pitch == -self.pitch_mean] = 0.0
                pitch /= self.pitch_std

            pitch_length = torch.tensor(len(pitch)).long()

        # Load energy if needed
        energy, energy_length = None, None
        if Energy in self.sup_data_types_set:
            energy_path = self.energy_folder / f"{rel_audio_path_as_text_id}.pt"

            if energy_path.exists():
                energy = torch.load(energy_path).float()
            else:
                spec = self.get_spec(audio)
                energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
                torch.save(energy, energy_path)

            energy_length = torch.tensor(len(energy)).long()

        # Load speaker id if needed
        speaker_id = None
        if SpeakerID in self.sup_data_types_set:
            speaker_id = torch.tensor(sample["speaker_id"]).long()

        return (
            audio,
            audio_length,
            text,
            text_length,
            log_mel,
            log_mel_length,
            durations,
            align_prior_matrix,
            pitch,
            pitch_length,
            energy,
            energy_length,
            speaker_id,
        )
Example #16
    def __getitem__(self, index):
        spec = None
        sample = self.data[index]

        features = self.featurizer.process(sample["audio_filepath"],
                                           trim=self.trim)
        audio, audio_length = features, torch.tensor(features.shape[0]).long()
        if isinstance(sample["text_tokens"], str):
            # If tokenize_text is False for Phone dataset
            text = sample["text_tokens"]
            text_length = None
        else:
            text = torch.tensor(sample["text_tokens"]).long()
            text_length = torch.tensor(len(sample["text_tokens"])).long()
        audio_stem = Path(sample["audio_filepath"]).stem

        # Load mel if it exists
        mel_path = sample["mel_filepath"]
        if mel_path and Path(mel_path).exists():
            log_mel = torch.load(mel_path)
        else:
            mel_path = Path(self.supplementary_folder) / f"mel_{audio_stem}.pt"
            if mel_path.exists():
                log_mel = torch.load(mel_path)
            else:
                # disable autocast to get full range of stft values
                with torch.cuda.amp.autocast(enabled=False):
                    spec = self.stft(audio)

                    # guard is needed for sqrt if grads are passed through
                    guard = CONSTANT  # TODO: Enable 0 if not self.use_grads else CONSTANT
                    if spec.dtype in [torch.cfloat, torch.cdouble]:
                        spec = torch.view_as_real(spec)
                    spec = torch.sqrt(spec.pow(2).sum(-1) + guard)

                    mel = torch.matmul(self.fb.to(spec.dtype), spec)

                    log_mel = torch.log(
                        torch.clamp(mel, min=torch.finfo(mel.dtype).tiny))
                    torch.save(log_mel, mel_path)

        log_mel = log_mel.squeeze(0)
        log_mel_length = torch.tensor(log_mel.shape[1]).long()

        duration_prior = None
        if text_length is not None:
            ### Make the duration attention prior if it does not exist in the supplementary folder
            prior_path = Path(self.supplementary_folder
                              ) / f"pr_tl{text_length}_al_{log_mel_length}.pt"
            if prior_path.exists():
                duration_prior = torch.load(prior_path)
            else:
                duration_prior = beta_binomial_prior_distribution(
                    text_length, log_mel_length)
                duration_prior = torch.from_numpy(duration_prior)
                torch.save(duration_prior, prior_path)

        # Load pitch file (F0s)
        pitch_path = (
            Path(self.supplementary_folder) /
            f"{audio_stem}_pitch_pyin_fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_fl{self.win_length}_hs{self.hop_len}.pt"
        )
        if pitch_path.exists():
            pitch = torch.load(pitch_path)
        else:
            pitch, _, _ = librosa.pyin(
                audio.numpy(),
                fmin=self.pitch_fmin,
                fmax=self.pitch_fmax,
                frame_length=self.win_length,
                sr=self.sample_rate,
                fill_na=0.0,
            )
            pitch = torch.from_numpy(pitch)
            torch.save(pitch, pitch_path)
        # Standardize pitch
        pitch -= self.pitch_avg
        # Zero out values that were previously zero
        pitch[pitch == -self.pitch_avg] = 0.0
        pitch /= self.pitch_std

        # Load energy file (L2-norm of the amplitude of each STFT frame of an utterance)
        energy_path = Path(
            self.supplementary_folder
        ) / f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
        if energy_path.exists():
            energy = torch.load(energy_path)
        else:
            if spec is None:
                spec = self.stft(audio)
            energy = torch.linalg.norm(spec.squeeze(0), axis=0)
            # Save to new file
            torch.save(energy, energy_path)

        return text, text_length, log_mel, log_mel_length, audio, audio_length, duration_prior, pitch, energy
Example #17
def add_session_data(df_features, labels_df, emotion_dict, audio_vectors_path,
                     sess, columns):
    audio_vectors = pickle.load(open(audio_vectors_path, 'rb'))
    for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains(
            'Ses0{}'.format(sess))].iterrows()):
        try:
            wav_file_name = row['wav_file']
            label = emotion_dict[row['emotion']]
            y = audio_vectors[wav_file_name]

            feature_list = [wav_file_name, label]  # wav_file, label
            sig_mean = np.mean(abs(y))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(y))  # sig_std

            rmse = librosa.feature.rms(y=y + 0.0001)[0]
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std

            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence

            y_harmonic = librosa.effects.hpss(y)[0]
            feature_list.append(np.mean(y_harmonic) *
                                1000)  # harmonic (scaled by 1000)

            # based on the pitch detection algorithm mentioned here:
            # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001
            cl = 0.45 * sig_mean
            center_clipped = []
            for s in y:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                elif np.abs(s) < cl:
                    center_clipped.append(0)
            p3 = time.time()
            #auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            pitch, _, _ = librosa.pyin(y,
                                       fmin=librosa.note_to_hz('C2'),
                                       fmax=librosa.note_to_hz('C7'))
            pitch = [0 if math.isnan(p) else p for p in pitch]
            p4 = time.time()
            print("audio size: {}, pitch:{}".format(
                len(y) / 44100.0, (p4 - p3)))

            feature_list.append(np.mean(pitch))
            feature_list.append(np.std(pitch))
            #feature_list.append(1000 * np.max(auto_corrs)/len(auto_corrs))  # auto_corr_max (scaled by 1000)
            #feature_list.append(np.std(auto_corrs))  # auto_corr_std

            # DataFrame.append was removed in pandas 2.x; concat is the drop-in replacement
            df_features = pd.concat(
                [df_features,
                 pd.DataFrame(feature_list, index=columns).transpose()],
                ignore_index=True)
        except Exception as e:
            print('Some exception occurred: {}'.format(e))
    return df_features
Example #18
import sounddevice as sd
from scipy.io.wavfile import write
import librosa
import librosa.display
import numpy as np

fs = 44100  # sample rate
seconds = 4  # duration of the recording, in seconds

print('start recording')
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()  # wait until recording is finished
print('finished recording')
write('output.wav', fs, myrecording)  # save as a wav file
y, sr = librosa.load('output.wav')
f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('E2'), fmax=librosa.note_to_hz('E4'))
times = librosa.times_like(f0)
import matplotlib.pyplot as plt
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
fig, ax = plt.subplots()
img = librosa.display.specshow(D, x_axis='time', y_axis='log', ax=ax)
ax.set(title='pYIN fundamental frequency estimation')
fig.colorbar(img, ax=ax, format="%+2.f dB")
ax.plot(times, f0, label='f0', color='cyan', linewidth=3)
ax.legend(loc='upper right')
plt.show()
Example #19
Audio(data=y, rate=sr)

# %%
# Sonifying pitch estimates
# -------------------------
# As a slightly more advanced example, we can
# use sonification to directly observe the output of a
# fundamental frequency estimator.
#
# We'll do this using `librosa.pyin` for analysis,
# and `mir_eval.sonify.pitch_contour` for synthesis.

# Using fill_na=None retains the best-guess f0 at unvoiced frames
f0, voiced_flag, voiced_probs = librosa.pyin(y,
                                             sr=sr,
                                             fmin=librosa.note_to_hz('C2'),
                                             fmax=librosa.note_to_hz('C7'),
                                             fill_na=None)

# To synthesize the f0, we'll need sample times
times = librosa.times_like(f0)

# %%
# mir_eval's synthesizer uses negative f0 values to indicate
# unvoiced regions.
#
# We'll make an array vneg which is 1 for voiced frames, and
# -1 for unvoiced frames.
# This way, `f0 * vneg` will leave voiced estimates unchanged,
# and negate the frequency for unvoiced frames.
vneg = (-1)**(~voiced_flag)
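The excerpt stops just before the synthesis step; one way to finish it, assuming mir_eval is installed:

import mir_eval.sonify

# Negative frequencies tell mir_eval's synthesizer which frames are unvoiced
y_f0 = mir_eval.sonify.pitch_contour(times, f0 * vneg, fs=sr, length=len(y))
Audio(data=y_f0, rate=sr)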
Example #20
def probabilities(y, note_min, note_max, sr, frame_length, window_length,
                  hop_length, pitch_acc, voiced_acc, onset_acc, spread):
    """
    Estimate prior (observed) probabilities from audio signal

    Parameters
    ----------
    y : 1-D numpy array
        Array containing audio samples

    note_min : string, 'A#4' format
        Lowest note supported by this estimator
    note_max : string, 'A#4' format
        Highest note supported by this estimator
    sr : int
        Sample rate.
    frame_length : int
    window_length : int
    hop_length : int
        Parameters for FFT estimation
    pitch_acc : float, between 0 and 1
        Probability (estimated) that the pitch estimator is correct.
    voiced_acc : float, between 0 and 1
        Estimated accuracy of the "voiced" parameter.
    onset_acc : float, between 0 and 1
        Estimated accuracy of the onset detector.
    spread : float, between 0 and 1
        Probability that the singer/musician had a one-semitone deviation
        due to vibrato or glissando.
    Returns
    -------
    P : 2D numpy array.
        P[j,t] is the prior probability of being in state j at time t.
    """

    fmin = librosa.note_to_hz(note_min)
    fmax = librosa.note_to_hz(note_max)
    midi_min = librosa.note_to_midi(note_min)
    midi_max = librosa.note_to_midi(note_max)
    n_notes = midi_max - midi_min + 1

    # F0 and voicing
    f0, voiced_flag, voiced_prob = librosa.pyin(y,
                                                fmin=fmin * 0.9,
                                                fmax=fmax * 1.1,
                                                sr=sr,
                                                frame_length=frame_length,
                                                win_length=window_length,
                                                hop_length=hop_length)
    tuning = librosa.pitch_tuning(f0)
    # tuning is in fractions of a semitone, so correct in MIDI space, not in Hz
    f0_ = np.round(librosa.hz_to_midi(f0) - tuning).astype(int)
    onsets = librosa.onset.onset_detect(y=y,
                                        sr=sr,
                                        hop_length=hop_length,
                                        backtrack=True)

    P = np.ones((n_notes * 2 + 1, len(f0)))

    for t in range(len(f0)):
        # State 0 is silence: likely when the frame is unvoiced, unlikely otherwise.
        # Odd states (onsets) get onset_acc at detected onsets; even states
        # (sustained notes) get pitch_acc at the estimated note, spread to
        # adjacent semitones.
        if not voiced_flag[t]:
            P[0, t] = voiced_acc
        else:
            P[0, t] = 1 - voiced_acc

        for j in range(n_notes):
            if t in onsets:
                P[(j * 2) + 1, t] = onset_acc
            else:
                P[(j * 2) + 1, t] = 1 - onset_acc

            if j + midi_min == f0_[t]:
                P[(j * 2) + 2, t] = pitch_acc

            elif np.abs(j + midi_min - f0_[t]) == 1:
                P[(j * 2) + 2, t] = pitch_acc * spread

            else:
                P[(j * 2) + 2, t] = 1 - pitch_acc

    return P
Example #21
def make_plot_pitches(wave,
                      chunk_labels,
                      chunk_size,
                      class_array,
                      path,
                      sr=44100):

    plt.clf()

    f0, voiced_flag, voiced_probs = librosa.pyin(y=wave,
                                                 sr=sr,
                                                 fmin=librosa.note_to_hz('C3'),
                                                 fmax=librosa.note_to_hz('C5'))
    times = librosa.times_like(f0, sr=44100)

    max_f = np.nanmax(f0)
    min_f = np.nanmin(f0)

    ref_values = 261.3 * (2.0**np.linspace(-2.0, 2.0, 2 * 24))
    ref_ticks = librosa.hz_to_note(ref_values)

    plt.plot(times, f0, color='k')
    plt.fill_between(times, 35 / 36 * f0, 36 / 35 * f0, color='lightgrey')

    for chunk in range(len(chunk_labels)):
        # print(chunk)
        alpha = 1.0
        if chunk_labels[chunk] == 0:
            col = 'w'
        if chunk_labels[chunk] == 1:
            col = 'w'  # 'r'
        if chunk_labels[chunk] == 2:
            if class_array[chunk] == 0:
                col = 'gold'

            if class_array[chunk] == 1:
                col = 'lightblue'  #'tab:blue'#

            if class_array[chunk] == 2:
                col = 'red'  #'red'
                alpha = 0.85

        plt.axvspan(chunk * chunk_size / sr, (chunk + 1) * chunk_size / sr,
                    facecolor=col,
                    alpha=alpha,
                    zorder=-1)

    # The reference lines and axis settings only need to be applied once,
    # after the per-chunk shading above
    for value in ref_values[np.logical_and(
            (ref_values < (18 / 17) * max_f),
            (ref_values > min_f * (17 / 18)))]:
        plt.axhline(value, color='k', linestyle=':', linewidth=0.5)

    plt.semilogy()
    ax = plt.gca()
    ax.yaxis.set_minor_formatter(matplotlib.ticker.NullFormatter())
    plt.yticks(ref_values, ref_ticks)

    ax.set_xlim(left=-0.01)
    ax.set_ylim(top=(18 / 17) * max_f, bottom=min_f * (17 / 18))
    ax.set_xlabel('time [s]', weight='bold')
    ax.set_xticks(range(0, int(np.ceil(times[-1]) + 1)))

    figc = plt.gcf()
    figc.patch.set_facecolor('gainsboro')  #whitesmoke
    figc.patch.set_alpha(0.20)
    figc.tight_layout()
    figc.subplots_adjust(bottom=0.15)
    print('saving_plot!')
    print(path)
    #plt.savefig(os.path.join(path, 'pitch_plot.png'),dpi=300 )
    plt.savefig(os.path.join(path, 'pitch_plot.svg'), format='svg', dpi=300)

    return 'pitch_plot.svg'
Example #22
def predict(sample_file, target_midi):  # data is in numpy

    train_loader, test_loader, validate_loader = load_dataset()
    model = PitchGRU()
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    loss = nn.CrossEntropyLoss()
    model.to(device)

    train_and_test = Train_And_Test(model, optimizer, loss, train_loader,
                                    test_loader, validate_loader)
    model = train_and_test.load_model(
        'models/rnn-pitch-estimation-21-81-0.001-32-max-accuracy.pt')

    target_pitch = librosa.midi_to_hz(target_midi)
    print('Target = ', target_pitch)
    print('-----------------------------------------------------------')
    print('Begin predictions - ')
    padded_sample = np.zeros(64000)
    fig, ax = plt.subplots(figsize=(15, 15))
    camera = Camera(fig)

    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12

    plt.rc('font', size=MEDIUM_SIZE)  # controls default text sizes
    plt.rc('axes', titlesize=BIGGER_SIZE)  # fontsize of the axes title
    plt.rc('axes', labelsize=BIGGER_SIZE)  # fontsize of the x and y labels
    plt.rc('xtick', labelsize=BIGGER_SIZE)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=BIGGER_SIZE)  # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)  # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

    plt.xlim(0, 64000)

    def numfmt(x, pos):  # your custom formatter function: divide by 16000.0
        s = '{}'.format(x / 16000.0)
        return s

    yfmt = matplotlib.ticker.FuncFormatter(numfmt)
    plt.gca().xaxis.set_major_formatter(yfmt)

    for i in range(31):
        start_ts = i * 0.128

        data, _ = librosa.load(sample_file,
                               sr=16000,
                               offset=start_ts,
                               duration=0.128)
        audio_data_stft = librosa.stft(data, n_fft=(2048 - 1) * 2)
        sample = np.concatenate([
            np.reshape(data, (2048, 1)) * 2,
            np.abs(audio_data_stft),
            np.angle(audio_data_stft)
        ],
                                axis=1)
        sample = torch.from_numpy(sample)
        sample = sample.float()
        sample = sample.to(device)
        sample = sample.view(-1, sample.shape[0], 7)  # batch_size X 2048 X 7

        prediction = -1
        model.eval()
        with torch.no_grad():
            output, hidden = model(
                sample)  # Output Shape = batch_size, 1, 88 (num pitches)
            output = output[:, int(sample_length / 2), :]
            output = output.view(-1, len(classes))
            prediction = librosa.midi_to_hz(classes[output.argmax(dim=1)])

        pyin_f0, _, _ = librosa.pyin(data,
                                     fmin=librosa.note_to_hz('C0'),
                                     fmax=librosa.note_to_hz('C8'),
                                     sr=16000,
                                     frame_length=1000,
                                     hop_length=1000)
        pyin_f0 = np.nan_to_num(pyin_f0)
        pyin_f0 = np.max(pyin_f0)

        data_tensor = torch.from_numpy(data).view(1, -1)
        torchaudio_f0 = torchaudio.functional.detect_pitch_frequency(
            data_tensor, sample_rate=16000, frame_time=0.008,
            freq_low=40).item()

        crepe_f0 = torchcrepe.predict(data_tensor,
                                      16000,
                                      2048,
                                      40,
                                      3400,
                                      'full',
                                      batch_size=1,
                                      device=device)
        crepe_f0 = np.mean(crepe_f0.numpy())
        print(
            'At start time {:.2f} seconds - GRU Prediction={:.2f}Hz; pYIN={:.2f}Hz; torchaudio={:.2f}Hz; Torch Crepe={:.2f}Hz'
            .format(start_ts, prediction, pyin_f0, torchaudio_f0, crepe_f0))

        padded_sample[i * sample_length:(i * sample_length) +
                      sample_length] = data

        plt.plot(padded_sample, color='blue', alpha=0.5)
        ax.text(0.1, 1.01, " ", transform=ax.transAxes, fontsize='x-large')

        ax.text(0.1,
                1.01,
                "Target pitch = {:.2f}Hz, \
                            \nGRU predicted pitch = {:.2f}Hz, \
                            \npYIN = {:.2f}Hz, \
                            \nTorch Audio = {:.2f}Hz, \
                            \nTorch Crepe = {:.2f}Hz, ".format(
                    target_pitch, prediction, pyin_f0, torchaudio_f0,
                    crepe_f0),
                transform=ax.transAxes,
                fontsize='xx-large')
        camera.snap()
    animation = camera.animate()
    animation.save('animation-{}.gif'.format(target_midi))

    print('-------------------------------------------------------------')
Example #23
    def __getitem__(self, index):
        sample = self.data[index]
        audio_stem = Path(sample["audio_filepath"]).stem

        features = self.featurizer.process(sample["audio_filepath"],
                                           trim=self.trim)
        audio, audio_length = features, torch.tensor(features.shape[0]).long()

        text = torch.tensor(sample["text_tokens"]).long()
        text_length = torch.tensor(len(sample["text_tokens"])).long()

        log_mel, log_mel_length = None, None
        if LogMel in self.sup_data_types_set:
            mel_path = sample["mel_filepath"]

            if mel_path is not None and Path(mel_path).exists():
                log_mel = torch.load(mel_path)
            else:
                mel_path = Path(self.sup_data_path) / f"mel_{audio_stem}.pt"

                if mel_path.exists():
                    log_mel = torch.load(mel_path)
                else:
                    log_mel = self.get_log_mel(audio)
                    torch.save(log_mel, mel_path)

            log_mel = log_mel.squeeze(0)
            log_mel_length = torch.tensor(log_mel.shape[1]).long()

        durations = None
        if Durations in self.sup_data_types_set:
            durations = self.durs[index]

        duration_prior = None
        if DurationPrior in self.sup_data_types_set:
            if self.use_beta_binomial_interpolator:
                mel_len = self.get_log_mel(audio).shape[2]
                duration_prior = torch.from_numpy(
                    self.beta_binomial_interpolator(mel_len,
                                                    text_length.item()))
            else:
                prior_path = Path(self.sup_data_path) / f"pr_{audio_stem}.pt"

                if prior_path.exists():
                    duration_prior = torch.load(prior_path)
                else:
                    mel_len = self.get_log_mel(audio).shape[2]
                    duration_prior = beta_binomial_prior_distribution(
                        text_length, mel_len)
                    duration_prior = torch.from_numpy(duration_prior)
                    torch.save(duration_prior, prior_path)

        pitch, pitch_length = None, None
        if Pitch in self.sup_data_types_set:
            pitch_name = (f"{audio_stem}_pitch_pyin_"
                          f"fmin{self.pitch_fmin}_fmax{self.pitch_fmax}_"
                          f"fl{self.win_length}_hs{self.hop_len}.pt")

            pitch_path = Path(self.sup_data_path) / pitch_name
            if pitch_path.exists():
                pitch = torch.load(pitch_path).float()
            else:
                pitch, _, _ = librosa.pyin(
                    audio.numpy(),
                    fmin=self.pitch_fmin,
                    fmax=self.pitch_fmax,
                    frame_length=self.win_length,
                    sr=self.sample_rate,
                    fill_na=0.0,
                )
                pitch = torch.from_numpy(pitch).float()
                torch.save(pitch, pitch_path)

            if self.pitch_avg is not None and self.pitch_std is not None and self.pitch_norm:
                pitch -= self.pitch_avg
                # Zero out values that were previously zero
                pitch[pitch == -self.pitch_avg] = 0.0
                pitch /= self.pitch_std

            pitch_length = torch.tensor(len(pitch)).long()

        energy, energy_length = None, None
        if Energy in self.sup_data_types_set:
            energy_path = Path(
                self.sup_data_path
            ) / f"{audio_stem}_energy_wl{self.win_length}_hs{self.hop_len}.pt"
            if energy_path.exists():
                energy = torch.load(energy_path).float()
            else:
                spec = self.get_spec(audio)
                energy = torch.linalg.norm(spec.squeeze(0), axis=0).float()
                torch.save(energy, energy_path)

            energy_length = torch.tensor(len(energy)).long()

        speaker_id = None
        if SpeakerID in self.sup_data_types_set:
            speaker_id = torch.tensor(sample["speaker_id"]).long()

        return (
            audio,
            audio_length,
            text,
            text_length,
            log_mel,
            log_mel_length,
            durations,
            duration_prior,
            pitch,
            pitch_length,
            energy,
            energy_length,
            speaker_id,
        )
Example #24
def get_first_null_f0(items_handler: ItemsHandler,
                      start_offset: float,
                      min_duration: float,
                      end_offset: ty.Optional[float] = None,
                      min_note: str = 'C1',
                      max_note: str = 'C7',
                      frame_length: float = 2048,
                      win_length: ty.Optional[float] = None,
                      offset_units: LengthUnit = LengthUnit.ms,
                      length_units: LengthUnit = LengthUnit.samples) -> float:
    audio = items_handler.load_audio()[0]
    sr = items_handler.sr

    if length_units != LengthUnit.samples:
        if length_units != LengthUnit.ms:
            raise TypeError('length_units can be only of ms or samples')
        frame_length = length_convert(frame_length, sr, length_units,
                                      LengthUnit.samples)

        if win_length:
            win_length = length_convert(win_length, sr, length_units,
                                        LengthUnit.samples)
    hop_length = int(frame_length // 4)
    start_offset_int = ty.cast(
        int, length_convert(start_offset, sr, offset_units,
                            LengthUnit.samples))

    if start_offset_int:
        audio = audio[start_offset_int:]  # type:ignore
    if end_offset:
        end_offset_int = ty.cast(
            int,
            length_convert(end_offset, sr, offset_units, LengthUnit.samples))
        audio = audio[:end_offset_int - start_offset_int]  # type:ignore
    min_duration_frms = length_convert(min_duration,
                                       sr,
                                       offset_units,
                                       LengthUnit.frames,
                                       hop_length=hop_length)
    fmin, fmax = lr.note_to_hz(min_note), lr.note_to_hz(max_note)
    f0s, v_flag, v_prob = lr.pyin(
        audio,
        fmin=fmin,
        fmax=fmax,
        sr=sr,
        win_length=None if win_length is None else win_length,
        frame_length=frame_length,
    )
    # print(list(zip(f0s, v_flag)))
    nulls = np.where(~v_flag)[0]
    # print(nulls)
    if nulls.size == 0:
        raise PitchError(f'Cannot find any unvoiced frame: {v_flag}')
    for val in nulls:
        # print(val)
        if val >= min_duration_frms:
            # guard the lookahead at the final frame
            if val + 1 < len(v_flag) and v_flag[val + 1]:
                # print(f'skipping {val}')
                continue
            break

    if val < 5:
        raise PitchError(
            f'Cannot find null f0 at the reasonable frame (>=5): {v_flag}')
    val_normalized = length_convert(val,
                                    sr,
                                    LengthUnit.frames,
                                    offset_units,
                                    hop_length=hop_length)
    # print(val_normalized, )
    return start_offset + val_normalized