Example #1
def extract_cfp(filename, down_fs=44100, **kwargs):
    """CFP feature extraction function.

    Given the audio path, returns the CFP feature. The computation is
    automatically processed in parallel to accelerate feature extraction.

    Parameters
    ----------
    filename: Path
        Path to the audio.
    hop: float
        Hop size in seconds, with regard to the sampling rate.
    win_size: int
        Window size.
    fr: float
        Frequency resolution.
    fc: float
        Lowest start frequency.
    tc: float
        Inverse of the highest frequency bound.
    g: list[float]
        Power factor of the output STFT results.
    bin_per_octave: int
        Number of bins in each octave.
    down_fs: int
        Resample to this sampling rate, if the loaded audio has a different value.
    max_sample: int
        Maximum number of frames to be processed in each computation batch. Reduce
        this value if you run out of RAM.

    Returns
    -------
    Z
        Multiplication of the spectrum and cepstrum.
    tfrL0
        Spectrum of the audio.
    tfrLF
        Generalized Cepstrum of Spectrum (GCoS).
    tfrLQ
        Cepstrum of the audio.
    cen_freq
        Central frequencies of the feature bins.

    References
    ----------
    The CFP approach was first proposed in [1]_.

    .. [1] L. Su and Y. Yang, "Combining Spectral and Temporal Representations for Multipitch Estimation of Polyphonic
       Music," in IEEE/ACM Transactions on Audio, Speech, and Language Processing, 2015.
    """
    logger.debug("Loading audio: %s", filename)
    x, fs = load_audio(filename, sampling_rate=down_fs)
    return _extract_cfp(x, fs, down_fs=fs, **kwargs)
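
A hedged usage sketch of the function above; "song.wav" is a hypothetical audio file and the keyword values shown are purely illustrative.

# Hedged usage sketch: assumes extract_cfp (defined above) is importable or in scope.
Z, tfrL0, tfrLF, tfrLQ, cen_freq = extract_cfp("song.wav", down_fs=44100, hop=0.02)

# Z is the spectrum-cepstrum product; cen_freq maps each feature bin to Hz.
print(Z.shape, len(cen_freq))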
Example #2
def extract_cqt(audio_path,
                sampling_rate=44100,
                lowest_note=16,
                note_num=120,
                a_hop=256,
                pad_sec=1):
    """
    Compute the constant-Q spectrogram of the given audio, then normalize and
    log-scale it.

    Parameters
    ----------
    audio_path: Path
        Path to the input audio.
    sampling_rate: int
        Sampling rate of the audio data; should be ``DOWN_SAMPLE_TO_SAPMLING_RATE``.
    lowest_note: int
        Lowest MIDI note number.
    note_num: int
        Number of total notes. The highest note number would thus be `lowest_note` + `note_num`.
    a_hop: int
        Hop size for computing CQT.
    pad_sec: float
        Length of padding, in seconds, added to the beginning and end of the raw audio data.

    Returns
    -------
    midi_gram: np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of the input audio data.
    """
    logger.debug("Loading audio: %s", audio_path)
    audio_data, _ = load_audio(audio_path, sampling_rate=sampling_rate)

    # Round in case pad_sec is a float (np.zeros requires an integer length).
    zeros = np.zeros(round(pad_sec * sampling_rate))
    padded_audio = np.concatenate([zeros, audio_data, zeros])

    # Compute CQT of the padded audio data
    logger.debug("Extracting CQT feature with librosa")
    audio_gram = librosa.cqt(padded_audio,
                             sr=sampling_rate,
                             hop_length=a_hop,
                             fmin=librosa.midi_to_hz(lowest_note),
                             n_bins=note_num)

    # L2-normalize and log-scale the magnitude
    logger.debug("Post-processing CQT feature...")
    return post_process_cqt(audio_gram)
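
A hedged usage sketch of extract_cqt; "song.wav" is a hypothetical path and the resulting array shape depends on post_process_cqt.

# Hedged usage sketch: assumes extract_cqt (defined above) is in scope.
midi_gram = extract_cqt("song.wav", sampling_rate=44100, lowest_note=16,
                        note_num=120, a_hop=256, pad_sec=1)
print(midi_gram.shape)  # exact shape depends on post_process_cqt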
Example #3
def extract_beat_with_madmom(audio_path, sampling_rate=44100):
    """Extract beat position (in seconds) of the audio.

    Extract beats with a mixture of beat-tracking techniques using madmom.

    Parameters
    ----------
    audio_path: Path
        Path to the target audio.
    sampling_rate: int
        Sampling rate to which the audio will be resampled.

    Returns
    -------
    beat_arr: 1D numpy array
        Contains beat positions in seconds.
    audio_len_sec: float
        Total length of the audio in seconds.
    """
    logger.debug("Loading audio: %s", audio_path)
    audio_data, _ = load_audio(audio_path, sampling_rate=sampling_rate)
    logger.debug("Runnig beat tracking...")
    beat_arr = MadmomBeatTracking().process(audio_data)
    return beat_arr, len(audio_data) / sampling_rate
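
A hedged usage sketch of the beat extractor above; "song.wav" is a hypothetical audio file.

# Hedged usage sketch: assumes extract_beat_with_madmom (defined above) is in scope.
beat_arr, audio_len_sec = extract_beat_with_madmom("song.wav", sampling_rate=44100)
print("%d beats detected over %.1f seconds" % (len(beat_arr), audio_len_sec))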
Example #4
    def transcribe(self, input_audio, model_path=None, output="./"):
        """Transcribe vocal notes in the audio.

        This function transcribes onset, offset, and pitch of the vocal in the audio.
        This module is responsible for predicting the onset and offset times of each
        note, while pitches are estimated by the `vocal-contour` submodule.

        Parameters
        ----------
        input_audio: Path
            Path to the raw audio file (.wav).
        model_path: Path
            Path to the trained model or the supported transcription mode.
        output: Path (optional)
            Path for writing out the transcribed MIDI file. Defaults to the current path.

        Returns
        -------
        midi: pretty_midi.PrettyMIDI
            The transcribed vocal notes.

        Outputs
        -------
        This function will output three files, as listed below:

        - <song>.mid: the MIDI file with the complete transcription results, rendered with a piano soundfont.
        - <song>_f0.csv: pitch contour information of the vocal.
        - <song>_trans.wav: the rendered pitch contour audio.

        See Also
        --------
        omnizart.cli.vocal.transcribe: CLI entry point of this function.
        omnizart.vocal_contour.transcribe: Pitch estimation function.
        """
        logger.info("Separating vocal track from the audio...")
        separator = Separator('spleeter:2stems')

        # Workaround for the annoying TensorFlow graph-finalization issue.
        separator._params["stft_backend"] = "librosa"  # pylint: disable=protected-access

        wav, fs = load_audio(input_audio, mono=False)
        pred = separator.separate(wav)

        logger.info("Loading model...")
        model, model_settings = self._load_model(model_path)

        logger.info("Extracting feature...")
        wav = librosa.to_mono(pred["vocals"].squeeze().T)
        feature = _extract_vocal_cfp(
            wav,
            fs,
            down_fs=model_settings.feature.sampling_rate,
            hop=model_settings.feature.hop_size,
            fr=model_settings.feature.frequency_resolution,
            fc=model_settings.feature.frequency_center,
            tc=model_settings.feature.time_center,
            g=model_settings.feature.gamma,
            bin_per_octave=model_settings.feature.bins_per_octave)

        logger.info("Predicting...")
        pred = predict(feature, model)

        logger.info("Infering notes...")
        interval = infer_interval(
            pred,
            ctx_len=model_settings.inference.context_length,
            threshold=model_settings.inference.threshold,
            min_dura=model_settings.inference.min_duration,
            t_unit=model_settings.feature.hop_size)

        logger.info("Extracting pitch contour")
        agg_f0 = vcapp.app.transcribe(
            input_audio,
            model_path=model_settings.inference.pitch_model,
            output=output)

        logger.info("Inferencing MIDI...")
        midi = infer_midi(interval,
                          agg_f0,
                          t_unit=model_settings.feature.hop_size)

        self._output_midi(output=output, input_audio=input_audio, midi=midi)
        logger.info("Transcription finished")
        return midi
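
A hedged usage sketch of the method above. The application class name `VocalTranscription` and the import path `omnizart.vocal.app` are assumptions inferred from the surrounding module names, and "song.wav" is a hypothetical input file.

# Hedged usage sketch; the class name and module path are assumptions.
from omnizart.vocal.app import VocalTranscription

app = VocalTranscription()
midi = app.transcribe("song.wav", output="./")      # "song.wav" is hypothetical
print(len(midi.instruments), midi.get_end_time())   # pretty_midi.PrettyMIDI API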
Example #5
def test_load_audio():
    audio = "./tests/resource/sample.wav"
    data, fs = io.load_audio(audio, sampling_rate=44100, mono=False)
    assert fs == 44100
    assert data.shape == (2065124, 2)
Example #6
def extract_vocal_cfp(filename, down_fs=16000, **kwargs):
    """Specialized CFP feature extraction for vocal submodule."""
    logger.debug("Loading audio: %s", filename)
    x, fs = load_audio(filename, sampling_rate=down_fs)
    logger.debug("Extracting vocal feature")
    return _extract_vocal_cfp(x, fs, **kwargs)
Example #7
    def transcribe(self, input_audio, model_path=None, output="./"):
        """Transcribe vocal notes in the audio.

        This function transcribes onset, offset, and pitch of the vocal in the audio.
        This module is reponsible for predicting onset and offset time of each note,
        and pitches are estimated by the `vocal-contour` submodule.

        Parameters
        ----------
        input_audio: Path
            Path to the raw audio file (.wav).
        model_path: Path
            Path to the trained model or the supported transcription mode.
        output: Path (optional)
            Path for writing out the transcribed MIDI file. Defaults to the current path.

        Returns
        -------
        midi: pretty_midi.PrettyMIDI
            The transcribed vocal notes.

        Outputs
        -------
        This function will output three files, as listed below:

        - <song>.mid: the MIDI file with the complete transcription results, rendered with a piano soundfont.
        - <song>_f0.csv: pitch contour information of the vocal.
        - <song>_trans.wav: the rendered pitch contour audio.

        See Also
        --------
        omnizart.cli.vocal.transcribe: CLI entry point of this function.
        omnizart.vocal_contour.transcribe: Pitch estimation function.
        """
        logger.info("Separating vocal track from the audio...")
        command = ["spleeter", "separate", input_audio, "-o", "./"]
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _, error = process.communicate()
        if process.returncode != 0:
            raise SpleeterError(error.decode("utf-8"))

        # Resolve the path of separated output files
        folder_path = jpath("./", get_filename(input_audio))
        vocal_wav_path = jpath(folder_path, "vocals.wav")
        wav, fs = load_audio(vocal_wav_path)

        # Clean out the output files
        shutil.rmtree(folder_path)

        logger.info("Loading model...")
        model, model_settings = self._load_model(model_path)

        logger.info("Extracting feature...")
        feature = _extract_vocal_cfp(
            wav,
            fs,
            down_fs=model_settings.feature.sampling_rate,
            hop=model_settings.feature.hop_size,
            fr=model_settings.feature.frequency_resolution,
            fc=model_settings.feature.frequency_center,
            tc=model_settings.feature.time_center,
            g=model_settings.feature.gamma,
            bin_per_octave=model_settings.feature.bins_per_octave
        )

        logger.info("Predicting...")
        pred = predict(feature, model)

        logger.info("Infering notes...")
        interval = infer_interval(
            pred,
            ctx_len=model_settings.inference.context_length,
            threshold=model_settings.inference.threshold,
            min_dura=model_settings.inference.min_duration,
            t_unit=model_settings.feature.hop_size
        )

        logger.info("Extracting pitch contour")
        agg_f0 = vcapp.app.transcribe(input_audio, model_path=model_settings.inference.pitch_model, output=output)

        logger.info("Inferencing MIDI...")
        midi = infer_midi(interval, agg_f0, t_unit=model_settings.feature.hop_size)

        self._output_midi(output=output, input_audio=input_audio, midi=midi)
        logger.info("Transcription finished")
        return midi
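
A hedged sketch of just the source-separation step used above, written with the standard library instead of the `jpath`/`get_filename` helpers. It assumes the `spleeter` CLI is installed and that "song.wav" is a hypothetical input file; the output layout mirrors what the code above expects from the 2-stems model.

# Hedged sketch of the spleeter separation step in isolation.
import os
import subprocess

result = subprocess.run(
    ["spleeter", "separate", "song.wav", "-o", "./"],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if result.returncode != 0:
    raise RuntimeError(result.stderr.decode("utf-8"))

# Spleeter writes <output_dir>/<song name>/vocals.wav for the 2-stems model.
song_name = os.path.splitext(os.path.basename("song.wav"))[0]
vocal_wav_path = os.path.join("./", song_name, "vocals.wav")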