Beispiel #1
0
def preprocess(filename, timidity, latency, truncate, pad=1, get_raw=False):
    """
    Preprocess an audio file and its MIDI counterpart. Computes transforms and labels.

    The MIDI file name is derived from the audio file name by replacing its
    extension with ``.mid``.

    :param filename: audio filename
    :param timidity: set to True if the file was rendered with timidity
    :param latency: in seconds
    :param truncate: in seconds (0 for no truncation)
    :param pad: in seconds, will be added at the start and end before spectral transforms
    :param get_raw: set to True to return raw computed spectrograms (e.g. for visualization)
    :return: when ``get_raw`` is False, the tuple
        ``(tensor_mel, tensor_cqt_select, onset_labels, onset_caracs,
        notes_labels_select, notes_caracs_select, dname)`` where the
        ``*_select`` arrays only keep the windows labelled as onsets;
        when ``get_raw`` is True, the tuple
        ``(melgrams, tensor_mel, onset_labels, cqgram, tensor_cqt, time,
        FreqAxisLog, max_len, step)``.
    """
    import os

    # Only strip the final extension: splitting on every '.' would break on
    # paths such as './dir/file.wav' or 'song.v2.wav'.
    filename_midi = os.path.splitext(filename)[0] + '.mid'

    # Flattened path, returned to the caller as an identifier for this file.
    dname = filename.replace('/', '_').replace('\\', '_')

    # Load files. Durations are converted to samples assuming 44.1 kHz.
    ipad = int(pad * 44100)
    audio_pad = (ipad, ipad)  # blank padding at the beginning and at the end
    if truncate > 0:
        audio = AudioFile(filename,
                          truncate=int(truncate * 44100),
                          pad=audio_pad)
    else:
        audio = AudioFile(filename, pad=audio_pad)
    mid = MidiFile(filename_midi)

    step = 0.02  # analysis hop, in seconds
    latency = int(round(latency / step, 0))  # latency expressed in windows

    # Compute spectrograms
    spectrograms = ComputeSpectrograms(audio, step=step)

    # Compute filtered spectrograms
    melgrams = ComputeMelLayers(spectrograms, step, audio.Fs, latency)

    # Build the input tensor
    cnn_window = 15
    tensor_mel = BuildTensor(melgrams[:, 2], cnn_window)

    # Compute CQT
    FreqAxisLog, time, cqgram = ComputeCqt(audio,
                                           200.,
                                           4000.,
                                           step,
                                           latency,
                                           r=3)
    tensor_cqt = BuildTensor([
        cqgram,
    ], cnn_window)

    # Global data length: both tensors are cut to the shortest one
    max_len = min(tensor_mel.shape[0], tensor_cqt.shape[0])

    # Compute output labels
    notes = mid.getNotes(timidity)

    # onset_caracs columns:
    #   0: number of notes starting in the window
    #   1: maximum volume among those notes
    #   2: number of blank windows since the last onset
    #   3: number of windows before the next onset
    #   4: minimum of columns 2 and 3
    onset_labels = np.zeros(max_len)
    onset_caracs = np.zeros((max_len, 5))
    onset_caracs[:, 2] = np.arange(max_len)

    note_low = 21  # lowest midi note on a keyboard
    note_high = 108  # highest midi note on a keyboard
    n_keys = note_high - note_low + 1

    notes_labels = np.zeros((max_len, n_keys))
    notes_caracs = np.zeros((max_len, n_keys))

    # The padding shifts every onset forward in the padded audio.
    pad_seconds = audio_pad[0] / audio.Fs

    # Iterating over the notes directly (instead of slicing a 2-D array built
    # from them) also works when the MIDI file contains no note at all.
    for note in notes:
        onset, key, volume = note[0], int(note[1]), note[2]
        t_win = int(np.floor((onset + pad_seconds) / step))
        if t_win >= max_len:
            # Notes are expected in onset order (the column-2 bookkeeping
            # below relies on it), so every later note is out of range too.
            break
        if t_win < 0:
            continue

        onset_labels[t_win] = 1
        onset_caracs[t_win, 0] += 1  # nb_notes
        onset_caracs[t_win, 1] = max(onset_caracs[t_win, 1],
                                     volume)  # max volume
        if t_win + 1 < max_len:
            # Restart the "windows since last onset" counter right after
            # this onset.
            onset_caracs[t_win + 1:, 2] -= onset_caracs[t_win + 1, 2]

        n = key - note_low
        # Ignore keys outside the keyboard range: a negative index would
        # silently wrap around and label a wrong (high) key.
        if 0 <= n < n_keys:
            notes_labels[t_win, n] = 1
            notes_caracs[t_win, n] = volume  # volume

    # Walk backwards to count the windows separating each one from the next
    # onset.
    counter = 0
    for i in range(max_len - 1, -1, -1):
        onset_caracs[i, 3] = counter
        counter = 0 if onset_labels[i] == 1 else counter + 1
    onset_caracs[:, 4] = np.minimum(onset_caracs[:, 2], onset_caracs[:, 3])

    # Extract useful CQT: keep only the windows labelled as onsets.
    # flatnonzero always yields integer indices, even when nothing matches.
    select = np.flatnonzero(onset_labels > 0)
    tensor_cqt_select = np.take(tensor_cqt, select, axis=0)
    notes_labels_select = np.take(notes_labels, select, axis=0)
    notes_caracs_select = np.take(notes_caracs, select, axis=0)

    if get_raw:
        return (melgrams, tensor_mel, onset_labels, cqgram, tensor_cqt, time,
                FreqAxisLog, max_len, step)
    return (tensor_mel[:max_len, ...], tensor_cqt_select, onset_labels,
            onset_caracs, notes_labels_select, notes_caracs_select, dname)