Example #1
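All three examples below assume the same surrounding module context. A plausible import block for such a preprocessing script is sketched here; `audio` and `hparams` are project-local modules inferred from how they are used in the functions, and `P` is commonly the nnmnkwii preprocessing namespace in WaveNet-vocoder preprocessing scripts (these are assumptions, not taken verbatim from the source repository).

# Assumed imports for the snippets below (sketch, not from the source repo).
import json
import os
from os.path import basename, splitext

import librosa
import numpy as np
import scipy.io.wavfile
from nnmnkwii import preprocessing as P   # provides P.mulaw / P.mulaw_quantize

import audio                     # project-local DSP helpers (assumed)
from hparams import hparams      # project-local hyperparameter object (assumed)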
def _process_utterance(out_dir, wav_path, sp2ind_dir, text):
    with open(sp2ind_dir, 'r') as sp_f:
        sp2ind = json.load(sp_f)
    
    # Speaker ID is the part of the file name before the first underscore;
    # map it to an index, or use -1 if the speaker is not in the mapping.
    sp = wav_path.split('/')[-1].split('.')[0].split('_')[0]
    if sp in sp2ind:
        sp_ind = sp2ind[sp]
    else:
        sp_ind = -1
       
    wav = audio.load_wav(wav_path)
    # Trim leading/trailing silence, except for test utterances.
    if 'test' not in wav_path:
        wav, _ = librosa.effects.trim(wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate, hparams.highpass_cutoff)

    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in the mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T
    mfcc = audio.mfcc(wav).astype(np.float32).T
    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("""Warning: abs max value exceeds 1.0: {}""".format(np.abs(wav).max()))
        # ignore this sample
        #return ("dummy", "dummy","dummy", -1,-1, "dummy")

    wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the spectrograms to disk:
    #name = splitext(basename(wav_path))[0]
    #audio_filename = '%s-wave.npy' % (name)
    #mel_filename = '%s-feats.npy' % (name)
    # NOTE: out_dir is used as a plain filename prefix here, so it is expected
    # to end with a path separator (or a per-utterance prefix).
    audio_filename = f'{out_dir}wave.npy'
    mel_filename = f'{out_dir}mel.npy'
    mfcc_filename = f'{out_dir}mfcc.npy'
    assert mfcc.shape[0] == N
    np.save(audio_filename,
            out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename,
            mel_spectrogram.astype(np.float32), allow_pickle=False)
    np.save(mfcc_filename,
            mfcc.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (out_dir, N, sp_ind, text)
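The zero-padding and trimming near the end of each example exist so that the waveform length becomes an exact multiple of the hop size, which lets a transposed convolution upsample the frame-level conditioning features back to sample rate. Below is a minimal, self-contained sketch of that alignment with assumed hop and FFT sizes; the repo's `audio.pad_lr` / `audio.get_hop_size` may compute the padding differently.

# Minimal sketch of the audio/feature length alignment (assumed values).
import numpy as np

hop_size = 256                      # assumed hop length in samples
fft_size = 1024                     # assumed FFT window length
wav = np.zeros(10_000, dtype=np.float32)

# Number of frames a typical STFT-based feature extractor would produce.
N = 1 + len(wav) // hop_size

# Pad roughly half a window on each side, then trim to exactly N frames of audio.
pad = (fft_size - hop_size) // 2
out = np.pad(wav, (pad, pad), mode="constant", constant_values=0.0)
assert len(out) >= N * hop_size
out = out[:N * hop_size]
assert len(out) % hop_size == 0     # ready for transposed-convolution upsampling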
Example #2
def _process_utterance(out_dir, index, wav_path, text, trim_silence=False):
    # Load the audio to a numpy array:

    wav = audio.load_wav(wav_path)

    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    # TODO: Remove, get this out of here.
    if trim_silence:
        wav, _ = librosa.effects.trim(wav,
                                      top_db=60,
                                      frame_length=2048,
                                      hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate,
                                   hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in the mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in [
            "", "none"
    ]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Clip
    if np.abs(wav).max() > 1.0:
        print("""Warning: abs max value exceeds 1.0: {}""".format(
            np.abs(wav).max()))
        # ignore this sample
        return ("dummy", "dummy", -1, "dummy")

    wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    assert_ready_for_upsampling(out, mel_spectrogram, cin_pad=0, debug=True)

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = "%s-wave.npy" % (name)
    mel_filename = "%s-feats.npy" % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(
        os.path.join(out_dir, mel_filename),
        mel_spectrogram.astype(np.float32),
        allow_pickle=False,
    )

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, N, text)
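For reference, mu-law companding as used by `P.mulaw` and `P.mulaw_quantize` follows the standard formula below. This is a rough numpy sketch of the idea, not the nnmnkwii implementation itself; it also shows why `P.mulaw_quantize(0, ...)` is a sensible padding value, since it is the integer class that digital silence maps to.

# Rough sketch of standard mu-law companding/quantization (assumed to mirror
# what P.mulaw / P.mulaw_quantize compute; edge cases may differ).
import numpy as np

def mulaw(x, mu=255):
    # Compress float audio in [-1, 1]; output stays in [-1, 1].
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize(x, mu=255):
    # Map the companded signal onto integer classes in [0, mu].
    return ((mulaw(x, mu) + 1) / 2 * mu).astype(np.int64)

print(mulaw_quantize(0.0))                          # the "silence" class used for padding
print(mulaw_quantize(np.array([-1.0, 0.0, 1.0])))   # endpoints map to 0 and mu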
Example #3
def _process_song(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    # Trim begin/end silences
    # NOTE: the threshold was chosen for clean signals
    wav, _ = librosa.effects.trim(wav,
                                  top_db=60,
                                  frame_length=2048,
                                  hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(wav, hparams.sample_rate,
                                   hparams.highpass_cutoff)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # Trim silences in the mu-law quantized domain
        silence_threshold = 0
        if silence_threshold > 0:
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    #### CLAIRE Work here
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    os.makedirs('./pwavs', exist_ok=True)
    pwav_path = './pwavs/{0}.wav'.format(wav_name)
    # NOTE: the sample rate is hard-coded to 16 kHz here instead of using hparams.sample_rate.
    scipy.io.wavfile.write(pwav_path, 16000, wav)
    # make the chord directory if it does not exist
    chord_dir = "chord_dir"
    os.makedirs(chord_dir, exist_ok=True)

    # create xml file with notes and timestamps
    #subprocess.check_call(['./extract_chord_notes.sh', wav_path, chord_dir], shell=True)
    #os.system('./extract_chord_notes.sh {0} {1}'.format(pwav_path, chord_dir))
    os.system('./extract_chromagram.sh {0} {1} > /dev/null 2>&1'.format(
        pwav_path, chord_dir))

    note_filename = '{0}/{1}.csv'.format(chord_dir, wav_name)

    #### Instead of computing the mel spectrogram, return a time series of chroma (chord) features.
    # one chroma vector per 2048 audio samples
    note_samples = len(wav) // 2048
    # 24 chroma bins per frame
    chords_time_series = np.zeros((24, note_samples))

    #print(np.shape(chords_time_series))

    with open(note_filename, newline='\n') as csvfile:
        #chordreader = csv.reader(csvfile, delimiter=',')
        chordreader = csvfile.readlines()
        #print(chordreader)
        for idx, row in enumerate(chordreader):
            row = row.split(",")
            # First column is a timestamp; the remaining 24 values are chroma bins.
            chromagram_samples = np.array(row).astype(np.float64)[1:]
            chords_time_series[:, idx] = chromagram_samples
    chords_time_series = chords_time_series.T

    # if hparams.global_gain_scale > 0:
    #     wav *= hparams.global_gain_scale

    # Time domain preprocessing
    if hparams.preprocess is not None and hparams.preprocess not in [
            "", "none"
    ]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # wav = np.clip(wav, -1.0, 1.0)

    # Set waveform target (out)
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # zero pad
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.pad_lr(out, hparams.fft_size, audio.get_hop_size())
    if l > 0 or r > 0:
        out = np.pad(out, (l, r),
                     mode="constant",
                     constant_values=constant_values)
    N = chords_time_series.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    # Write the spectrograms to disk:
    name = splitext(basename(wav_path))[0]
    audio_filename = '%s-wave.npy' % (name)
    chords_filename = '%s-feats.npy' % (name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    # Conditioning features are saved as float32 (out_dtype applies to the waveform only).
    np.save(os.path.join(out_dir, chords_filename),
            chords_time_series.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, chords_filename, N, text)
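The chromagram loop in Example #3 parses the CSV by hand and assumes one leading timestamp column followed by 24 chroma bins per row. A hedged equivalent using the csv module is sketched below; the file name and column layout are assumptions carried over from the code above.

# Sketch: the same CSV-to-chroma parsing with the csv module (assumes a timestamp
# column followed by 24 chroma values per row, as the loop above implies).
import csv
import numpy as np

def load_chromagram(note_filename, n_bins=24):
    frames = []
    with open(note_filename, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            frames.append(np.asarray(row[1:1 + n_bins], dtype=np.float64))
    # Shape (num_frames, n_bins), matching chords_time_series.T above.
    return np.stack(frames)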