def process_files(files, duration=2):
    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80

    counter = 0

    data_x = []
    data_y = []

    for file in files:
        counter += 1
        if counter % 1000 == 0:
            print("Counter: " + str(counter))

        try:
            wav_x = read_wav(file, sr, duration)
            spec_x, _ = wav2spec(wav_x, n_fft, win_length, hop_length, False)
            data_x.append(np.swapaxes(spec_x, 0, 1))

            wav_y = read_wav(file.replace('wav_1', 'wav_6'), sr, duration)
            spec_y, _ = wav2spec(wav_y, n_fft, win_length, hop_length, False)
            data_y.append(np.swapaxes(spec_y, 0, 1))

        except Exception as e:
            print("error processing {}: {}".format(file, e))

    np.save("H:/cs230/spec_{}sec/data_x".format(duration), data_x)
    np.save("H:/cs230/spec_{}sec/data_y".format(duration), data_y)
Example #2
def get_mfccs_and_spectrogram(wav_file,
                              trim=False,
                              random_crop=False,
                              isConverting=False):
    '''This is applied in `train2`, `test2` or `convert` phase.
    '''

    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)

    # Trim
    if trim:
        wav, _ = librosa.effects.trim(wav,
                                      frame_length=hp.default.win_length,
                                      hop_length=hp.default.hop_length)

    if random_crop:
        wav = wav_random_crop(wav, hp.default.sr, hp.default.duration)

    # Padding or crop if not Converting
    if isConverting is False:
        length = int(hp.default.sr * hp.default.duration)
        wav = librosa.util.fix_length(wav, length)

    return _get_mfcc_and_spec(wav, hp.default.preemphasis, hp.default.n_fft,
                              hp.default.win_length, hp.default.hop_length)
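A hedged usage sketch for get_mfccs_and_spectrogram; hp.default is this project's hyperparameter namespace and the file path is illustrative. The only point confirmed by the surrounding examples is that _get_mfcc_and_spec returns the MFCCs as its first element:

# Training/testing phase: pad or crop the wav to hp.default.duration seconds.
features = get_mfccs_and_spectrogram('sample.wav', trim=True, random_crop=True)
mfccs = features[0]  # MFCCs come first (cf. the get_mfccs_and_phones examples)

# Conversion phase: keep the full-length wav (no fixed-length padding).
features = get_mfccs_and_spectrogram('sample.wav', isConverting=True)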
Example #3
    def get_random_wav_and_label(self, tar_wavfiles, ntar_wavfiles):
        """

        :return: wav: raw wave. float32. shape=(t, ),
                 label: 1 if target, 0 otherwise. int32.
                 melspec: mel-spectrogram. float32. shape=(t, n_mels)
        """
        if np.random.sample(1) <= self.tar_ratio:
            wavfiles, label = tar_wavfiles, self.tar_labels
        else:
            wavfiles, label = ntar_wavfiles, self.ntar_labels
        wavfile = wavfiles[np.random.randint(0, len(wavfiles))]
        if isinstance(wavfile, bytes):
            wavfile = wavfile.decode()
        if wavfile.endswith('arr'):  # pyarrow format
            wav = read_wav_from_arr(wavfile)
        else:
            wav = read_wav(wavfile, sr=hp.signal.sr)
        wav = trim_wav(wav)

        wav = crop_random_wav(wav, self.length)
        wav = augment_volume(wav)
        wav = fix_length(wav, self.length)  # padding
        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        melspec = np.float32(melspec)
        label = np.float32(label)
        return wav, melspec, label
Example #4
    def _get_wav_and_melspec(wav_file, length, is_training=True):
        '''
        the range of values of wav is [-1, 1].
        '''

        wav = read_wav(wav_file, sr=hp.signal.sr)
        wav = trim_wav(wav)
        # Divide the wav into chunks of the given length; pick a random chunk
        # in training and the first chunk in generation.
        n_clips = math.ceil(len(wav) / length) if is_training else 1
        idx = random.randrange(n_clips)
        start, end = length * idx, length * (idx + 1)
        wav = wav[start:end]
        assert (len(wav) <= length)
        wav = fix_length(wav, length)  # padding in case of last chunk.

        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        wav = np.expand_dims(wav, -1)
        return wav, melspec.astype(np.float32)
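To illustrate the chunk selection in _get_wav_and_melspec: with an assumed hp.signal.sr of 16000 and a 2-second chunk length, a 5-second recording yields three candidate chunks, and the last (partial) one is zero-padded back to `length` by fix_length:

import math

sr, duration = 16000, 2                # assumed hyperparameter values
length = sr * duration                 # 32000 samples per chunk
wav_len = 5 * sr                       # a 5-second recording
n_clips = math.ceil(wav_len / length)  # ceil(80000 / 32000) = 3
assert n_clips == 3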
Example #5
def do_inference(num_tests, concurrency=1):
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    coord = _Coordinator(num_tests, concurrency)

    for _ in range(num_tests):
        # dummy audio
        duration, sr = 4, 16000
        n_fft, win_length, hop_length, n_mels = 512, 512, 128, 80
        max_db, min_db = 35, -55
        filename = librosa.util.example_audio_file()
        wav = read_wav(filename, sr=sr, duration=duration)
        mel = wav2melspec_db(wav, sr, n_fft, win_length, hop_length, n_mels)
        mel = normalize_db(mel, max_db=max_db, min_db=min_db)
        mel = mel.astype(np.float32)
        mel = np.expand_dims(mel, axis=0)  # single batch
        n_timesteps = sr // hop_length * duration + 1

        # build request
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'voice_vector'
        request.model_spec.signature_name = 'predict'
        request.inputs['x'].CopyFrom(
            tf.contrib.util.make_tensor_proto(mel,
                                              shape=[1, n_timesteps, n_mels]))

        coord.throttle()

        # send the request asynchronously (recommended)
        result_future = stub.Predict.future(request, 10.0)  # timeout
        result_future.add_done_callback(_create_rpc_callback(coord))

        # synchronous alternative (not recommended)
        # result = stub.Predict(request, 5.0)

    coord.wait_all_done()
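With the dummy settings above (sr = 16000, hop_length = 128, duration = 4 s, n_mels = 80), the n_timesteps used in the request works out to 16000 // 128 * 4 + 1 = 501, so the tensor proto sent to the server has shape (1, 501, 80):

n_timesteps = 16000 // 128 * 4 + 1
assert n_timesteps == 501  # request shape: (1, 501, 80)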
Example #6
    def _load_random_wav(self, speaker_id):
        wavfile = self.audio_meta.get_random_audio(speaker_id)
        wav = read_wav(wavfile, hp.signal.sr)
        # wav = trim_wav(wav)
        length = int(hp.signal.duration * hp.signal.sr)
        wav = crop_random_wav(wav, length=length)
        wav = fix_length(wav, length, mode='reflect')
        return wav  # raw wave, shape=(t, )
Example #7
def make_softer(audio_file):

    # play the .wav file
    audio.play(audio_file)

    # store the audio as a list of float samples
    samples = audio.read_wav(audio_file)
Example #8
def wav_to_spec_inverted(file):
    wav_x = read_wav(file, sr, duration)
    spec_x, _ = wav2spec(wav_x, n_fft, win_length, hop_length, False)

    spec_x_padding = np.array(spec_x[:, 0:300])
    spec_x_padding /= np.max(spec_x_padding)
    spec_x_padding.resize((257, 300))

    return np.swapaxes(spec_x_padding, 0, 1)
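The shape arithmetic behind wav_to_spec_inverted, assuming the module-level constants from the first example (sr = 22050, n_fft = 512, win_length = 400, hop_length = 80):

n_fft = 512
n_freq_bins = n_fft // 2 + 1   # 257 rows in the magnitude spectrogram
n_frames_kept = 300            # only the first 300 frames are retained
# After swapaxes the network input is therefore (time, freq) = (300, 257).
assert (n_frames_kept, n_freq_bins) == (300, 257)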
Example #9
def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
    '''This is applied in `train1` or `test1` phase.
    '''

    # Load
    wav = read_wav(wav_file, sr=16000)  # hp.sr

    mfccs, _, _ = _get_mfcc_and_spec(
        wav,
        0.97,  # hp.preemphasis
        512,   # hp.n_fft
        400,   # hp.win_length
        80)    # hp.hop_length

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    phn_file = wav_file.replace("WAV", "PHN").replace("wav", "PHN")
    phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps, ))
    bnd_list = []
    for line in open(phn_file, 'r').read().splitlines():
        start_point, _, phn = line.split()
        bnd = int(start_point) // 80  #hp.hop_length
        phns[bnd:] = phn2idx[phn]
        bnd_list.append(bnd)

    # Trim
    if trim:
        start, end = bnd_list[1], bnd_list[-1]
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert (len(mfccs) == len(phns))

    # Random crop
    n_timesteps = (3 * 16000) // 80 + 1  # (hp.duration * hp.sr) // hp.hop_length + 1
    if random_crop:
        start = np.random.choice(
            range(np.maximum(1, len(mfccs) - n_timesteps)), 1)[0]
        end = start + n_timesteps
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert (len(mfccs) == len(phns))

    # Padding or crop
    mfccs = librosa.util.fix_length(mfccs, n_timesteps, axis=0)
    phns = librosa.util.fix_length(phns, n_timesteps, axis=0)

    return mfccs, phns
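For reference, each line of the .PHN file is expected to carry three whitespace-separated fields (start sample, end sample, phone label), as in TIMIT annotations. A minimal sketch of the per-line parsing used above, with an illustrative line:

line = "0 3050 h#"                # illustrative TIMIT-style PHN line
start_point, _, phn = line.split()
bnd = int(start_point) // 80      # hop_length = 80 -> frame index 0
# phns[bnd:] is then set to phn2idx[phn], so later phones overwrite the tail.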
Example #10
def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
    '''This is applied in `train1` or `test1` phase.
    '''

    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)

    mfccs, _, _ = _get_mfcc_and_spec(wav, hp.default.preemphasis,
                                     hp.default.n_fft, hp.default.win_length,
                                     hp.default.hop_length)

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    # phn_file = wav_file.replace("WAV.wav", "PHN").replace("wav", "PHN")
    # phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps, ))
    bnd_list = []
    # for line in open(phn_file, 'r').read().splitlines():
    #     start_point, _, phn = line.split()
    #     bnd = int(start_point) // hp.default.hop_length
    #     phns[bnd:] = phn2idx[phn]
    #     bnd_list.append(bnd)

    # Trim
    # if trim:
    #     start, end = bnd_list[1], bnd_list[-1]
    #     mfccs = mfccs[start:end]
    #     phns = phns[start:end]
    #     assert (len(mfccs) == len(phns))

    # Random crop
    # +1 accounts for the final STFT frame
    n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1
    # if random_crop:
    #     start = np.random.choice(range(np.maximum(1, len(mfccs) - n_timesteps)), 1)[0]
    #     end = start + n_timesteps
    #     mfccs = mfccs[start:end]
    #     phns = phns[start:end]
    #     assert (len(mfccs) == len(phns))

    # Padding or crop
    mfccs = librosa.util.fix_length(mfccs, n_timesteps, axis=0)
    phns = librosa.util.fix_length(phns, n_timesteps, axis=0)

    return mfccs, phns
Example #11
    def get_random_wav(self, wavfile):
        """
        :param wavfile: path to a raw wave file.
        :return: wav: raw wave. float32. shape=(t, ),
                 melspec: mel-spectrogram. float32. shape=(t, n_mels),
                 wavfile: the raw wave file.
        """
        wav = read_wav(wavfile, sr=hp.signal.sr)
        wav = trim_wav(wav)
        wav = fix_length(wav, self.length)  # crop from the beginning.
        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        melspec = np.float32(melspec)
        return wav, melspec, wavfile
Example #12
    def _get_wav_and_melspec(wav_file, length=None, is_training=True):
        wav = read_wav(wav_file, sr=hp.signal.sr)
        wav = trim_wav(wav)
        if length:
            n_clips = math.ceil(len(wav) / length) if is_training else 1
            idx = random.randrange(n_clips)
            start, end = length * idx, length * (idx + 1)
            wav = wav[start:end]
            assert (len(wav) <= length)
            wav = fix_length(wav, length)  # padding

        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        wav = np.expand_dims(wav, -1)
        return wav, melspec.astype(np.float32)
Example #13
def get_mfccs_and_phones(wav_file, trim=False):
    '''This is applied in `train1` or `test1` phase.
    '''

    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)

    mfccs, _, _ = _get_mfcc_and_spec(wav, hp.default.preemphasis, hp.default.n_fft,
                                     hp.default.win_length,
                                     hp.default.hop_length)

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    phn_file = wav_file.replace("WAV", "PHN")
    phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps,))
    bnd_list = []
    for line in open(phn_file, 'r').read().splitlines():
        start_point, _, phn = line.split()
        bnd = int(start_point) // hp.default.hop_length
        phns[bnd:] = phn2idx[phn]
        bnd_list.append(bnd)

    # Trim
    if trim:
        start, end = bnd_list[1], bnd_list[-1]
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert (len(mfccs) == len(phns))

    n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1
    # Padding or crop
    mfccs = librosa.util.fix_length(mfccs, n_timesteps, axis=0)
    phns = librosa.util.fix_length(phns, n_timesteps, axis=0)

    return mfccs, phns
Example #14
def get_mfccs_and_phones(wav_file, trim=False, random_shuffle=False):
    '''This is applied in `train1` or `test1` phase.
    '''

    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)

    mfccs, _, _ = _get_mfcc_and_spec(wav, hp.default.preemphasis,
                                     hp.default.n_fft, hp.default.win_length,
                                     hp.default.hop_length)

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    phn_file = wav_file.replace("WAV", "PHN")
    phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps, ))
    bnd_list = []
    for line in open(phn_file, 'r').read().splitlines():
        start_point, _, phn = line.split()
        bnd = int(start_point) // hp.default.hop_length
        phns[bnd:] = phn2idx[phn]
        bnd_list.append(bnd)

    # Trim
    if trim:
        start, end = bnd_list[1], bnd_list[-1]
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert (len(mfccs) == len(phns))

    if random_shuffle:
        for i in range(len(bnd_list) - 1):
            start = bnd_list[i]
            end = bnd_list[i + 1] - 1
            np.random.shuffle(mfccs[start:end])

    return mfccs, phns
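The random_shuffle branch permutes MFCC frames only within each phone segment (each slice stops one frame short of the next boundary); a small self-contained illustration with a toy boundary list:

import numpy as np

mfccs = np.arange(10, dtype=np.float32).reshape(10, 1)  # 10 toy frames
bnd_list = [0, 4, 7, 10]                                 # toy phone boundaries
for i in range(len(bnd_list) - 1):
    start = bnd_list[i]
    end = bnd_list[i + 1] - 1
    np.random.shuffle(mfccs[start:end])  # shuffles frames 0-2, 4-5 and 7-8 in place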
Example #15
import numpy as np
import matplotlib.pyplot as plt

from audio import spec2wav, wav2spec, read_wav, write_wav


if __name__ == '__main__':

    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80
    duration = 2  # sec

    wav = read_wav("H:\\cs230\\wav_x\\1_1.wav", sr, duration)
    spec, _ = wav2spec(wav, n_fft, win_length, hop_length, False)

    converted_wav = spec2wav(spec, n_fft, win_length, hop_length, 600)

    write_wav(converted_wav, sr, 'a.wav')


    plt.pcolormesh(spec)
    plt.ylabel('Frequency')
    plt.xlabel('Time')
    plt.savefig("a.png")


Example #16

    ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(hp.logdir)

    pred_conf = PredictConfig(
        model=model,
        input_names=['x'],
        output_names=['embedding/embedding', 'prediction'],
        session_init=SaverRestore(ckpt) if ckpt else None)
    embedding_pred = OfflinePredictor(pred_conf)

    embedding, pred_speaker_id = embedding_pred(mel_spec)

    # get a random audio clip for each predicted speaker.
    wavfile_pred_speaker = np.array(
        [audio_meta_train.get_random_audio(s) for s in pred_speaker_id])
    length = int(hp.signal.duration * hp.signal.sr)
    wav_pred_speaker = np.array(
        [fix_length(read_wav(w, hp.signal.sr, duration=hp.signal.duration), length)
         for w in wavfile_pred_speaker])

    # write audio
    tf.summary.audio('wav', wav, hp.signal.sr, max_outputs=10)
    tf.summary.audio('wav_pred', wav_pred_speaker, hp.signal.sr, max_outputs=10)

    # write prediction
    speaker_name = [audio_meta.speaker_dict[sid] for sid in speaker_id]
    pred_speaker_name = [audio_meta_train.speaker_dict[sid] for sid in pred_speaker_id]

    meta = [tuple(audio_meta.meta_dict[sid][k] for k in audio_meta.target_meta_field())
            for sid in speaker_id] if hp.embed.meta_path else None
    pred_meta = [tuple(audio_meta_train.meta_dict[sid][k] for k in audio_meta_train.target_meta_field())
                 for sid in pred_speaker_id] if hp.train.meta_path else None
    prediction = ['{} ({}) -> {} ({})'.format(s, s_meta, p, p_meta)
                  for s, p, s_meta, p_meta in zip(speaker_name, pred_speaker_name, meta, pred_meta)]
    tf.summary.text('prediction', tf.convert_to_tensor(prediction))
Example #17
        output_names=['embedding/embedding', 'prediction'],
        session_init=SaverRestore(ckpt) if ckpt else None,
    )

    embedding_pred = OfflinePredictor(pred_conf)

    embedding, pred_speaker_id = embedding_pred(mel_spec)

    # get a random audio clip for each predicted speaker.
    wavfile_pred_speaker = np.array(
        [audio_meta_train.get_random_audio(s) for s in pred_speaker_id])
    length = int(hp.signal.duration * hp.signal.sr)
    wav_pred_speaker = np.array(
        [fix_length(read_wav(w, hp.signal.sr, duration=hp.signal.duration), length)
         for w in wavfile_pred_speaker])

    # write audio
    tf.summary.audio('wav', wav, hp.signal.sr, max_outputs=10)
    tf.summary.audio('wav_pred',
                     wav_pred_speaker,
                     hp.signal.sr,
                     max_outputs=10)

    # write prediction
    speaker_name = [audio_meta.speaker_dict[sid] for sid in speaker_id]
    pred_speaker_name = [
        audio_meta_train.speaker_dict[sid] for sid in pred_speaker_id
    ]