def my_getitem(self, idx):
    """Load a (mel, audio) training pair for the given index.

    Reads the wav at ``self.wav_list[idx]`` and the pre-computed mel
    spectrogram stored next to it (same path, ``.mel`` extension).
    During training a random segment of ``self.mel_segment_length``
    mel frames — and the matching audio span — is cut out.

    Returns:
        (mel, audio): mel is [num_mel, T]; audio is [1, segment_length]
        with a small amount of dither noise added.
    """
    wavpath = self.wav_list[idx]
    melpath = wavpath.replace('.wav', '.mel')
    sr, audio = read_wav_np(wavpath)

    # Zero-pad short clips so a full training segment can always be cut.
    min_len = self.hp.audio.segment_length + self.hp.audio.pad_short
    if len(audio) < min_len:
        audio = np.pad(audio, (0, min_len - len(audio)),
                       mode='constant', constant_values=0.0)

    audio = torch.from_numpy(audio).unsqueeze(0)
    mel = torch.load(melpath).squeeze(0)  # [num_mel, T]

    if self.train:
        # Clamp at 0: a stored mel may still be shorter than the requested
        # segment, and random.randint(0, negative) raises ValueError.
        max_mel_start = max(0, mel.size(1) - self.mel_segment_length)
        mel_start = random.randint(0, max_mel_start)
        mel_end = mel_start + self.mel_segment_length
        mel = mel[:, mel_start:mel_end]

        audio_start = mel_start * self.hp.audio.hop_length
        audio = audio[:, audio_start:audio_start +
                      self.hp.audio.segment_length]

    # Dither with ~1 LSB (16-bit) of Gaussian noise.
    audio = audio + (1 / 32768) * torch.randn_like(audio)
    return mel, audio
Esempio n. 2
0
    def my_getitem(self, idx):
        """Load a (mel, audio) training pair for the given index.

        The mel is read from ``hp.data.mel_path/<basename>.npy``; the
        wav is zero-padded when shorter than a full segment. During
        training a random mel segment and the matching audio span are
        cut out.

        Returns:
            (mel, audio): mel is [num_mel, T]; audio is a 1-D tensor
            with a small amount of dither noise added.
        """
        wavpath = self.wav_list[idx]
        # Renamed from `id` to avoid shadowing the builtin.
        file_id = os.path.basename(wavpath).split(".")[0]

        # os.path.join (not '{}/{}') keeps the path portable.
        mel_path = os.path.join(self.hp.data.mel_path, file_id + ".npy")
        sr, audio = read_wav_np(wavpath)

        # Zero-pad short clips so a full training segment can always be cut.
        min_len = self.hp.audio.segment_length + self.hp.audio.pad_short
        if len(audio) < min_len:
            audio = np.pad(audio, (0, min_len - len(audio)),
                           mode='constant', constant_values=0.0)

        audio = torch.from_numpy(audio).unsqueeze(0)
        mel = torch.from_numpy(np.load(mel_path))  # [num_mel, T]

        if self.train:
            # Clamp at 0: a stored mel may be shorter than the requested
            # segment, and random.randint(0, negative) raises ValueError.
            max_mel_start = max(0, mel.size(1) - self.mel_segment_length)
            mel_start = random.randint(0, max_mel_start)
            mel_end = mel_start + self.mel_segment_length
            mel = mel[:, mel_start:mel_end]

            audio_start = mel_start * self.hp.audio.hop_length
            audio = audio[:, audio_start:audio_start +
                          self.hp.audio.segment_length]

        # Dither with ~1 LSB (16-bit) of Gaussian noise.
        audio = audio + (1 / 32768) * torch.randn_like(audio)
        audio = audio.squeeze(0).squeeze(0)
        return mel, audio
Esempio n. 3
0
def load_testset():
  """Yield (filename, sentence, wav) triples for the held-out 5% split.

  Reads prompts.gui (3 lines per entry: filename, sentence, extra),
  shuffles with a fixed seed for a reproducible split, keeps the last
  5%, and yields only utterances shorter than hp.audio.duration.
  """
  hp = HParam('./config/blizzard_compressed_experiments.yaml')

  with open(os.path.join(hp.data.path, 'prompts.gui'), 'r') as f:
    lines = f.read().splitlines()
  raw_data = list(zip(lines[::3], lines[1::3]))

  # Fixed seed keeps the validation split stable across runs.
  random.seed(123)
  random.shuffle(raw_data)
  raw_data = raw_data[int(0.95 * len(raw_data)):]

  dataset = []
  for filename, sentence in tqdm(raw_data, total=len(raw_data)):
      wav_path = os.path.join(hp.data.path, 'wavn', filename + '.wav')
      if get_length(wav_path, hp.audio.sr) < hp.audio.duration:
          dataset.append((wav_path, sentence))

  for wav_path, sentence in dataset:
    wav = read_wav_np(wav_path, sample_rate=hp.audio.sr)
    yield os.path.basename(wav_path), sentence, wav
def main(hp, args):
    """Precompute mel spectrograms for every wav under args.data_path.

    Each mel is saved as <hp.data.mel_path>/<basename>.npy with shape
    [num_mel, T]. Clips shorter than a training segment are zero-padded.
    """
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'),
                          recursive=True)
    mel_path = hp.data.mel_path
    os.makedirs(mel_path, exist_ok=True)

    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad short clips so downstream segment cutting never underruns.
        min_len = hp.audio.segment_length + hp.audio.pad_short
        if len(wav) < min_len:
            wav = np.pad(wav, (0, min_len - len(wav)),
                         mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav)  # mel [1, num_mel, T]
        mel = mel.squeeze(0)  # [num_mel, T]
        # os.path.join (not '{}/{}') for portability; `file_id` avoids
        # shadowing the builtin `id`.
        file_id = os.path.basename(wavpath).split(".")[0]
        np.save(os.path.join(mel_path, file_id + '.npy'),
                mel.numpy(),
                allow_pickle=False)
Esempio n. 5
0
    def __getitem__(self, idx):
        """Return the (source, target) tier pair for the idx-th file."""
        waveform = read_wav_np(self.file_list[idx])
        waveform = cut_wav(self.wavlen, waveform)
        normalized = self.melgen.get_normalized_mel(waveform)
        return self.tierutil.cut_divide_tiers(normalized, self.tier)
Esempio n. 6
0
    def __getitem__(self, idx):
        """Return the (source, target) tier pair for the idx-th file.

        Loads the wav at the configured sample rate, computes its
        normalized mel spectrogram, and splits it into the source and
        target tiers for this dataset's tier index.

        (The large block of commented-out reconstruction/debug code was
        dead and has been removed.)
        """
        wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return source, target
Esempio n. 7
0
 def __getitem__(self, idx):
     """Return (seq, source, target) for the idx-th dataset entry."""
     wav_path, sentence = self.dataset[idx][0], self.dataset[idx][1]
     seq = seq_to_array(sentence)
     audio = read_wav_np(wav_path, sample_rate=self.hp.audio.sr)
     normalized = self.melgen.get_normalized_mel(audio)
     source, target = self.tierutil.cut_divide_tiers(normalized, self.tier)
     return seq, source, target
Esempio n. 8
0
def main(hp, args):
    """Precompute a .mel tensor next to every wav under args.data_path.

    Unreadable wav files are skipped; short clips are zero-padded so a
    full training segment can always be cut downstream.
    """
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)

    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        # Skip unreadable files, but don't swallow KeyboardInterrupt /
        # SystemExit the way the original bare `except:` did.
        try:
            sr, wav = read_wav_np(wavpath)
        except Exception:
            continue

        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad short clips so downstream segment cutting never underruns.
        min_len = hp.audio.segment_length + hp.audio.pad_short
        if len(wav) < min_len:
            wav = np.pad(wav, (0, min_len - len(wav)),
                         mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav)

        melpath = wavpath.replace('.wav', '.mel')
        torch.save(mel, melpath)
Esempio n. 9
0
def get_audio():
    """Yield (filename, wav) for the held-out 5% of data files.

    Shuffles with a fixed seed so the validation split is reproducible,
    then keeps the last 5% of the file list.
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    file_list = glob.glob(os.path.join(hp.data.path, '**', hp.data.extension),
                          recursive=True)
    random.seed(123)
    random.shuffle(file_list)
    file_list = file_list[int(0.95 * len(file_list)):]
    # Iterate the list directly instead of indexing via range(len(...)).
    for path in file_list:
        wav = read_wav_np(path, sample_rate=hp.audio.sr)
        yield os.path.basename(path), wav
Esempio n. 10
0
    def __getitem__(self, idx):
        """Return (seq, source, target) for the idx-th dataset entry.

        Raises:
            ValueError: if hp.data.name is neither 'KSS' nor 'Blizzard'
                (previously this fell through and raised a confusing
                NameError on `seq` below).
        """
        text = self.dataset[idx][1]
        if self.hp.data.name == 'KSS':
            seq = text_to_sequence(text)
        elif self.hp.data.name == 'Blizzard':
            seq = process_blizzard(text)
        else:
            raise ValueError('unsupported dataset name: %s' % self.hp.data.name)

        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return seq, source, target
Esempio n. 11
0
def main(hp, args):
    """Precompute mel spectrograms with an AudioProcessor for every wav
    under args.data_path, saving each as <hp.data.mel_path>/<basename>.npy
    with shape [num_mel, T].
    """
    ap = AudioProcessor(sample_rate=22050,
                        num_mels=80,
                        min_level_db=-100,
                        frame_shift_ms=None,
                        frame_length_ms=None,
                        hop_length=256,
                        win_length=1024,
                        ref_level_db=20,
                        fft_size=1024,
                        power=1.5,
                        preemphasis=0.98,
                        signal_norm=True,
                        symmetric_norm=True,
                        max_norm=4.0,
                        mel_fmin=0.0,
                        mel_fmax=8000.0,
                        spec_gain=20.0,
                        stft_pad_mode="reflect",
                        clip_norm=True,
                        griffin_lim_iters=60,
                        do_trim_silence=False,
                        trim_db=60)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'),
                          recursive=True)
    mel_path = hp.data.mel_path

    # Create all folders
    os.makedirs(mel_path, exist_ok=True)
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad short clips so downstream segment cutting never underruns.
        min_len = hp.audio.segment_length + hp.audio.pad_short
        if len(wav) < min_len:
            wav = np.pad(wav, (0, min_len - len(wav)),
                         mode='constant', constant_values=0.0)

        # The original round-tripped wav through torch (unsqueeze/squeeze)
        # and mel through unsqueeze(0)/squeeze(0) — both were no-ops.
        mel = torch.from_numpy(np.float32(ap.melspectrogram(wav)))  # [num_mel, T]
        # `file_id` avoids shadowing the builtin `id`.
        file_id = os.path.basename(wavpath).split(".")[0]
        np.save(os.path.join(mel_path, file_id + '.npy'),
                mel.numpy(),
                allow_pickle=False)
Esempio n. 12
0
def main(hp, args):
    """Precompute .mel tensors for all wavs, splitting the (shuffled)
    first 300 files into a validation folder and the rest into a
    training folder; the source wav is copied alongside each mel.
    """
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'),
                          recursive=True)

    save_train_mel_path = 'melgan_train_mel_data'
    save_val_mel_path = 'melgan_val_mel_data'
    os.makedirs(save_train_mel_path, exist_ok=True)
    os.makedirs(save_val_mel_path, exist_ok=True)

    random.shuffle(wav_files)

    # enumerate replaces the manual `count` variable.
    for count, wavpath in enumerate(
            tqdm.tqdm(wav_files, desc='preprocess wav to mel')):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad short clips so downstream segment cutting never underruns.
        min_len = hp.audio.segment_length + hp.audio.pad_short
        if len(wav) < min_len:
            wav = np.pad(wav, (0, min_len - len(wav)),
                         mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav)

        # os.path.basename replaces the brittle split('/')[5], which
        # assumed a fixed directory depth and POSIX separators.
        wav_name = os.path.basename(wavpath)
        mel_name = os.path.basename(wavpath.replace('.wav', '.mel'))

        dest = save_val_mel_path if count < 300 else save_train_mel_path
        torch.save(mel, os.path.join(dest, mel_name))
        shutil.copy(wavpath, os.path.join(dest, wav_name))
Esempio n. 13
0
    def my_getitem(self, idx):
        """Load a (mel, audio) pair; cut a random aligned segment when training.

        NOTE(review): unlike the sibling loaders, this variant does not
        zero-pad short clips, so a cut audio span can come out shorter
        than hp.audio.segment_length — confirm the collate fn handles it.
        """
        wavpath = self.wav_list[idx]
        melpath = wavpath.replace('.wav', '.mel')
        sr, audio = read_wav_np(wavpath)
        audio = torch.from_numpy(audio).unsqueeze(0)
        mel = torch.load(melpath).squeeze(0)  # [num_mel, T]

        if self.train:
            # Clamp at 0: a mel shorter than the segment would make
            # random.randint(0, negative) raise ValueError.
            max_mel_start = max(0, mel.size(1) - self.mel_segment_length)
            mel_start = random.randint(0, max_mel_start)
            mel_end = mel_start + self.mel_segment_length
            mel = mel[:, mel_start:mel_end]

            audio_start = mel_start * self.hp.audio.hop_length
            audio = audio[:, audio_start:audio_start +
                          self.hp.audio.segment_length]

        # Dither with ~1 LSB (16-bit) of Gaussian noise.
        audio = audio + (1 / 32768) * torch.randn_like(audio)
        return mel, audio
Esempio n. 14
0
def main(hp, args):
    """Convert every wav under args.data_path to a .mel tensor saved
    next to it (same path, ``.mel`` extension)."""
    stft = TacotronSTFT(
        filter_length=hp.audio.filter_length,
        hop_length=hp.audio.hop_length,
        win_length=hp.audio.win_length,
        n_mel_channels=hp.audio.n_mel_channels,
        sampling_rate=hp.audio.sampling_rate,
        mel_fmin=hp.audio.mel_fmin,
        mel_fmax=hp.audio.mel_fmax,
    )

    pattern = os.path.join(args.data_path, '**', '*.wav')
    for wavpath in tqdm.tqdm(glob.glob(pattern, recursive=True),
                             desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        mel = stft.mel_spectrogram(torch.from_numpy(wav).unsqueeze(0))
        torch.save(mel, wavpath.replace('.wav', '.mel'))
Esempio n. 15
0
def main(args, hp):
    """Precompute mel spectrograms for every wav under args.data_path.

    This TacotronSTFT variant also returns magnitudes; only the mel
    ([num_mel, T]) is saved, as <hp.data_path>/<basename>.npy.
    """
    stft = TacotronSTFT(filter_length=hp.filter_length,
                        hop_length=hp.hop_length,
                        win_length=hp.win_length,
                        n_mel_channels=hp.n_mel_channels,
                        sampling_rate=hp.sampling_rate,
                        mel_fmin=hp.mel_fmin,
                        mel_fmax=hp.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'),
                          recursive=True)
    mel_path = hp.data_path
    os.makedirs(mel_path, exist_ok=True)
    print("Sample Rate : ", hp.sampling_rate)
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath, hp.sampling_rate)
        wav = torch.from_numpy(wav).unsqueeze(0)
        mel, mag = stft.mel_spectrogram(
            wav)  # mel [1, 80, T]  mag [1, num_mag, T]
        mel = mel.squeeze(0)  # [num_mel, T]
        # os.path.join (not '{}/{}') for portability; `file_id` avoids
        # shadowing the builtin `id`.
        file_id = os.path.basename(wavpath).split(".")[0]
        np.save(os.path.join(mel_path, file_id + '.npy'),
                mel.numpy(),
                allow_pickle=False)
Esempio n. 16
0
def main(args):
    """Precompute per-utterance features for FastSpeech-style training.

    For each wav under args.wav_root_path, saves mel, frame energy and
    pitch arrays; then parses an alignment file to save text/phoneme,
    duration and symbol-sequence arrays for matching utterances.
    Relies on module-level `hp`, `TacotronSTFT`, `read_wav_np`,
    `pitch` and `phonemes_to_sequence`.
    """
    stft = TacotronSTFT(filter_length=hp.n_fft,
                        hop_length=hp.hop_length,
                        win_length=hp.win_length,
                        n_mel_channels=hp.n_mels,
                        sampling_rate=hp.sampling_rate,
                        mel_fmin=hp.fmin,
                        mel_fmax=hp.fmax)
    # wav file locations (recursive)
    wav_files = glob.glob(os.path.join(args.wav_root_path, '**', '*.wav'),
                          recursive=True)

    # Define all the paths corresponding to each feature
    text_path = os.path.join(hp.data_path, 'text')
    mel_path = os.path.join(hp.data_path, 'mels')
    duration_path = os.path.join(hp.data_path, 'alignment')
    energy_path = os.path.join(hp.data_path, 'energy')
    pitch_path = os.path.join(hp.data_path, 'pitch')
    symbol_path = os.path.join(hp.data_path, 'symbol')

    # create directories if they don't exist
    os.makedirs(text_path, exist_ok=True)
    os.makedirs(duration_path, exist_ok=True)
    os.makedirs(mel_path, exist_ok=True)
    os.makedirs(energy_path, exist_ok=True)
    os.makedirs(pitch_path, exist_ok=True)
    os.makedirs(symbol_path, exist_ok=True)

    for wavpath in tqdm.tqdm(wav_files,
                             desc='preprocess wav to mel, energy, and pitch'):
        sr, wav = read_wav_np(wavpath)
        p = pitch(wav)  # [T, ] T = Number of frames
        wav = torch.from_numpy(wav).unsqueeze(0)
        mel, mag = stft.mel_spectrogram(
            wav)  # mel [1, 80, T]  mag [1, num_mag, T]
        mel = mel.squeeze(0)  # [num_mel, T]
        mag = mag.squeeze(0)  # [num_mag, T]
        # Frame energy = L2 norm of the magnitude spectrum per frame.
        e = torch.norm(mag, dim=0)  # [T, ]
        # Truncate pitch to the number of mel frames before saving.
        p = p[:mel.shape[1]]
        p = np.array(p, dtype='float32')
        id = os.path.basename(wavpath).split(".")[0]

        # save the features
        np.save('{}/{}.npy'.format(mel_path, id),
                mel.numpy(),
                allow_pickle=False)
        np.save('{}/{}.npy'.format(energy_path, id),
                e.numpy(),
                allow_pickle=False)
        np.save('{}/{}.npy'.format(pitch_path, id), p, allow_pickle=False)

    # Alignment file: pipe-separated fields — indexing below implies
    # text|?|durations|phonemes|wav-filename; confirm against the file format.
    with open(hp.filelist_alignment_dir + "alignment.txt",
              encoding='utf-8') as f:  # add all examples to the filelist
        for lines in f:
            content = lines.split('|')
            id = content[4].split()[0].split('.')[0]
            # Only keep entries whose wav actually exists on disk.
            if os.path.exists(os.path.join(args.wav_root_path, id + '.wav')):
                text = content[0]
                duration = content[2]
                duration = duration.split()
                dur = np.array(duration, dtype='float32')
                phoneme = content[3]
                symbol_sequence = phonemes_to_sequence(phoneme)

                np.save(
                    '{}/{}.npy'.format(text_path, id), (text, phoneme),
                    allow_pickle=False)  # NOTE(review): unclear whether text or phoneme is the model input — confirm
                np.save('{}/{}.npy'.format(duration_path, id),
                        dur,
                        allow_pickle=False)
                np.save('{}/{}.npy'.format(symbol_path, id),
                        symbol_sequence,
                        allow_pickle=False)
Esempio n. 17
0
    # NOTE(review): fragment — the enclosing function's `def` lies outside
    # this view. Renders the generated and target mels to numpy images and
    # writes them side by side into validation_tests/ for visual comparison.
    spectrogram1 = plot_spectrogram_to_numpy(final_mel)
    plt.imsave(os.path.join('validation_tests', filename + '_generated.png'),
               spectrogram1.transpose((1, 2, 0)))
    spectrogram2 = plot_spectrogram_to_numpy(target_mel)
    plt.imsave(os.path.join('validation_tests', filename + '_target.png'),
               spectrogram2.transpose((1, 2, 0)))


# breakdown = None
# audio_files = get_audio()
# for filename, wav in audio_files:
#   breakdown = deconstruct_audio(wav)
#   reconstruct_audio(filename, breakdown)
#   hp = HParam('./config/blizzard_alldata_v5.yaml')
#   melgen = MelGen(hp)
#   melgen.save_audio('original_'+filename, wav)
#   print('')
#   print('')
#   break

# First deconstruct the wav file.
# NOTE(review): WAV_FILE, SENTENCE, timestep, deconstruct_audio and
# run_inference are defined elsewhere in this file/module — not visible here.
filename = os.path.basename(WAV_FILE)
wav = read_wav_np(WAV_FILE, sample_rate=22050)
tier_to_breakdown = deconstruct_audio(wav)
# Now run inference
final_mel = run_inference(SENTENCE, timestep, tier_to_breakdown)

# target_mel = tier_to_breakdown[7][0]
# save_images(final_mel, target_mel, filename)
# save_audio('generated_'+filename, final_mel)
# save_audio('target_'+filename, target_mel)