Example #1
class AudioOnlyDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)

        random.seed(123)
        random.shuffle(self.file_list)
        if train:  # deterministic 95/5 train/validation split
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        # Reconstruct audio for testing
        # filename = os.path.basename(self.file_list[idx])
        # plt.imsave('./reconstructed_audio/original_'+filename+'.png', mel)
        # plt.imsave('./reconstructed_audio/source_'+filename+'.png', source)
        # plt.imsave('./reconstructed_audio/target_'+filename+'.png', target)
        # self.melgen.save_audio('source_'+filename, wav)

        # source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        # target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        # reconstructed_mel_tensor = self.tierutil.interleave(source_tensor, target_tensor, self.tier)
        # reconstructed_mel = reconstructed_mel_tensor.numpy()
        # print('Shapes: [mel, source, target, reconstruction], [%s, %s, %s, %s]' % (
        #     mel.shape,
        #     source.shape,
        #     target.shape,
        #     reconstructed_mel.shape,
        #     ))
        # reconstructed_audio = self.melgen.reconstruct_audio(reconstructed_mel)
        # self.melgen.save_audio('reconstructed_'+filename, reconstructed_audio)

        return source, target
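
A minimal usage sketch, not part of the original example: the config path is borrowed from Example #2 below, and the args object is a hypothetical stand-in, since the dataset only reads args.tier. batch_size is kept at 1 because the mel tiers vary in length from file to file.

from types import SimpleNamespace
from torch.utils.data import DataLoader

hp = HParam('./config/blizzard_compressed_experiments.yaml')
args = SimpleNamespace(tier=1)  # hypothetical stand-in; only args.tier is read
trainset = AudioOnlyDataset(hp, args, train=True)
loader = DataLoader(trainset, batch_size=1, shuffle=True)  # batch_size > 1 would need a padding collate_fn
source, target = trainset[0]
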
Example #2
def deconstruct_audio(wav):
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  melgen = MelGen(hp)
  tierutil = TierUtil(hp)
  mel = melgen.get_normalized_mel(wav)
  tier_to_breakdown = {}
  for tier in range(1, 7):  # tiers 1-6 each yield a source/target split
    source, target = tierutil.cut_divide_tiers(mel, tier)
    print("Tier %d has source dims: %s, target dims %s" % (tier, source.shape, target.shape))
    tier_to_breakdown[tier] = (source, target)
  tier_to_breakdown[7] = (mel, mel)  # tier 7 stores the full mel on both sides
  return tier_to_breakdown
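
A hypothetical driver for deconstruct_audio: the input file name is illustrative, and the sample_rate keyword follows read_wav_np's use in Example #1.

hp = HParam('./config/blizzard_compressed_experiments.yaml')
wav = read_wav_np('example.wav', sample_rate=hp.audio.sr)  # 'example.wav' is a placeholder
breakdown = deconstruct_audio(wav)
source, target = breakdown[1]  # tier 1 source/target split
full_mel, _ = breakdown[7]     # tier 7 stores the full mel on both sides
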
Example #3
class AudioTextDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # build the (wav_path, text) list from the transcript file under hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'),
                  'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                wav_name, _, _, text, _ = line.split('|')
                wav_name = wav_name[2:-4] + '.wav'  # drop the 2-char directory prefix, keep the .wav extension

                wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
                wav = read_wav_np(wav_path)
                duration = len(wav) / hp.audio.sr
                if duration < hp.audio.duration:
                    self.dataset.append((wav_path, text))

                #if len(self.dataset) > 100: break

        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx][1]
        seq = text_to_sequence(text)

        wav = read_wav_np(self.dataset[idx][0])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return seq, source, target
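
Because __getitem__ returns a variable-length text sequence plus two mel tiers whose frame counts differ per clip, batching with a DataLoader typically needs a padding collate function. Below is a minimal sketch, assuming seq is a 1-D integer sequence and source/target are 2-D numpy arrays shaped (n_mels, frames); none of this is from the original example.

import torch

def pad_collate(batch):
    # batch is a list of (seq, source, target) tuples of varying lengths
    seqs, sources, targets = zip(*batch)
    seq_pad = torch.zeros(len(batch), max(len(s) for s in seqs), dtype=torch.long)
    src_pad = torch.zeros(len(batch), sources[0].shape[0], max(s.shape[1] for s in sources))
    tgt_pad = torch.zeros(len(batch), targets[0].shape[0], max(t.shape[1] for t in targets))
    for i, (s, src, tgt) in enumerate(batch):
        seq_pad[i, :len(s)] = torch.as_tensor(s, dtype=torch.long)
        src_pad[i, :, :src.shape[1]] = torch.from_numpy(src)
        tgt_pad[i, :, :tgt.shape[1]] = torch.from_numpy(tgt)
    return seq_pad, src_pad, tgt_pad
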
Example #4
class AudioOnlyDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = []
        # for i, f in enumerate(glob.glob(os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)):
        #     wav = read_wav_np(f)
        #     duration = len(wav) / hp.audio.sr
        #     if duration < hp.audio.duration:
        #         self.file_list.append(f)
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)

        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav = read_wav_np(self.file_list[idx])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return source, target
Example #5
class CompleteAudioTextDatasetv3(AudioTextDataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # build the (wav_path, parsed_sentence) list from the pre-split train/test CSVs
        self.root_dir = hp.data.path
        self.dataset = []

        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        # each row: original sentence, parsed sentence, wav path, wav length (seconds)
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            next(csv_reader)  # skip the header row
            for row in csv_reader:
                original_sentence, parsed_sentence, wav_path, wav_length = row
                # keep clips between 0.4 s and hp.audio.duration seconds long
                if 0.4 < float(wav_length) < hp.audio.duration:
                    self.dataset.append((wav_path, parsed_sentence))
        # deterministic shuffle via a local Random, leaving the global RNG untouched
        random.Random(123).shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __getitem__(self, idx):
        sentence = self.dataset[idx][1]
        seq = seq_to_array(sentence)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return seq, source, target
Example #6
class AudioTextDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # build the (wav_path, text) list for the configured dataset under hp.data.path
        self.root_dir = hp.data.path
        self.dataset = []
        if hp.data.name == 'KSS':
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                      'r') as f:
                lines = f.read().splitlines()
                for line in tqdm(lines):
                    wav_name, _, _, text, length, _ = line.split('|')

                    wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                    duration = float(length)
                    if duration < hp.audio.duration:
                        self.dataset.append((wav_path, text))

                # if len(self.dataset) > 100: break
        elif hp.data.name.startswith('Blizzard'):
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
                filenames = lines[::3]
                sentences = lines[1::3]
                for filename, sentence in tqdm(zip(filenames, sentences),
                                               total=len(filenames)):
                    wav_path = os.path.join(self.root_dir, 'wavn',
                                            filename + '.wav')
                    length = get_length(wav_path, hp.audio.sr)
                    if length < hp.audio.duration:
                        self.dataset.append((wav_path, sentence))

        else:
            raise NotImplementedError

        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx][1]
        if self.hp.data.name == 'KSS':
            seq = text_to_sequence(text)
        elif self.hp.data.name.startswith('Blizzard'):
            seq = process_blizzard(text)
        else:  # unreachable in practice: the constructor already rejects other dataset names
            raise NotImplementedError

        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # print(text)

        return seq, source, target
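
get_length, called in the Blizzard branch above, is not defined in any of these examples. A plausible implementation, assuming it returns the clip duration in seconds:

def get_length(wav_path, sr):
    # assumed behavior: read the clip and return its duration in seconds
    wav = read_wav_np(wav_path, sample_rate=sr)
    return len(wav) / sr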