Example #1
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        self.file_list = []
        self.root_dir = hp.data.path
        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            headers = next(csv_reader)
            for row in csv_reader:
                [original_sentence, parsed_sentence, wav_path,
                 wav_length] = row
                if 0.4 < float(wav_length) < hp.audio.duration:
                    self.file_list.append(wav_path)
        # Shuffle with a fixed seed so the data always comes in the same order
        random.Random(123).shuffle(self.file_list)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #2
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)

        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
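Note: the seed-then-shuffle pattern above is what makes the 95/5 train/test split reproducible across runs and processes. A minimal standalone sketch of the same logic (split_file_list is a hypothetical helper, not part of the project):

import random

def split_file_list(file_list, train, ratio=0.95, seed=123):
    # Seeding a private Random instance fixes the permutation, so every
    # caller derives the same disjoint train/test partition.
    files = sorted(file_list)  # start from a stable order
    random.Random(seed).shuffle(files)
    cut = int(ratio * len(files))
    return files[:cut] if train else files[cut:]

assert not set(split_file_list(['a.wav', 'b.wav', 'c.wav'], True)) & \
       set(split_file_list(['a.wav', 'b.wav', 'c.wav'], False))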
Example #3
    def __init__(self, hp, args, infer_hp):
        super(MelNet, self).__init__()
        self.hp = hp
        self.args = args
        self.infer_hp = infer_hp
        self.f_div = f_div[hp.model.tier + 1]
        self.t_div = t_div[hp.model.tier]
        self.n_mels = hp.audio.n_mels

        self.tierutil = TierUtil(hp)

        if infer_hp.conditional:
            self.tiers = [
                TTS(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[1],
                    layers=hp.model.layers[0])
            ] + [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(2, hp.model.tier + 1)
            ]
        else:
            self.tiers = [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(1, hp.model.tier + 1)
            ]
        self.tiers = nn.ModuleList(
            [None] + [nn.DataParallel(tier).cuda() for tier in self.tiers])
Example #4
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = []
        # for i, f in enumerate(glob.glob(os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)):
        #     wav = read_wav_np(f)
        #     duration = (len(wav)/hp.audio.sr)
        #     if duration < hp.audio.duration:
        #         self.file_list.append(f)
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)

        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #5
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # load (wav_path, parsed_sentence) pairs from the Blizzard CSV split
        self.root_dir = hp.data.path
        self.dataset = []

        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        # open file in read mode
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            headers = next(csv_reader)
            for row in csv_reader:
                [original_sentence, parsed_sentence, wav_path,
                 wav_length] = row
                if 0.4 < float(wav_length) < hp.audio.duration:
                    self.dataset.append((wav_path, parsed_sentence))
        random.Random(123).shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #6
def reconstruct_audio(filename, tier_to_breakdown):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None

    # Verify that tier 2 is conditionally generated from just tier 1
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]).all(), \
        "Tier 2 not created from Tier 1"

    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]

        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor,
                                                       target_tensor, tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]

        # Verify that interleaving the source and target of the current tier conditionally generates the source of the next tier
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(
            ), "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            final_reconstruction = reconstructed_mel
    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
Example #7
def deconstruct_audio(wav):
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  melgen = MelGen(hp)
  tierutil = TierUtil(hp)
  mel = melgen.get_normalized_mel(wav)
  tier_to_breakdown = {}
  for tier in range(1, 7):
    source, target = tierutil.cut_divide_tiers(mel, tier)
    print("Tier %d has source dims: %s, target dims %s" % (tier, source.shape, target.shape))
    tier_to_breakdown[tier] = (source, target)
  tier_to_breakdown[7] = (mel, mel)
  return tier_to_breakdown
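Note: Examples #6 and #7 are inverses: deconstruct_audio cuts a normalized mel into per-tier (source, target) pairs, and reconstruct_audio interleaves them back, asserting consistency between adjacent tiers. A hedged round-trip sketch, assuming read_wav_np and the YAML config used in the snippets; the input path is illustrative:

hp = HParam('./config/blizzard_compressed_experiments.yaml')
wav = read_wav_np('sample.wav', sample_rate=hp.audio.sr)

breakdown = deconstruct_audio(wav)          # {tier: (source, target)}
reconstruct_audio('sample.wav', breakdown)  # saves audio under a 'reconstructed_' prefix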
Example #8
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # parse the dataset transcript to collect (wav_path, text) pairs
        self.root_dir = hp.data.path
        self.dataset = []
        if hp.data.name == 'KSS':
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                      'r') as f:
                lines = f.read().splitlines()
                for line in tqdm(lines):
                    wav_name, _, _, text, length, _ = line.split('|')

                    wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                    duration = float(length)
                    if duration < hp.audio.duration:
                        self.dataset.append((wav_path, text))

                # if len(self.dataset) > 100: break
        elif hp.data.name.startswith('Blizzard'):
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
                filenames = lines[::3]
                sentences = lines[1::3]
                for filename, sentence in tqdm(zip(filenames, sentences),
                                               total=len(filenames)):
                    wav_path = os.path.join(self.root_dir, 'wavn',
                                            filename + '.wav')
                    length = get_length(wav_path, hp.audio.sr)
                    if length < hp.audio.duration:
                        self.dataset.append((wav_path, sentence))

        else:
            raise NotImplementedError

        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #9
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # load (wav_path, sentence) pairs from the preprocessed prompts file
        self.root_dir = hp.data.path
        self.dataset = []

        txt_path = os.path.join(
            self.root_dir, 'complete_blizzard/train_prompts.gui'
            if train else 'complete_blizzard/test_prompts.gui')
        # txt_file_list = glob.glob(
        #     os.path.join(txt_path, '**', '*.txt'),
        #     recursive=True
        # )
        # for txt_filepath in tqdm(txt_file_list, total=len(txt_file_list)):
        #     wav_filepath = txt_filepath.replace('_txt', '_wav').replace('.txt', '.wav')
        #     f = open(txt_filepath, "r")
        #     sentence = f.read().strip()
        #     f.close()
        #     # Skip the length filtering below because we already filtered the dataset
        #     length = get_length(wav_filepath, hp.audio.sr)
        #     if length < hp.audio.duration and length > 0.56 and len(sentence) > 5:
        #         self.dataset.append((wav_filepath, sentence))
        with open(txt_path, 'r') as f:
            lines = f.read().splitlines()
            wav_paths = lines[::2]
            sentences = lines[1::2]
            for wav_path, sentence in tqdm(
                    zip(wav_paths, sentences),
                    desc='Audio/text data loader for %s' % txt_path,
                    total=len(wav_paths)):
                # Skip the length filtering below because we already filtered the dataset
                # length = get_length(wav_path, hp.audio.sr)
                # if length < hp.audio.duration:
                self.dataset.append((wav_path, sentence))

        random.seed(123)
        random.shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #10
class AudioOnlyDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)

        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        # # Reconstruct audio for testing
        # filename = os.path.basename(self.file_list[idx])
        # plt.imsave('./reconstructed_audio/original_'+filename+'.png', mel)
        # plt.imsave('./reconstructed_audio/source_'+filename+'.png', source)
        # plt.imsave('./reconstructed_audio/target_'+filename+'.png', target)
        # self.melgen.save_audio('source_'+filename, wav)

        # source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        # target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        # reconstructed_mel_tensor = self.tierutil.interleave(source_tensor, target_tensor, self.tier)
        # reconstructed_mel = reconstructed_mel_tensor.numpy()
        # print('Shapes: [mel, source, target, reconstruction], [%s, %s, %s, %s]' % (
        #     mel.shape,
        #     source.shape,
        #     target.shape,
        #     reconstructed_mel.shape,
        #     ))
        # reconstructed_audio = self.melgen.reconstruct_audio(reconstructed_mel)
        # self.melgen.save_audio('reconstructed_'+filename, reconstructed_audio)

        return source, target
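Note: a typical way to wire this dataset into training, assuming the usual hp/args objects from this project. Because the cut_wav call is commented out above, clips keep their natural lengths, so batch_size > 1 would need equal-length clips or a custom collate_fn:

from torch.utils.data import DataLoader

trainset = AudioOnlyDataset(hp, args, train=True)
loader = DataLoader(trainset, batch_size=1, shuffle=True, num_workers=4)

for source, target in loader:
    pass  # feed the (source, target) spectrogram halves to the tier being trained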
Example #11
class AudioTextDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # parse the transcript to collect (wav_path, text) pairs
        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'),
                  'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                wav_name, _, _, text, _ = line.split('|')
                wav_name = wav_name[2:-4] + '.wav'

                wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
                wav = read_wav_np(wav_path)
                duration = len(wav) / hp.audio.sr
                if duration < hp.audio.duration:
                    self.dataset.append((wav_path, text))

                #if len(self.dataset) > 100: break

        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx][1]
        seq = text_to_sequence(text)

        wav = read_wav_np(self.dataset[idx][0])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return seq, source, target
Example #12
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
        self.file_list = []

        # if train:
        #     self.file_list = glob.glob(
        #         os.path.join(hp.data.path, 'complete_blizzard/train_wav', '**', hp.data.extension),
        #         recursive=True
        #     )
        # else:
        #     self.file_list = glob.glob(
        #         os.path.join(hp.data.path, 'complete_blizzard/test_wav', '**', hp.data.extension),
        #         recursive=True
        #     )
        txt_path = ('datasets/complete_blizzard/train_prompts.gui' if train
                    else 'datasets/complete_blizzard/test_prompts.gui')
        with open(txt_path, 'r') as f:
            lines = f.read().splitlines()
            wav_paths = lines[::2]
            for wav_path in tqdm(wav_paths,
                                 desc='Audio data loader',
                                 total=len(wav_paths)):
                # Skip the length filtering below because we already filtered the dataset
                # length = get_length(wav_path, hp.audio.sr)
                # if length < hp.audio.duration:
                self.file_list.append(wav_path)

        # Shuffle with a fixed seed so the data always comes in the same order
        random.seed(123)
        random.shuffle(self.file_list)

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #13
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # parse the transcript to collect (wav_path, text) pairs
        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'),
                  'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                wav_name, _, _, text, _ = line.split('|')
                wav_name = wav_name[2:-4] + '.wav'

                wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
                wav = read_wav_np(wav_path)
                duration = len(wav) / hp.audio.sr
                if duration < hp.audio.duration:
                    self.dataset.append((wav_path, text))

                #if len(self.dataset) > 100: break

        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)
Example #14
class AudioOnlyDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # this will search all files within hp.data.path
        self.file_list = []
        # for i, f in enumerate(glob.glob(os.path.join(hp.data.path, '**', hp.data.extension), recursive=True)):
        #     wav = read_wav_np(f)
        #     duration = (len(wav)/hp.audio.sr)
        #     if duration < hp.audio.duration:
        #         self.file_list.append(f)
        self.file_list = glob.glob(os.path.join(hp.data.path, '**',
                                                hp.data.extension),
                                   recursive=True)

        random.seed(123)
        random.shuffle(self.file_list)
        if train:
            self.file_list = self.file_list[:int(0.95 * len(self.file_list))]
        else:
            self.file_list = self.file_list[int(0.95 * len(self.file_list)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        wav = read_wav_np(self.file_list[idx])
        wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return source, target
Example #15
class CompleteAudioTextDatasetv3(AudioTextDataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # load (wav_path, parsed_sentence) pairs from the Blizzard CSV split
        self.root_dir = hp.data.path
        self.dataset = []

        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        # open file in read mode
        with open(txt_path, 'r') as read_obj:
            csv_reader = csv.reader(read_obj)
            headers = next(csv_reader)
            for row in csv_reader:
                [original_sentence, parsed_sentence, wav_path,
                 wav_length] = row
                if 0.4 < float(wav_length) < hp.audio.duration:
                    self.dataset.append((wav_path, parsed_sentence))
        random.Random(123).shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __getitem__(self, idx):
        sentence = self.dataset[idx][1]
        seq = seq_to_array(sentence)
        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        return seq, source, target
Example #16
class MelNet(nn.Module):
    def __init__(self, hp, args, infer_hp):
        super(MelNet, self).__init__()
        self.hp = hp
        self.args = args
        self.infer_hp = infer_hp
        self.f_div = f_div[hp.model.tier + 1]
        self.t_div = t_div[hp.model.tier]
        self.n_mels = hp.audio.n_mels

        self.tierutil = TierUtil(hp)

        if infer_hp.conditional:
            self.tiers = [
                TTS(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[1],
                    layers=hp.model.layers[0])
            ] + [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(2, hp.model.tier + 1)
            ]
        else:
            self.tiers = [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier,
                     num_class=10) for tier in range(1, hp.model.tier + 1)
            ]
        self.tiers = nn.ModuleList(
            [None] + [nn.DataParallel(tier).cuda() for tier in self.tiers])

    def forward(self, x, tier_num):
        assert tier_num > 0, 'tier_num should be larger than 0, got %d' % tier_num

        return self.tiers[tier_num](x)

    def sample(self, condition):
        x = None
        if condition is not None:
            # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
            x = condition
        else:
            seq = torch.LongTensor([[0]])
        # input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
        if x is not None:
            audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
        else:
            audio_lengths = torch.LongTensor([0]).cuda()
        ## Tier 1 ##
        tqdm.write('Tier 1')
        if self.args.timestep == 0:
            mu, std, pi = self.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            return temp

        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                    break
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                temp = sample_gmm(mu, std, pi)
                new_idx = audio_lengths.item() - 1
                x[:, m, new_idx] = temp[:, m, new_idx]

        ## Tier 2~N ##
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x)
            temp = sample_gmm(mu, std, pi)
            x = self.tierutil.interleave(x, temp, tier + 1)

        return x

    def load_tiers(self):
        for idx, chkpt_path in enumerate(self.infer_hp.checkpoints):
            checkpoint = torch.load(chkpt_path)

            hp = load_hparam_str(checkpoint['hp_str'])

            if self.hp != hp:
                print('Warning: hp different in file %s' % chkpt_path)

            self.tiers[idx + 1].load_state_dict(checkpoint['model'])

    def sample_dependence(self, condition, label, dependence_length):
        x = None
        if condition is not None:
            # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
            x = condition
        else:
            seq = torch.LongTensor([[0]])
        if x is not None:
            audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
        else:
            audio_lengths = torch.LongTensor([0]).cuda()
        for t in tqdm(range(self.args.timestep // self.t_div)):
            # audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                    break
                else:
                    class_label = torch.tensor(
                        label, dtype=torch.long) if isinstance(
                            label, int) else torch.LongTensor(label)
                    if m == 0:
                        mu, std, pi, h_t, h_c = self.tiers[1](
                            x[:, :, -dependence_length:],
                            audio_lengths,
                            class_label.cuda(non_blocking=True).unsqueeze(0),
                            save_hidden=True,
                            hidden_t=None,
                            hidden_c=None)
                    else:
                        mu, std, pi = self.tiers[1](
                            x[:, :, -dependence_length:],
                            audio_lengths,
                            class_label.cuda(non_blocking=True).unsqueeze(0),
                            save_hidden=False,
                            hidden_t=h_t,
                            hidden_c=h_c)
                temp = sample_gmm(mu, std, pi)
                new_idx = audio_lengths.item() - 1
                x[:, m, -1] = temp[:, m, new_idx]

        return x
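Note: a hypothetical end-to-end inference driver for the class above. The inference config path and the args fields are assumptions, and infer_hp.checkpoints must list one checkpoint per tier, in tier order, because load_tiers assigns them by index:

import argparse
import torch

hp = HParam('./config/blizzard_compressed_experiments.yaml')
infer_hp = HParam('./config/inference.yaml')  # hypothetical config file
args = argparse.Namespace(timestep=128, tier=hp.model.tier)  # assumed fields

model = MelNet(hp, args, infer_hp)
model.load_tiers()
model.eval()

with torch.no_grad():
    mel = model.sample(condition=None)  # unconditional generation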
Example #17
class AudioTextDataset(Dataset):
    def __init__(self, hp, args, train):
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # parse the dataset transcript to collect (wav_path, text) pairs
        self.root_dir = hp.data.path
        self.dataset = []
        if hp.data.name == 'KSS':
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                      'r') as f:
                lines = f.read().splitlines()
                for line in tqdm(lines):
                    wav_name, _, _, text, length, _ = line.split('|')

                    wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                    duration = float(length)
                    if duration < hp.audio.duration:
                        self.dataset.append((wav_path, text))

                # if len(self.dataset) > 100: break
        elif hp.data.name.startswith('Blizzard'):
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
                filenames = lines[::3]
                sentences = lines[1::3]
                for filename, sentence in tqdm(zip(filenames, sentences),
                                               total=len(filenames)):
                    wav_path = os.path.join(self.root_dir, 'wavn',
                                            filename + '.wav')
                    length = get_length(wav_path, hp.audio.sr)
                    if length < hp.audio.duration:
                        self.dataset.append((wav_path, sentence))

        else:
            raise NotImplementedError

        random.seed(123)
        random.shuffle(self.dataset)
        if train:
            self.dataset = self.dataset[:int(0.95 * len(self.dataset))]
        else:
            self.dataset = self.dataset[int(0.95 * len(self.dataset)):]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier

        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx][1]
        if self.hp.data.name == 'KSS':
            seq = text_to_sequence(text)
        elif self.hp.data.name.startswith('Blizzard'):
            seq = process_blizzard(text)

        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
        # print(text)

        return seq, source, target
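Note: AudioTextDataset yields (seq, source, target) items whose lengths vary per utterance, so PyTorch's default collation cannot batch them. A hypothetical zero-padding collate_fn sketch (pad_collate is not part of the project):

import torch

def pad_collate(batch):
    # Pad text sequences and spectrogram frames to the longest item in the batch.
    seqs, sources, targets = zip(*batch)
    seq_lens = [len(s) for s in seqs]
    max_seq = max(seq_lens)
    max_src = max(s.shape[-1] for s in sources)
    max_tgt = max(t.shape[-1] for t in targets)

    seq_pad = torch.zeros(len(batch), max_seq, dtype=torch.long)
    src_pad = torch.zeros(len(batch), sources[0].shape[0], max_src)
    tgt_pad = torch.zeros(len(batch), targets[0].shape[0], max_tgt)
    for i, (seq, src, tgt) in enumerate(batch):
        seq_pad[i, :len(seq)] = torch.as_tensor(seq, dtype=torch.long)
        src_pad[i, :, :src.shape[-1]] = torch.from_numpy(src)
        tgt_pad[i, :, :tgt.shape[-1]] = torch.from_numpy(tgt)
    return seq_pad, src_pad, tgt_pad, torch.LongTensor(seq_lens)

Usage would be DataLoader(dataset, batch_size=8, collate_fn=pad_collate).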
Example #18
class MelNet(nn.Module):
    def __init__(self, hp, args, infer_hp):
        super(MelNet, self).__init__()
        self.hp = hp
        self.args = args
        self.infer_hp = infer_hp
        self.f_div = f_div[hp.model.tier + 1]
        self.t_div = t_div[hp.model.tier]
        self.n_mels = hp.audio.n_mels

        self.tierutil = TierUtil(hp)

        if infer_hp.conditional:
            self.tiers = [
                TTS(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[1],
                    layers=hp.model.layers[0])
            ] + [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(2, hp.model.tier + 1)
            ]
        else:
            self.tiers = [
                Tier(hp=hp,
                     freq=hp.audio.n_mels // self.f_div * f_div[tier],
                     layers=hp.model.layers[tier - 1],
                     tierN=tier) for tier in range(1, hp.model.tier + 1)
            ]
        self.tiers = nn.ModuleList(
            [None] + [nn.DataParallel(tier).cuda() for tier in self.tiers])

    def forward(self, x, tier_num):
        assert tier_num > 0, 'tier_num should be larger than 0, got %d' % tier_num

        return self.tiers[tier_num](x)

    def sample(self, condition):
        x = None
        seq = torch.from_numpy(process_blizzard(condition)).long().unsqueeze(0)
        input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
        audio_lengths = torch.LongTensor([0]).cuda()

        ## Tier 1 ##
        tqdm.write('Tier 1')
        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            if x is None:
                x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
            else:
                x = torch.cat(
                    [x,
                     torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                    dim=-1)
            for m in tqdm(range(self.n_mels // self.f_div)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    mu, std, pi, _ = self.tiers[1](x, seq, input_lengths,
                                                   audio_lengths)
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                temp = sample_gmm(mu, std, pi)
                x[:, m, t] = temp[:, m, t]

        ## Tier 2~N ##
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x = self.tierutil.interleave(x, temp, tier + 1)

        return x

    def load_tiers(self):
        for idx, chkpt_path in enumerate(self.infer_hp.checkpoints):
            checkpoint = torch.load(chkpt_path)
            hp = load_hparam_str(checkpoint['hp_str'])

            if self.hp != hp:
                print('Warning: hp different in file %s' % chkpt_path)

            # print("Looking for:")
            # print(chkpt_path)
            # print("Tier")
            # print(idx+1)
            # # print(self.tiers)
            # print(self.tiers[idx+1])

            self.tiers[idx + 1].load_state_dict(checkpoint['model'])