Beispiel #1
0
    def __init__(self, hp, args, train):
        """Load (wav_path, parsed_sentence) pairs from a Blizzard CSV listing.

        Reads ``blizzard_train.csv`` or ``blizzard_test.csv`` from
        ``hp.data.path``. The first row is skipped as a header; every other
        row must have four columns: original sentence, parsed sentence, wav
        path and wav length. Rows are kept when the length is strictly
        between 0.4 and ``hp.audio.duration`` (presumably seconds).

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.audio.sr`` and ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy selects the train CSV, falsy the test CSV.

        Raises:
            ValueError: if a non-empty row does not have exactly four
                columns or its length column is not a number.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        self.root_dir = hp.data.path
        self.dataset = []

        txt_path = os.path.join(
            self.root_dir,
            'blizzard_train.csv' if train else 'blizzard_test.csv')
        # newline='' is what the csv module expects so that it can handle
        # line endings (including ones inside quoted fields) itself.
        with open(txt_path, 'r', newline='') as read_obj:
            csv_reader = csv.reader(read_obj)
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(csv_reader, None)
            for row in csv_reader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline at the end of the file).
                    continue
                _, parsed_sentence, wav_path, wav_length = row
                if 0.4 < float(wav_length) < hp.audio.duration:
                    self.dataset.append((wav_path, parsed_sentence))
        # A private RNG with a fixed seed gives the same order on every run
        # without touching the module-level random state.
        random.Random(123).shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
Beispiel #2
0
    def __init__(self, hp, args, train):
        """Collect audio files under ``hp.data.path`` and keep one split.

        Every file matching ``hp.data.extension`` anywhere below
        ``hp.data.path`` is gathered, shuffled with a fixed seed, and divided
        95% / 5%. No duration filtering is applied.

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.data.extension``, ``hp.audio.sr`` and
                ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy keeps the first 95% of the shuffled list, falsy
                keeps the remaining 5%.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        # glob returns matches in whatever order the filesystem lists them.
        # Sorting first makes the seeded shuffle, and therefore the
        # train/test partition, the same on every run and machine.
        self.file_list = sorted(
            glob.glob(os.path.join(hp.data.path, '**', hp.data.extension),
                      recursive=True))

        # NOTE: this reseeds the module-level RNG, which also affects any
        # later use of the `random` module in the process.
        random.seed(123)
        random.shuffle(self.file_list)
        split_at = int(0.95 * len(self.file_list))
        if train:
            self.file_list = self.file_list[:split_at]
        else:
            self.file_list = self.file_list[split_at:]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
Beispiel #3
0
    def __init__(self, hp, args, train):
        """Gather matching audio files and retain the train or test portion.

        Files matching ``hp.data.extension`` are searched recursively under
        ``hp.data.path``, shuffled with seed 123 and split 95% / 5%.

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.data.extension``, ``hp.audio.sr`` and
                ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy keeps the leading 95% of the shuffled files, falsy
                keeps the trailing 5%.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        pattern = os.path.join(hp.data.path, '**', hp.data.extension)
        # Sort before shuffling: glob's result order depends on the
        # filesystem, and an unsorted input would make the seeded shuffle
        # (and so the split) differ between runs or machines.
        self.file_list = sorted(glob.glob(pattern, recursive=True))

        # NOTE: seeds the module-level RNG, not a private one.
        random.seed(123)
        random.shuffle(self.file_list)
        boundary = int(0.95 * len(self.file_list))
        if train:
            self.file_list = self.file_list[:boundary]
        else:
            self.file_list = self.file_list[boundary:]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
Beispiel #4
0
 def __init__(self, hp, args, train):
     """Load wav paths from a Blizzard CSV listing.

     Reads ``blizzard_train.csv`` or ``blizzard_test.csv`` from
     ``hp.data.path``. The first row is skipped as a header; each remaining
     row must have four columns: original sentence, parsed sentence, wav
     path and wav length. A wav path is kept when its length is strictly
     between 0.4 and ``hp.audio.duration`` (presumably seconds).

     Args:
         hp: hyper-parameter object; reads ``hp.data.path``,
             ``hp.audio.sr`` and ``hp.audio.duration``.
         args: run arguments; reads ``args.tier``.
         train: truthy selects the train CSV, falsy the test CSV.

     Raises:
         ValueError: if a non-empty row does not have exactly four columns
             or its length column is not a number.
     """
     self.hp = hp
     self.args = args
     self.train = train
     self.data = hp.data.path
     self.melgen = MelGen(hp)
     self.tierutil = TierUtil(hp)
     self.file_list = []
     self.root_dir = hp.data.path
     txt_path = os.path.join(
         self.root_dir,
         'blizzard_train.csv' if train else 'blizzard_test.csv')
     # newline='' lets the csv module do its own line-ending handling.
     with open(txt_path, 'r', newline='') as read_obj:
         csv_reader = csv.reader(read_obj)
         # Skip the header; the default avoids StopIteration on an empty
         # file.
         next(csv_reader, None)
         for row in csv_reader:
             if not row:
                 # Blank lines come back from csv.reader as [].
                 continue
             _, _, wav_path, wav_length = row
             if 0.4 < float(wav_length) < hp.audio.duration:
                 self.file_list.append(wav_path)
     # Fixed-seed private RNG: same order every run, global RNG untouched.
     random.Random(123).shuffle(self.file_list)
     self.wavlen = int(hp.audio.sr * hp.audio.duration)
     self.tier = self.args.tier
Beispiel #5
0
    def __init__(self, hp, args, train):
        """Build (wav_path, text) pairs for the KSS or a Blizzard dataset.

        * ``hp.data.name == 'KSS'``: reads ``transcript.v.1.4.txt`` where
          each line has six ``|``-separated fields; the first is the wav
          name, the fourth the text and the fifth the length.
        * ``hp.data.name`` starting with ``'Blizzard'``: reads
          ``prompts.gui`` in groups of three lines (file name, sentence, and
          a third line that is ignored) and measures each wav with
          ``get_length``.

        Only entries shorter than ``hp.audio.duration`` are kept. The result
        is shuffled with seed 123 and split 95% / 5%.

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.data.name``, ``hp.audio.sr`` and ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy keeps the first 95% of the shuffled pairs, falsy
                keeps the remaining 5%.

        Raises:
            NotImplementedError: if ``hp.data.name`` is neither ``'KSS'``
                nor prefixed with ``'Blizzard'``.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        self.root_dir = hp.data.path
        self.dataset = []
        if hp.data.name == 'KSS':
            with open(os.path.join(self.root_dir, 'transcript.v.1.4.txt'),
                      'r') as f:
                lines = f.read().splitlines()
            for line in tqdm(lines):
                wav_name, _, _, text, length, _ = line.split('|')
                wav_path = os.path.join(self.root_dir, 'kss', wav_name)
                if float(length) < hp.audio.duration:
                    self.dataset.append((wav_path, text))
        elif hp.data.name.startswith('Blizzard'):
            with open(os.path.join(self.root_dir, 'prompts.gui'), 'r') as f:
                lines = f.read().splitlines()
            # Records are three lines long: name, sentence, ignored line.
            filenames = lines[::3]
            sentences = lines[1::3]
            for filename, sentence in tqdm(zip(filenames, sentences),
                                           total=len(filenames)):
                wav_path = os.path.join(self.root_dir, 'wavn',
                                        filename + '.wav')
                length = get_length(wav_path, hp.audio.sr)
                if length < hp.audio.duration:
                    self.dataset.append((wav_path, sentence))
        else:
            raise NotImplementedError(
                'Unsupported dataset name: %r' % (hp.data.name,))

        # NOTE: seeds the module-level RNG, not a private one.
        random.seed(123)
        random.shuffle(self.dataset)
        split_at = int(0.95 * len(self.dataset))
        if train:
            self.dataset = self.dataset[:split_at]
        else:
            self.dataset = self.dataset[split_at:]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
Beispiel #6
0
    def __init__(self, hp, args, train):
        """Load (wav_path, sentence) pairs from a two-line-per-record prompt file.

        Reads ``complete_blizzard/train_prompts.gui`` or
        ``complete_blizzard/test_prompts.gui`` under ``hp.data.path``. Even
        lines (0, 2, ...) are taken as wav paths and odd lines as the
        matching sentences. No length filtering happens here; every pair in
        the file is kept. If the file has an odd number of lines the
        trailing unpaired wav path is dropped.

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.audio.sr`` and ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy selects the train prompt file, falsy the test one.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        self.root_dir = hp.data.path

        txt_path = os.path.join(
            self.root_dir, 'complete_blizzard/train_prompts.gui'
            if train else 'complete_blizzard/test_prompts.gui')
        with open(txt_path, 'r') as f:
            lines = f.read().splitlines()
        wav_paths = lines[::2]
        sentences = lines[1::2]
        # tqdm is kept only for the progress display; the pairs are stored
        # unchanged.
        self.dataset = list(
            tqdm(zip(wav_paths, sentences),
                 desc='Audio/text data loader for %s' % txt_path,
                 total=len(wav_paths)))

        # NOTE: seeds the module-level RNG, not a private one.
        random.seed(123)
        random.shuffle(self.dataset)
        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
Beispiel #7
0
    def __init__(self, hp, args, infer_hp):
        """Assemble the stack of tier sub-networks.

        When ``infer_hp.conditional`` is truthy the first tier is a ``TTS``
        module and tiers 2..``hp.model.tier`` are ``Tier`` modules; otherwise
        tiers 1..``hp.model.tier`` are all ``Tier`` modules. Each one is
        wrapped in ``nn.DataParallel`` and moved with ``.cuda()``.
        ``self.tiers`` is an ``nn.ModuleList`` whose index 0 holds ``None``
        so that tier ``k`` sits at index ``k``.

        Args:
            hp: hyper-parameter object; reads ``hp.model.tier``,
                ``hp.model.layers`` and ``hp.audio.n_mels``.
            args: run arguments, stored as-is.
            infer_hp: inference settings; reads ``infer_hp.conditional``.
        """
        super(MelNet, self).__init__()
        self.hp = hp
        self.args = args
        self.infer_hp = infer_hp
        self.f_div = f_div[hp.model.tier + 1]
        self.t_div = t_div[hp.model.tier]
        self.n_mels = hp.audio.n_mels

        self.tierutil = TierUtil(hp)

        def make_tier(index):
            # One unconditional Tier for the given 1-based tier number.
            return Tier(hp=hp,
                        freq=hp.audio.n_mels // self.f_div * f_div[index],
                        layers=hp.model.layers[index - 1],
                        tierN=index)

        if infer_hp.conditional:
            # The TTS module takes the place of tier 1.
            stack = [
                TTS(hp=hp,
                    freq=hp.audio.n_mels // self.f_div * f_div[1],
                    layers=hp.model.layers[0])
            ]
            first_plain_tier = 2
        else:
            stack = []
            first_plain_tier = 1
        for index in range(first_plain_tier, hp.model.tier + 1):
            stack.append(make_tier(index))

        # Index 0 is a placeholder so list positions match tier numbers.
        wrapped = [None]
        for module in stack:
            wrapped.append(nn.DataParallel(module).cuda())
        self.tiers = nn.ModuleList(wrapped)
def reconstruct_audio(filename, tier_to_breakdown):
    """Re-interleave per-tier mel pieces, check them, and save the audio.

    For tiers 2 through 6 the tier's source and target are interleaved with
    ``TierUtil.interleave``. For tiers 2-5 the result must equal the source
    of the next tier; the tier-6 result is converted to audio and written
    via ``MelGen.save_audio`` under the name ``'reconstructed_' + filename``.

    Args:
        filename: name appended to ``'reconstructed_'`` for the output.
        tier_to_breakdown: mapping from tier number to a
            ``(source, target)`` pair of NumPy arrays; entries 1 through 6
            are read.

    Raises:
        AssertionError: if tier 2's source differs from tier 1's target, or
            if an interleaved result differs from the next tier's source.
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None

    # Verify that tier 2 is conditionally generated from just tier 1.
    # This must read the `tier_to_breakdown` argument; no name `breakdown`
    # is defined in this function.
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]
            ).all(), "Tier 2 not created from Tier 1"

    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]

        # Add a leading dimension of size 1 before interleaving and strip
        # it again afterwards.
        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor,
                                                       target_tensor, tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]

        # Verify that interleaving the source and target of the current tier
        # reproduces the source of the next tier.
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(
            ), "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            final_reconstruction = reconstructed_mel
    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
def deconstruct_audio(wav):
    """Split a waveform's normalized mel into per-tier (source, target) pairs.

    Args:
        wav: waveform passed to ``MelGen.get_normalized_mel``.

    Returns:
        dict mapping tier numbers 1-6 to the ``(source, target)`` pair
        returned by ``TierUtil.cut_divide_tiers(mel, tier)``, plus key 7
        mapped to ``(mel, mel)``.
    """
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    mel = melgen.get_normalized_mel(wav)

    breakdown_by_tier = {}
    for tier_idx in range(1, 7):
        src, tgt = tierutil.cut_divide_tiers(mel, tier_idx)
        print("Tier %d has source dims: %s, target dims %s" % (tier_idx, src.shape, tgt.shape))
        breakdown_by_tier[tier_idx] = (src, tgt)

    # Key 7 holds the full mel as both source and target.
    breakdown_by_tier[7] = (mel, mel)
    return breakdown_by_tier
Beispiel #10
0
    def __init__(self, hp, args, train):
        """Load wav paths from a two-line-per-record prompt file.

        Reads ``datasets/complete_blizzard/train_prompts.gui`` or
        ``datasets/complete_blizzard/test_prompts.gui`` (relative to the
        working directory, not to ``hp.data.path``) and keeps every even
        line (0, 2, ...) as a wav path. No length filtering happens here.

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.audio.sr`` and ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy selects the train prompt file, falsy the test one.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        txt_path = 'datasets/complete_blizzard/train_prompts.gui' if train else 'datasets/complete_blizzard/test_prompts.gui'
        with open(txt_path, 'r') as f:
            lines = f.read().splitlines()
        wav_paths = lines[::2]
        # tqdm is kept only for the progress display; paths are stored
        # unchanged.
        self.file_list = list(
            tqdm(wav_paths, desc='Audio data loader', total=len(wav_paths)))

        # Fixed seed so the data always comes in the same order.
        # NOTE: this seeds the module-level RNG, not a private one.
        random.seed(123)
        random.shuffle(self.file_list)

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier
Beispiel #11
0
    def __init__(self, hp, args, train):
        """Build (wav_path, text) pairs from ``transcript.v.1.2.txt``.

        Each transcript line has five ``|``-separated fields; the first is
        the wav name and the fourth the text. The wav name is rewritten by
        dropping its first two and last four characters and appending
        ``'.wav'``, then resolved under ``<hp.data.path>/wavs``. Every wav
        is loaded with ``read_wav_np`` to measure its duration, and only
        those shorter than ``hp.audio.duration`` are kept. The result is
        shuffled with seed 123 and split 95% / 5%.

        Args:
            hp: hyper-parameter object; reads ``hp.data.path``,
                ``hp.audio.sr`` and ``hp.audio.duration``.
            args: run arguments; reads ``args.tier``.
            train: truthy keeps the first 95% of the shuffled pairs, falsy
                keeps the remaining 5%.
        """
        self.hp = hp
        self.args = args
        self.train = train
        self.data = hp.data.path
        self.melgen = MelGen(hp)
        self.tierutil = TierUtil(hp)

        self.root_dir = hp.data.path
        self.dataset = []
        with open(os.path.join(self.root_dir, 'transcript.v.1.2.txt'),
                  'r') as f:
            lines = f.read().splitlines()
        # The transcript is fully read above, so it need not stay open while
        # the audio files are decoded.
        for line in lines:
            wav_name, _, _, text, _ = line.split('|')
            wav_name = wav_name[2:-4] + '.wav'

            wav_path = os.path.join(self.root_dir, 'wavs', wav_name)
            wav = read_wav_np(wav_path)
            duration = len(wav) / hp.audio.sr
            if duration < hp.audio.duration:
                self.dataset.append((wav_path, text))

        # NOTE: seeds the module-level RNG, not a private one.
        random.seed(123)
        random.shuffle(self.dataset)
        split_at = int(0.95 * len(self.dataset))
        if train:
            self.dataset = self.dataset[:split_at]
        else:
            self.dataset = self.dataset[split_at:]

        self.wavlen = int(hp.audio.sr * hp.audio.duration)
        self.tier = self.args.tier