Example #1
    def do_test_generate(self,
                         paths,
                         step,
                         data_path,
                         test_index,
                         deterministic=False,
                         use_half=False,
                         verbose=False):
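        """Generate audio for the test utterances at a given training step.

        For the first two utterances of each test speaker, write the
        ground-truth waveform, a reconstruction conditioned on the original
        speaker, and a voice-transferred version generated with the speaker
        vectors in reversed order.
        """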
        k = step // 1000
        test_index = [x[:2] if len(x) > 0 else [] for x in test_index]
        dataset = env.MultispeakerDataset(test_index, data_path)
        loader = DataLoader(dataset, shuffle=False)
        data = [x for x in loader]
        n_points = len(data)
        gt = [(x[0].float() + 0.5) / (2**15 - 0.5) for speaker, x in data]
        extended = [
            np.concatenate([
                np.zeros(self.pad_left_encoder(), dtype=np.float32), x,
                np.zeros(self.pad_right(), dtype=np.float32)
            ]) for x in gt
        ]
        speakers = [
            torch.FloatTensor(speaker[0].float()) for speaker, x in data
        ]
        maxlen = max([len(x) for x in extended])
        aligned = [
            torch.cat([torch.FloatTensor(x),
                       torch.zeros(maxlen - len(x))]) for x in extended
        ]
        os.makedirs(paths.gen_path(), exist_ok=True)
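        # The first half of the batch uses the original speaker vectors
        # (reconstruction); the second half repeats the same waveforms with
        # the speaker vectors reversed (voice transfer), matching the
        # *_generated / *_transferred files written below.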
        out, _, _, _ = self.forward_generate(torch.stack(
            speakers + list(reversed(speakers)), dim=0).cuda(),
                                             torch.stack(aligned + aligned,
                                                         dim=0).cuda(),
                                             verbose=verbose,
                                             use_half=use_half)

        logger.log(f'out: {out.size()}')
        for i, x in enumerate(gt):
            librosa.output.write_wav(
                f'{paths.gen_path()}/{k}k_steps_{i}_target.wav',
                x.cpu().numpy(),
                sr=sample_rate)
            audio = out[i][:len(x)].cpu().numpy()
            librosa.output.write_wav(
                f'{paths.gen_path()}/{k}k_steps_{i}_generated.wav',
                audio,
                sr=sample_rate)
            audio_tr = out[n_points + i][:len(x)].cpu().numpy()
            librosa.output.write_wav(
                f'{paths.gen_path()}/{k}k_steps_{i}_transferred.wav',
                audio_tr,
                sr=sample_rate)
Example #2
    def do_generate(self,
                    paths,
                    step,
                    data_path,
                    test_index,
                    deterministic=False,
                    use_half=False,
                    verbose=False):
        """Speech generation from command-line (not during test)
        """
        k = step // 1000
        test_index = [x[:10] if len(x) > 0 else [] for x in test_index]
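        # Empty out the utterance lists of the first three speakers so nothing
        # is generated for them; they are kept as empty lists (rather than
        # removed) so the speaker one-hot dimension is unchanged.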
        test_index[0] = []
        test_index[1] = []
        test_index[2] = []
        # test_index[3] = []

        dataset = env.MultispeakerDataset(test_index, data_path)
        loader = DataLoader(dataset, shuffle=False)
        data = [x for x in loader]
        n_points = len(data)
        gt = [(x[0].float() + 0.5) / (2**15 - 0.5) for speaker, x in data]
        extended = [
            np.concatenate([
                np.zeros(self.pad_left_encoder(), dtype=np.float32), x,
                np.zeros(self.pad_right(), dtype=np.float32)
            ]) for x in gt
        ]
        speakers = [
            torch.FloatTensor(speaker[0].float()) for speaker, x in data
        ]

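        # Voice-conversion conditioning: a 30-dimensional one-hot vector
        # selecting speaker index 1, repeated for every utterance (see the
        # commented-out alternatives below for other target speakers).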
        vc_speakers = [
            torch.FloatTensor((np.arange(30) == 1).astype(np.float32))
            for _ in range(10)
        ]
        # vc_speakers = [torch.FloatTensor((np.arange(30) == 14).astype(np.float32)) for _ in range(20)]
        # vc_speakers = [torch.FloatTensor((np.arange(30) == 23).astype(np.float32)) for _ in range(20)]
        # vc_speakers = [torch.FloatTensor((np.arange(30) == 4).astype(np.float32)) for _ in range(20)]
        maxlen = max([len(x) for x in extended])
        aligned = [
            torch.cat([torch.FloatTensor(x),
                       torch.zeros(maxlen - len(x))]) for x in extended
        ]
        os.makedirs(paths.gen_dir(), exist_ok=True)
        # out = self.forward_generate(torch.stack(speakers + list(reversed(speakers)), dim=0).cuda(), torch.stack(aligned + aligned, dim=0).cuda(), verbose=verbose, use_half=use_half)
        out = self.forward_generate(torch.stack(vc_speakers, dim=0).cuda(),
                                    torch.stack(aligned, dim=0).cuda(),
                                    verbose=verbose,
                                    use_half=use_half)
        # for i, x in enumerate(gt) :
        #     librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_target.wav', x.cpu().numpy(), sr=sample_rate)
        #     audio = out[i][:len(x)].cpu().numpy()
        #     librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_generated.wav', audio, sr=sample_rate)
        #     audio_tr = out[n_points+i][:len(x)].cpu().numpy()
        #     librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_transferred.wav', audio_tr, sr=sample_rate)

        for i, x in enumerate(gt):
            # librosa.output.write_wav(f'{paths.gen_dir()}/gsb_{i+1:04d}.wav', x.cpu().numpy(), sr=sample_rate)
            # librosa.output.write_wav(f'{paths.gen_dir()}/gt_gsb_{i+1:03d}.wav', x.cpu().numpy(), sr=sample_rate)
            # audio = out[i][:len(x)].cpu().numpy()
            # librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_generated.wav', audio, sr=sample_rate)
            # audio_tr = out[n_points+i][:len(x)].cpu().numpy()
            audio_tr = out[i][:self.pad_left_encoder() + len(x)].cpu().numpy()
            # librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_transferred.wav', audio_tr, sr=sample_rate)
            librosa.output.write_wav(f'{paths.gen_dir()}/gsb_{i + 1:04d}.wav',
                                     audio_tr,
                                     sr=sample_rate)
Example #3
    def do_test(self, writer, epoch, step, data_path, test_index):
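        """Run the model over the held-out test set and log the averaged
        coarse/fine losses, VQ penalties and entropy to TensorBoard.
        """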
        dataset = env.MultispeakerDataset(test_index, data_path)
        criterion = nn.NLLLoss().cuda()
        # k = 0
        # saved_k = 0
        pad_left = self.pad_left()
        pad_left_encoder = self.pad_left_encoder()
        pad_left_decoder = self.pad_left_decoder()
        extra_pad_right = 0
        pad_right = self.pad_right() + extra_pad_right
        window = 16 * self.total_scale()

        test_loader = DataLoader(
            dataset,
            collate_fn=lambda batch: env.collate_multispeaker_samples(
                pad_left, window, pad_right, batch),
            batch_size=16,
            num_workers=2,
            shuffle=False,
            pin_memory=True)

        running_loss_c = 0.
        running_loss_f = 0.
        running_loss_vq = 0.
        running_loss_vqc = 0.
        running_entropy = 0.
        running_max_grad = 0.
        running_max_grad_name = ""

        for i, (speaker, wave16) in enumerate(test_loader):
            speaker = speaker.cuda()
            wave16 = wave16.cuda()

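            # Shift the signed 16-bit samples to [0, 65535] and split them
            # into the top 8 bits (coarse) and bottom 8 bits (fine), each
            # rescaled to [-1, 1].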
            coarse = (wave16 + 2**15) // 256
            fine = (wave16 + 2**15) % 256

            coarse_f = coarse.float() / 127.5 - 1.
            fine_f = fine.float() / 127.5 - 1.
            total_f = (wave16.float() + 0.5) / 32767.5

            noisy_f = total_f

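            # Decoder input: three channels per time step (coarse, fine, and
            # the coarse value one step ahead), cropped so that only the
            # decoder's own left padding remains.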
            x = torch.cat([
                coarse_f[:,
                         pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                fine_f[:,
                       pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                coarse_f[:, pad_left - pad_left_decoder + 1:1 -
                         pad_right].unsqueeze(-1),
            ],
                          dim=2)
            y_coarse = coarse[:, pad_left + 1:1 - pad_right]
            y_fine = fine[:, pad_left + 1:1 - pad_right]

            translated = noisy_f[:, pad_left - pad_left_encoder:]

            p_cf, vq_pen, encoder_pen, entropy = self(speaker, x, translated)
            p_c, p_f = p_cf
            loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
            loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
            # encoder_weight = 0.01 * min(1, max(0.1, step / 1000 - 1))
            # loss = loss_c + loss_f + vq_pen + encoder_weight * encoder_pen

            running_loss_c += loss_c.item()
            running_loss_f += loss_f.item()
            running_loss_vq += vq_pen.item()
            running_loss_vqc += encoder_pen.item()
            running_entropy += entropy

        avg_loss_c = running_loss_c / (i + 1)
        avg_loss_f = running_loss_f / (i + 1)
        avg_loss_vq = running_loss_vq / (i + 1)
        avg_loss_vqc = running_loss_vqc / (i + 1)
        avg_entropy = running_entropy / (i + 1)

        k = step // 1000

        # tensorboard writer
        writer.add_scalars(
            'Test/loss_group', {
                'loss_c': avg_loss_c,
                'loss_f': avg_loss_f,
                'vq': avg_loss_vq,
                'vqc': avg_loss_vqc,
                'entropy': avg_entropy,
            }, step - 1)
Example #4
        index = pickle.load(f)

    logger.log(f"len of vctk index pkl object is {len(index)}"
               )  # should be equal to total number of speakers in the dataset
    # logger.log(f"index.pkl file --- index[:5] {index[:5]}")
    # logger.log(f"index.pkl file --- index[0][:5] {index[0][:5]}")

    test_index = [
        x[:args.test_utts_per_speaker] if i < args.test_speakers else []
        for i, x in enumerate(index)
    ]  # take first 30 utts from args.test_speakers speakers as test data
    train_index = [
        x[args.test_utts_per_speaker:] if i < args.test_speakers else x
        for i, x in enumerate(index)
    ]  # rest of utts are training data from each speaker
    dataset = env.MultispeakerDataset(train_index, data_path)
elif dataset_type == 'single':
    data_path = config.single_speaker_data_path
    with open(f'{data_path}/dataset_ids.pkl', 'rb') as f:
        index = pickle.load(f)
    test_index = index[-args.test_speakers:] + index[:args.test_speakers]
    train_index = index[:-args.test_speakers]
    dataset = env.AudiobookDataset(train_index, data_path)
else:
    raise RuntimeError('bad dataset type')

print(f'dataset size: {len(dataset)}')

model = model_fn(dataset)

if use_half:
Example #5
    def do_generate(self,
                    paths,
                    data_path,
                    index,
                    test_speakers,
                    test_utts_per_speaker,
                    use_half=False,
                    verbose=False,
                    only_discrete=False):
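        """Generate from the selected test utterances and save, per utterance,
        the VQ-VAE token sequence, the decoder input vectors and, when audio
        is returned, the generated waveform. The VQ-VAE codebook itself is
        saved once up front.
        """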

        # Set the speaker to generate for each utterance
        # speaker_id = 1  # the speaker id to condition the model on for generation # TODO make this a CLA?

        # Get the utts we have chosen to generate from 'index'
        # 'index' contains ALL utts in dataset
        test_index = []
        for i, x in enumerate(index):
            if test_speakers == 0 or i < test_speakers:
                if test_utts_per_speaker == 0:
                    # if test_utts_per_speaker is 0, then use ALL utts for the speaker
                    test_index.append(x)
                else:
                    test_index.append(x[:test_utts_per_speaker])
            else:
                # appended as an empty list so that the speaker one-hots are
                # created with the correct dimension
                test_index.append([])

        # test_index = [x[:test_utts_per_speaker] if len(x) > 0 else [] for i, x in enumerate(test_index)]

        # logger.log('second:')
        # logger.log(test_index)

        # make containing directories
        os.makedirs(f'{paths.gen_path()}embeddings', exist_ok=True)
        os.makedirs(f'{paths.gen_path()}vqvae_tokens', exist_ok=True)
        os.makedirs(f'{paths.gen_path()}decoder_input_vectors', exist_ok=True)

        # TODO Save embedding matrix to disk for plotting and analysis
        torch.save(self.vq.embedding0.clone().detach(),
                   f'{paths.gen_path()}embeddings/vqvae_codebook.pt')

        dataset = env.MultispeakerDataset(test_index,
                                          data_path,
                                          return_filename=True)
        loader = DataLoader(dataset, batch_size=1, shuffle=False)

        for speaker, x, filename in loader:  # NB: the loop body below is only designed for batch_size == 1 for now

            print("speaker.size()", speaker.size())
            print("x.size()", x.size())
            print("filename", filename)

            # data = [x for x in loader]

            # logger.log("data:")
            # logger.log(f"len(data) = {len(data)}")
            # logger.log(f"data[0]: {data[0]}")

            # n_points = len(data)
            # gt = [(x[0].float() + 0.5) / (2 ** 15 - 0.5) for speaker, x, filename in data]
            # extended = [np.concatenate(
            #     [np.zeros(self.pad_left_encoder(), dtype=np.float32), x, np.zeros(self.pad_right(), dtype=np.float32)]) for
            #             x in gt]

            gt = (x[0].float() + 0.5) / (2**15 - 0.5)
            extended = np.concatenate([
                np.zeros(self.pad_left_encoder(), dtype=np.float32), gt,
                np.zeros(self.pad_right(), dtype=np.float32)
            ])

            # TODO use speaker id from dataset
            speakers = [
                torch.FloatTensor(speaker[0].float())
            ]  # TODO seems to only have 3 speakers (as per the CLA); check the dataset

            total_test_utts = test_speakers * test_utts_per_speaker
            print("test_speakers", test_speakers)
            print("test_utts_per_speaker", test_utts_per_speaker)

            # (np.arange(30) == 1) is a one hot conditioning vector indicating speaker 2
            # vc_speakers = [torch.FloatTensor((np.arange(30) == speaker_id).astype(np.float)) for _ in range(total_test_utts)]
            # speakers = vc_speakers

            print("speakers:")
            print("speakers", speakers)
            print("len(speakers)", len(speakers))
            print("speakers[0].size()", speakers[0].size())
            print("torch.stack(speakers, dim=0).size()",
                  torch.stack(speakers, dim=0).size())

            # maxlen = max([len(x) for x in extended])
            print("extended.shape", extended.shape)
            maxlen = len(extended)

            # aligned = [torch.cat([torch.FloatTensor(x), torch.zeros(maxlen - len(x))]) for x in extended]
            aligned = [torch.FloatTensor(extended)]
            print("torch.stack(aligned, dim=0).size()",
                  torch.stack(aligned, dim=0).size())

            # out = self.forward_generate(torch.stack(speakers + list(reversed(speakers)), dim=0).cuda(), torch.stack(aligned + aligned, dim=0).cuda(), verbose=verbose, use_half=use_half, only_discrete=only_discrete)
            out, discrete, index_atom, index_group = self.forward_generate(
                torch.stack(speakers, dim=0).cuda(),
                torch.stack(aligned, dim=0).cuda(),
                verbose=verbose,
                use_half=use_half,
                only_discrete=only_discrete)

            if out is not None:
                logger.log(f'out[0]: {out[0]}')
                logger.log(f'out: {out.size()}')
            logger.log(f'index_atom.size(): {index_atom.size()}')
            # logger.log(f'index_atom[0]: {index_atom[0]}')
            logger.log(f'index_atom[0].size(): {index_atom[0].size()}')
            logger.log(f'index_group.size(): {index_group.size()}')
            # logger.log(f'index_group[0]: {index_group[0]}')
            logger.log(f'index_group[0].size(): {index_group[0].size()}')

            # for i, x in enumerate(gt) :
            #     librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_target.wav', x.cpu().numpy(), sr=sample_rate)
            #     audio = out[i][:len(x)].cpu().numpy()
            #     librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_generated.wav', audio, sr=sample_rate)
            #     audio_tr = out[n_points+i][:len(x)].cpu().numpy()
            #     librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_transferred.wav', audio_tr, sr=sample_rate)

            ######################################
            # Generate atom and group data to save to disk
            index_atom = index_atom.squeeze()
            index_group = index_group.squeeze()
            assert index_atom.size() == index_group.size()
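            # One token per entry of index_atom/index_group, formatted as
            # "<group>_<atom>".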
            vqvae_tokens = []
            for i in range(len(index_atom)):
                atom_id = int(index_atom[i])
                group_id = int(index_group[i])
                vqvae_tokens.append(f"{group_id}_{atom_id}")
            vqvae_tokens = '\n'.join(vqvae_tokens)

            ######################################
            # Save files to disk

            # Discrete vqvae symbols
            # for i, x in enumerate(gt):
            # os.makedirs(f'{paths.gen_path()}groups', exist_ok=True)
            filename_noext = f'{filename[0]}'
            with open(f'{paths.gen_path()}vqvae_tokens/{filename_noext}.txt',
                      'w') as f:
                f.write(vqvae_tokens)

            # TODO The ACTUAL embeddings fed into the decoder
            # TODO (average of atoms in group weighted according to their distance from encoder output)
            torch.save(
                discrete,
                f'{paths.gen_path()}decoder_input_vectors/{filename_noext}.pt')

            # discrete vqvae tokens for analysis and modification/pronunciation correction
            # torch.save(index_atom, f'{paths.gen_path()}atoms/{filename_noext}_atom.pt')
            # torch.save(index_group, f'{paths.gen_path()}groups/{filename_noext}_group.pt')
            # TODO currently we are saving the entire matrix of discrete tokens for all utts multiple times
            # TODO need to change this so that we are saving a single vector of discrete tokens for each input test utt
            # TODO create more informative filenames for test generated utts:
            # use the original VCTK filename and include the speaker that was
            # used to condition the model (create a mapping from one-hot
            # speaker ids [0-30] to VCTK speaker names [pxxx-pzzz] to do this)

            # print(len(index_atom.tolist()))
            # print(len(index_group.tolist()))
            # print(index_atom.tolist())
            # print(index_group.tolist())

            # save wav file for listening
            if out is not None:
                audio_tr = out[0][:self.pad_left_encoder() +
                                  len(gt)].cpu().numpy()
                wav_path = f'{paths.gen_path()}{filename_noext}.wav'
                librosa.output.write_wav(wav_path, audio_tr, sr=sample_rate)
                print(f"Saved audio to {wav_path}")