Beispiel #1
0
def ppg_windowed(example_wav, hop_ms):
    """Compute a PPG for the example waveform using windowed extraction.

    Calls ``features.get_ppg`` with the ``'wav2vec2'`` backend, a
    ``max_window`` of 1.0 and an ``overlap`` of 0.1.

    Args:
        example_wav: Mapping that provides the waveform under ``'wav'`` and
            its sample rate under ``'sr'``.
        hop_ms: Not used by this function; kept as part of the signature.

    Returns:
        Whatever ``get_ppg`` returns for the given input.
    """
    # Imported lazily so the dependency is only loaded when this is called.
    from features import get_ppg

    waveform = example_wav['wav']
    sample_rate = example_wav['sr']
    return get_ppg(
        waveform,
        sample_rate,
        backend='wav2vec2',
        max_window=1.0,
        overlap=0.1,
    )
Beispiel #2
0
def main(cfg: DictConfig) -> None:
    """Preprocess a DAPS-like dataset (https://archive.org/details/daps_dataset).

    - Extracts features (PPG, f0, loudness, speaker embedding) for every wav
      returned by ``get_datafolder_files(cfg.dataset.wav_dir)``.
    - Saves PPG, f0 and loudness as one ``.pt`` file per input in their
      respective output directories, and all speaker embeddings as a single
      dict (keyed by file stem) in ``cfg.dataset.spk_embs_file``.
    - Builds and saves the train and test file lists.

    Args:
        cfg: Configuration object; the ``dataset``, ``data`` and ``train``
            sections are read here.

    Raises:
        FileExistsError: If one of the feature output paths already exists
            but is not a directory.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(OmegaConf.to_yaml(cfg.dataset))
    print(OmegaConf.to_yaml(cfg.data))
    print('device:', device)

    # folders preparation
    for out_dir_path in [
            cfg.dataset.f0_dir, cfg.dataset.ppg_dir, cfg.dataset.loudness_dir
    ]:
        # exist_ok=True tolerates an existing directory but still raises
        # FileExistsError when the path exists and is not a directory, so the
        # check survives ``python -O`` (unlike an assert) and has no
        # exists()/mkdir() race.
        Path(out_dir_path).mkdir(parents=True, exist_ok=True)

    # preprocess dataset and loader
    filepathes = get_datafolder_files(cfg.dataset.wav_dir)
    dataset = PreprocessDataset(filepathes, cfg.data)
    dataloader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            drop_last=False,
                            num_workers=cfg.train.num_workers)
    spk_embs = dict()

    # Loop-invariant: hop_length * 1000 / sample_rate, passed to get_f0 as
    # hop_ms (presumably hop_length is in samples, giving milliseconds).
    hop_ms = cfg.data.hop_length * 1000 / cfg.data.sample_rate

    # feature extraction
    # The context manager closes the progress bar even if extraction fails.
    with tqdm.tqdm(total=len(dataset), ncols=0) as pbar:
        for wav, filename in dataloader:
            # batch size is 1, so unwrap the single item of the batch
            wav = denoise(wav[0].numpy(), device=device)
            filename = Path(filename[0])

            with torch.no_grad():
                ppg = torch.from_numpy(
                    get_ppg(wav, cfg.data.sample_rate, device=device))
                f0 = torch.from_numpy(
                    get_f0(wav,
                           cfg.data.sample_rate,
                           hop_ms=hop_ms,
                           f_min=cfg.data.f_min,
                           f_max=cfg.data.f_max))
                loudness = torch.from_numpy(
                    get_loudness(wav,
                                 cfg.data.sample_rate,
                                 n_fft=cfg.data.n_fft,
                                 hop_length=cfg.data.hop_length,
                                 win_length=cfg.data.win_length))
                spk_emb = torch.from_numpy(
                    get_speaker_embed(wav, cfg.data.sample_rate,
                                      device=device)).cpu()

            # NOTE(review): os.path.join discards the directory argument when
            # the second component is absolute - presumably the dataset yields
            # relative file names; verify against PreprocessDataset.
            pt_name = filename.with_suffix('.pt')
            torch.save(ppg, os.path.join(cfg.dataset.ppg_dir, pt_name))
            torch.save(f0, os.path.join(cfg.dataset.f0_dir, pt_name))
            torch.save(loudness,
                       os.path.join(cfg.dataset.loudness_dir, pt_name))
            spk_embs[filename.stem] = spk_emb

            pbar.update(dataloader.batch_size)

    torch.save(spk_embs, cfg.dataset.spk_embs_file)

    # generation of train and test files
    train_list = define_train_list(filepathes)
    train_list, test_list = train_test_split(train_list)
    save_dataset_filelist(train_list, cfg.dataset.train_list)
    save_dataset_filelist(test_list, cfg.dataset.test_list)
Beispiel #3
0
def ppg(example_wav, hop_ms):
    """Compute a PPG for the example waveform with the wav2vec2 backend.

    Args:
        example_wav: Mapping that provides the waveform under ``'wav'`` and
            its sample rate under ``'sr'``.
        hop_ms: Not used by this function; kept as part of the signature.

    Returns:
        Whatever ``features.get_ppg`` returns for the given input.
    """
    # Imported lazily so the dependency is only loaded when this is called.
    from features import get_ppg

    waveform = example_wav['wav']
    sample_rate = example_wav['sr']
    return get_ppg(waveform, sample_rate, backend='wav2vec2')