Beispiel #1
0
def convert(cfg):
    """Run voice conversion for every entry of the synthesis list.

    The speaker inventory is read from ``speakers.json`` under
    ``datasets/<cfg.dataset.path>`` and sorted; the position of a speaker id
    in that sorted list is the index handed to the decoder. Encoder and
    decoder weights are restored from ``cfg.checkpoint``. Each entry of the
    JSON list at ``cfg.synthesis_list`` is a
    ``(wav_path, speaker_id, out_filename)`` triple; the decoder output for
    it is written to ``<cfg.out_dir>/<out_filename>.wav``.

    Args:
        cfg: configuration object providing ``dataset.path``,
            ``synthesis_list``, ``in_dir``, ``out_dir``, ``checkpoint``,
            ``model.encoder``, ``model.decoder`` and ``preprocessing``.
    """
    to_abs = utils.to_absolute_path

    speakers_file = Path(to_abs("datasets")) / cfg.dataset.path / "speakers.json"
    with open(speakers_file) as handle:
        speakers = sorted(json.load(handle))

    with open(Path(to_abs(cfg.synthesis_list))) as handle:
        synthesis_list = json.load(handle)

    in_dir = Path(to_abs(cfg.in_dir))
    out_dir = Path(to_abs(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    def keep_storage(storage, location):
        # Leave every tensor where torch.load deserialised it;
        # load_state_dict copies the values into the models afterwards.
        return storage

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    state = torch.load(to_abs(cfg.checkpoint), map_location=keep_storage)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    encoder.eval()
    decoder.eval()

    for rel_path, target_speaker, out_name in tqdm(synthesis_list):
        prep = cfg.preprocessing

        source_file = (in_dir / rel_path).with_suffix(".wav")
        samples, _ = librosa.load(source_file, sr=prep.sr)
        # Peak-normalise to just below full scale.
        samples = samples / np.abs(samples).max() * 0.999

        spectrogram = librosa.feature.melspectrogram(
            preemphasis(samples, prep.preemph),
            sr=prep.sr,
            n_fft=prep.n_fft,
            n_mels=prep.n_mels,
            hop_length=prep.hop_length,
            win_length=prep.win_length,
            fmin=prep.fmin,
            power=1)
        log_mel = librosa.amplitude_to_db(spectrogram, top_db=prep.top_db)
        # Rescale the dB values by the top_db range and shift by one.
        log_mel = log_mel / prep.top_db + 1

        mel_batch = torch.FloatTensor(log_mel).unsqueeze(0).to(device)
        speaker_index = torch.LongTensor(
            [speakers.index(target_speaker)]).to(device)

        with torch.no_grad():
            z, _ = encoder.encode(mel_batch)
            output = decoder.generate(z, speaker_index)

        target_file = (out_dir / out_name).with_suffix(".wav")
        librosa.output.write_wav(
            target_file, output.astype(np.float32), sr=prep.sr)
def encode_dataset(cfg):
    """Encode the utterances listed in ``test.json`` and save them as text.

    For every entry of ``datasets/<cfg.dataset.path>/test.json`` the matching
    ``.mel.npy`` file is loaded, passed through ``encoder.encode`` and the
    first returned value is written to ``<cfg.out_dir>/<stem>.txt`` with
    ``np.savetxt``.

    When ``cfg.save_auxiliary`` is truthy two more files are written per
    utterance, into directories that are siblings of ``cfg.out_dir``:

    * ``auxiliary_embedding1`` - the second value returned by
      ``encoder.encode``;
    * ``auxiliary_embedding2`` - the output of ``encoder.encoder[-1]``,
      captured with a forward hook.

    Args:
        cfg: configuration object providing ``out_dir``, ``dataset.path``,
            ``checkpoint``, ``model.encoder`` and ``save_auxiliary``.
    """
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(root_path / "test.json") as handle:
        metadata = json.load(handle)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    encoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    state = torch.load(
        utils.to_absolute_path(cfg.checkpoint),
        map_location=lambda storage, loc: storage)
    encoder.load_state_dict(state["encoder"])

    encoder.eval()

    if cfg.save_auxiliary:
        captured = []

        def remember_output(_module, _inputs, result):
            # Clone so the stored tensor is independent of later forward passes.
            captured.append(result.clone())

        encoder.encoder[-1].register_forward_hook(remember_output)

    def dump(directory, stem, tensor):
        """Write ``tensor`` (squeezed, on CPU) to ``<directory>/<stem>.txt``."""
        values = tensor.squeeze().cpu().numpy()
        target = (directory / stem).with_suffix(".txt")
        with open(target, "w") as handle:
            np.savetxt(handle, values, fmt="%.16f")

    for entry in tqdm(metadata):
        # Only the fourth field of each metadata record is used here.
        _, _, _, rel_path = entry
        utterance = root_path.parent / rel_path

        features = np.load(utterance.with_suffix(".mel.npy"))
        mel = torch.from_numpy(features).unsqueeze(0).to(device)
        with torch.no_grad():
            z, c, _ = encoder.encode(mel)

        dump(out_dir, utterance.stem, z)

        if cfg.save_auxiliary:
            first_dir = out_dir.parent / "auxiliary_embedding1"
            first_dir.mkdir(exist_ok=True, parents=True)
            dump(first_dir, utterance.stem, c)

            second_dir = out_dir.parent / "auxiliary_embedding2"
            second_dir.mkdir(exist_ok=True, parents=True)
            # The hook stored exactly one tensor for the encode call above.
            dump(second_dir, utterance.stem, captured.pop())
Beispiel #3
0
def convert(cfg):
    """Convert each listed utterance to a target speaker, matching loudness.

    Works through the JSON list at ``cfg.synthesis_list``; every entry is a
    ``(wav_path, speaker_id, out_filename)`` triple (the original author's
    note gives ``"english/test/S002_0379088085", "V002",
    "V002_0379088085"`` as an example). The source waveform is turned into a
    log-mel spectrogram, encoded, and decoded for the requested speaker. The
    result is then loudness-normalised to the integrated loudness measured on
    the source file and written to ``<cfg.out_dir>/<out_filename>.wav``.

    Args:
        cfg: configuration object providing ``dataset.path``,
            ``synthesis_list``, ``in_dir``, ``out_dir``, ``checkpoint``,
            ``model.encoder``, ``model.decoder`` and ``preprocessing``.
    """
    to_abs = utils.to_absolute_path

    # speakers.json holds the speaker names; the sorted position of a name is
    # the index passed to the decoder.
    speakers_file = Path(to_abs("datasets")) / cfg.dataset.path / "speakers.json"
    with open(speakers_file) as handle:
        speakers = sorted(json.load(handle))

    with open(Path(to_abs(cfg.synthesis_list))) as handle:
        synthesis_list = json.load(handle)

    in_dir = Path(to_abs(cfg.in_dir))
    # Directory that receives the converted recordings.
    out_dir = Path(to_abs(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    def keep_storage(storage, location):
        # Leave every tensor where torch.load deserialised it.
        return storage

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    # Restore the weights stored in the checkpoint.
    state = torch.load(to_abs(cfg.checkpoint), map_location=keep_storage)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    encoder.eval()
    decoder.eval()

    # Loudness meter configured for the preprocessing sample rate.
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    for rel_path, target_speaker, out_name in tqdm(synthesis_list):
        prep = cfg.preprocessing

        source_file = (in_dir / rel_path).with_suffix(".wav")
        samples, _ = librosa.load(source_file, sr=prep.sr)
        # Measure the source loudness before the peak normalisation below.
        source_loudness = meter.integrated_loudness(samples)
        samples = samples / np.abs(samples).max() * 0.999

        spectrogram = librosa.feature.melspectrogram(
            preemphasis(samples, prep.preemph),
            sr=prep.sr,
            n_fft=prep.n_fft,
            n_mels=prep.n_mels,
            hop_length=prep.hop_length,
            win_length=prep.win_length,
            fmin=prep.fmin,
            power=1)
        log_mel = librosa.amplitude_to_db(spectrogram, top_db=prep.top_db)
        log_mel = log_mel / prep.top_db + 1

        # unsqueeze(0) inserts a new leading dimension. The spectrogram is a
        # float tensor, the speaker index an integer (long) tensor.
        mel_batch = torch.FloatTensor(log_mel).unsqueeze(0).to(device)
        speaker_index = torch.LongTensor(
            [speakers.index(target_speaker)]).to(device)

        # Inference only: do not record operations for autograd.
        with torch.no_grad():
            z, _ = encoder.encode(mel_batch)
            output = decoder.generate(z, speaker_index)

        # Bring the generated audio to the loudness measured on the source.
        generated_loudness = meter.integrated_loudness(output)
        output = pyloudnorm.normalize.loudness(
            output, generated_loudness, source_loudness)

        target_file = (out_dir / out_name).with_suffix(".wav")
        librosa.output.write_wav(
            target_file, output.astype(np.float32), sr=prep.sr)
Beispiel #4
0
def _ddf_log_mel(wav, preprocessing):
    """Return the rescaled log-mel spectrogram of a peak-normalised waveform."""
    mel = librosa.feature.melspectrogram(
        preemphasis(wav, preprocessing.preemph),
        sr=preprocessing.sr,
        n_fft=preprocessing.n_fft,
        n_mels=preprocessing.n_mels,
        hop_length=preprocessing.hop_length,
        win_length=preprocessing.win_length,
        fmin=preprocessing.fmin,
        power=1)
    logmel = librosa.amplitude_to_db(mel, top_db=preprocessing.top_db)
    return logmel / preprocessing.top_db + 1


def DDF(cfg):
    """Filter the recordings in ``cfg.filter_list`` per the privacy preference.

    Each entry of the JSON list at ``cfg.filter_list`` is a
    ``(wav_path, speaker_id, out_filename)`` triple. What is written to
    ``cfg.out_dir`` depends on ``cfg.privacy_preference`` and
    ``cfg.output_type``:

    * ``"Low"`` - no model is applied.
      ``"Recording"`` writes the peak-normalised waveform as ``.wav``;
      ``"Embedding"`` writes its log-mel spectrogram as ``.mel.txt``.
    * ``"Moderate"`` - the recording is encoded.
      ``"Recording"`` writes the decoder output for ``speaker_id``,
      loudness-matched to the input, as ``.wav``;
      ``"Embedding"`` writes the encoder output as ``.vq.txt`` and the
      decoder's speaker embedding as ``.speaker.txt``.
    * ``"High"`` - same as ``"Moderate"`` except that ``"Embedding"`` writes
      only ``.vq.txt``.

    Any other preference or output type writes nothing.

    Changes relative to the previous implementation:

    * tensors are moved to the CPU before ``.numpy()``; the earlier
      ``.to(device).numpy()`` fails whenever ``device`` is a CUDA device;
    * the reference loudness is measured only where it is consumed
      ("Recording" output of the Moderate/High branches).

    Args:
        cfg: configuration object providing ``filter_list``, ``in_dir``,
            ``out_dir``, ``checkpoint``, ``model.encoder``,
            ``model.decoder``, ``preprocessing``, ``privacy_preference``,
            ``output_type`` and (Moderate/High only) ``dataset.path``.
    """
    filter_list_path = Path(utils.to_absolute_path(cfg.filter_list))
    with open(filter_list_path) as file:
        filter_list = json.load(file)
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    preference = cfg.privacy_preference

    # ------------------------------------------------------------------ Low
    if preference == "Low":
        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            prep = cfg.preprocessing
            # librosa.load returns the audio time series and its sample rate.
            wav, _ = librosa.load((in_dir / wav_path).with_suffix(".wav"),
                                  sr=prep.sr)
            wav = wav / np.abs(wav).max() * 0.999
            path = out_dir / out_filename

            if cfg.output_type == "Embedding":
                # Unfiltered recording as a float32 log-mel spectrogram. The
                # data never needs to leave the CPU.
                mel = torch.FloatTensor(_ddf_log_mel(wav, prep))
                np.savetxt(path.with_suffix(".mel.txt"),
                           mel.squeeze().numpy())

            if cfg.output_type == "Recording":
                # Unfiltered recording as a waveform.
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         wav.astype(np.float32),
                                         sr=prep.sr)

    # ------------------------------------------------------ Moderate / High
    elif preference in ("Moderate", "High"):
        dataset_path = Path(
            utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        # Only the "Moderate" level also exports the speaker embedding.
        export_speaker = preference == "Moderate"

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            prep = cfg.preprocessing
            output_type = cfg.output_type

            wav, _ = librosa.load((in_dir / wav_path).with_suffix(".wav"),
                                  sr=prep.sr)
            if output_type == "Recording":
                # Must be measured on the raw signal, i.e. before the peak
                # normalisation below.
                ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = torch.FloatTensor(
                _ddf_log_mel(wav, prep)).unsqueeze(0).to(device)
            speaker = torch.LongTensor(
                [speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness,
                                                       ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=prep.sr)

            elif output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    if export_speaker:
                        speaker_embedding = decoder.speaker(speaker)

                # .cpu() is required: .numpy() is not available on CUDA
                # tensors.
                vq = vq.squeeze().cpu().numpy()
                if export_speaker:
                    speaker_embedding = (
                        speaker_embedding.squeeze().cpu().numpy())

                np.savetxt(path.with_suffix(".vq.txt"), vq)
                if export_speaker:
                    np.savetxt(path.with_suffix(".speaker.txt"),
                               speaker_embedding)
Beispiel #5
0
def train_model(cfg):
    """Train the vocoder on code indices produced by a pre-trained encoder.

    Only the vocoder's parameters are handed to the optimizer; the encoder
    is restored from ``cfg.cpc_checkpoint``, put in eval mode and called
    under ``torch.no_grad()``. Training uses apex-style ``amp`` mixed
    precision (``opt_level="O1"``), gradient-norm clipping at 1 and a
    ``MultiStepLR`` schedule that is stepped once per batch. The mean loss
    of each epoch is logged to TensorBoard and printed, and a checkpoint is
    saved every ``cfg.training.checkpoint_interval`` steps.

    Args:
        cfg: configuration object providing ``checkpoint_dir``, ``resume``,
            ``cpc_checkpoint``, ``dataset.path``, ``model.encoder``,
            ``model.vocoder``, ``preprocessing`` and ``training``.
    """
    to_abs = utils.to_absolute_path

    tensorboard_path = Path(to_abs("tensorboard")) / cfg.checkpoint_dir
    checkpoint_dir = Path(to_abs(cfg.checkpoint_dir))
    writer = SummaryWriter(tensorboard_path)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    vocoder = Vocoder(**cfg.model.vocoder)
    encoder.to(device)
    vocoder.to(device)

    optimizer = optim.Adam(vocoder.parameters(), lr=cfg.training.optimizer.lr)
    # amp must wrap the model and optimizer before the scheduler is built on
    # top of the optimizer.
    vocoder, optimizer = amp.initialize(vocoder, optimizer, opt_level="O1")
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg.training.scheduler.milestones,
        gamma=cfg.training.scheduler.gamma)

    def keep_storage(storage, location):
        # Leave every tensor where torch.load deserialised it.
        return storage

    global_step = 0
    if cfg.resume:
        print("Resume checkpoint from: {}:".format(cfg.resume))
        resume_state = torch.load(to_abs(cfg.resume), map_location=keep_storage)
        vocoder.load_state_dict(resume_state["vocoder"])
        optimizer.load_state_dict(resume_state["optimizer"])
        amp.load_state_dict(resume_state["amp"])
        scheduler.load_state_dict(resume_state["scheduler"])
        global_step = resume_state["step"]

    print("Resume cpc encoder from: {}:".format(cfg.cpc_checkpoint))
    encoder_state = torch.load(
        to_abs(cfg.cpc_checkpoint), map_location=keep_storage)
    encoder.load_state_dict(encoder_state["encoder"])
    encoder.eval()

    dataset = WavDataset(
        root=Path(to_abs("datasets")) / cfg.dataset.path,
        hop_length=cfg.preprocessing.hop_length,
        sr=cfg.preprocessing.sr,
        sample_frames=cfg.training.sample_frames)

    dataloader = DataLoader(
        dataset,
        batch_size=cfg.training.batch_size,
        shuffle=True,
        num_workers=cfg.training.n_workers,
        pin_memory=True,
        drop_last=True)

    steps_per_epoch = len(dataloader)
    n_epochs = cfg.training.n_steps // steps_per_epoch + 1
    start_epoch = global_step // steps_per_epoch + 1

    for epoch in range(start_epoch, n_epochs + 1):
        epoch_loss = 0

        for batch_no, batch in enumerate(tqdm(dataloader), 1):
            audio, mels, speakers = batch
            audio = audio.to(device)
            mels = mels.to(device)
            speakers = speakers.to(device)

            optimizer.zero_grad()

            # The encoder only supplies code indices; no gradients flow into it.
            with torch.no_grad():
                _, _, indices = encoder.encode(mels)

            # Next-sample prediction: inputs drop the last sample, targets
            # drop the first.
            logits = vocoder(audio[:, :-1], indices, speakers)
            targets = audio[:, 1:]
            loss = F.cross_entropy(logits.transpose(1, 2), targets)

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1)
            optimizer.step()
            scheduler.step()

            # Incremental mean of the batch losses seen so far this epoch.
            epoch_loss += (loss.item() - epoch_loss) / batch_no

            global_step += 1

            if global_step % cfg.training.checkpoint_interval == 0:
                save_checkpoint(
                    vocoder, optimizer, amp, scheduler, global_step,
                    checkpoint_dir)

        writer.add_scalar("loss/train", epoch_loss, global_step)

        print("epoch:{}, loss:{:.3E}".format(epoch, epoch_loss))
Beispiel #6
0
def convert():
    """Run voice conversion using fixed paths relative to the working directory.

    Inputs (all resolved against the current working directory):

    * ``./cfg/speakers.json`` - speaker names; the sorted position of a name
      is the index passed to the decoder.
    * ``./cfg/cfg.json`` - model and preprocessing parameters (``encoder``,
      ``decoder`` and ``preprocess`` sections).
    * ``./dataset/english/synthesis.txt`` - JSON list of
      ``(wav_path, speaker_id, out_filename)`` triples.
    * ``./dataset/english`` - root directory of the source recordings.
    * ``./checkpoint/model.pt`` - encoder and decoder weights.

    The converted recordings are written to ``./output/<out_filename>.wav``.
    """
    cfg_dir = Path('./cfg').absolute()
    with open(cfg_dir / "speakers.json") as handle:
        speakers = sorted(json.load(handle))
    with open(Path("./cfg/cfg.json").absolute()) as handle:
        para = json.load(handle)

    with open(Path('./dataset/english/synthesis.txt').absolute()) as handle:
        synthesis_list = json.load(handle)
    in_dir = Path('./dataset/english').absolute()
    out_dir = Path('./output').absolute()
    out_dir.mkdir(exist_ok=True, parents=True)
    print(synthesis_list)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Forward exactly these entries of the JSON config as keyword arguments.
    encoder_keys = (
        'in_channels', 'channels', 'n_embeddings', 'embedding_dim', 'jitter')
    decoder_keys = (
        'in_channels', 'conditioning_channels', 'n_speakers',
        'speaker_embedding_dim', 'mu_embedding_dim', 'rnn_channels',
        'fc_channels', 'bits', 'hop_length')
    encoder = Encoder(**{key: para['encoder'][key] for key in encoder_keys})
    decoder = Decoder(**{key: para['decoder'][key] for key in decoder_keys})
    encoder.to(device)
    decoder.to(device)

    def keep_storage(storage, location):
        # Leave every tensor where torch.load deserialised it.
        return storage

    print("Load checkpoint from: {}:".format('./checkpoint/model.pt'))
    state = torch.load(
        Path('./checkpoint/model.pt').absolute(), map_location=keep_storage)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    encoder.eval()
    decoder.eval()

    print('load finish')
    for rel_path, target_speaker, out_name in tqdm(synthesis_list):
        prep = para['preprocess']

        source_file = (in_dir / rel_path).with_suffix(".wav")
        samples, _ = librosa.load(source_file, sr=prep['sr'])
        # Peak-normalise to just below full scale.
        samples = samples / np.abs(samples).max() * 0.999

        spectrogram = librosa.feature.melspectrogram(
            preemphasis(samples, prep['preemph']),
            sr=prep['sr'],
            n_fft=prep['n_fft'],
            n_mels=prep['n_mels'],
            hop_length=prep['hop_length'],
            win_length=prep['win_length'],
            fmin=prep['fmin'],
            power=1)
        log_mel = librosa.amplitude_to_db(spectrogram, top_db=prep['top_db'])
        log_mel = log_mel / prep['top_db'] + 1

        mel_batch = torch.FloatTensor(log_mel).unsqueeze(0).to(device)
        speaker_index = torch.LongTensor(
            [speakers.index(target_speaker)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel_batch)
            output = decoder.generate(z, speaker_index)

        target_file = (out_dir / out_name).with_suffix(".wav")
        librosa.output.write_wav(
            target_file, output.astype(np.float32), sr=prep['sr'])