def run_inference_on_tier(source, tier, text, timestep):
  # Returns a tuple (next_tier, inference)
  # inference is the conditional inference on the current tier
  # next_tier interleaves that inference with the input to form the source for the next tier
  args = parse_inference_args(['-c', 'config/blizzard_compressed_experiments.yaml', '-p', 'config/inference.yaml', '-t', str(timestep), '-n', 'hw_blizzard_compressed', '-i', text])
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  inference_hp = HParam(args.infer_config)

  assert timestep % t_div[hp.model.tier] == 0, \
      "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

  model = MelNet(hp, args, inference_hp).cuda()
  model.load_tiers()
  model.eval()
  audio_lengths = torch.LongTensor([0]).cuda()
  if tier > 1:
    for t in tqdm(range(model.args.timestep // model.t_div)):
      audio_lengths += 1

  # source = breakdown[tier][0]
  x = torch.unsqueeze(torch.from_numpy(source), 0)
  mu, std, pi = model.tiers[tier](x, audio_lengths)
  temp = sample_gmm(mu, std, pi)
  next_tier = model.tierutil.interleave(x, temp, tier + 1)
  return next_tier[0].cpu().detach().numpy(), temp[0].cpu().detach().numpy()
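
Every example in this listing constructs an HParam object from a YAML config and reads nested keys as attributes (e.g. hp.audio.hop_length, hp.model.tier). Below is a minimal sketch of that helper, assuming only PyYAML; it is not the exact library code (the real utils.hparams module also provides load_hparam_str, used in several examples to restore the config stored in a checkpoint's 'hp_str' field).

import yaml

class Dotdict(dict):
    # A dict whose keys are also readable as attributes, applied recursively (illustrative sketch).
    def __getattr__(self, key):
        value = self[key]
        return Dotdict(value) if isinstance(value, dict) else value

class HParam(Dotdict):
    # Load a YAML file and expose it with dot access, e.g. HParam('config.yaml').audio.sr
    def __init__(self, yaml_path):
        with open(yaml_path, 'r') as f:
            super().__init__(yaml.safe_load(f))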
Example #2
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval()

    with torch.no_grad():
        for melpath in tqdm.tqdm(glob.glob(os.path.join(args.input_folder, '*.mel'))):
            mel = torch.load(melpath)
            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            # pad input mel with zeros to cut artifact
            # see https://github.com/seungwonpark/melgan/issues/8
            zero = torch.full((1, hp.audio.n_mel_channels, 10), -11.5129).cuda()
            mel = torch.cat((mel, zero), axis=2)

            audio = model(mel)
            audio = audio.squeeze() # collapse all dimensions except the time axis
            audio = audio[:-(hp.audio.hop_length*10)]
            audio = MAX_WAV_VALUE * audio
            audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE)
            audio = audio.short()
            audio = audio.cpu().detach().numpy()

            out_path = melpath.replace('.mel', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
            write(out_path, hp.audio.sampling_rate, audio)
Example #3
def get_checkpoint_loss(chkpt_path):
    config_path = 'config/blizzard.yaml'
    tier = int(
        chkpt_path.split('tier')[1].split('_')[0])  # Yes it's hacky, sorry
    print('tier: %d' % tier)
    checkpoint = torch.load(chkpt_path)
    print("Checkpoint loaded")
    hp = HParam(config_path)
    with open(config_path, 'r') as f:
        model_hp = checkpoint['hp_str']
        hp_str = ''.join(f.readlines())
        if model_hp != hp_str:
            print(model_hp)
            print('')
            print(hp_str)
            print("ERROR: ISSUE WITH DIFFERENT HPs")
    model = get_model(tier, hp)
    print("Got model")
    model.load_state_dict(checkpoint['model'])
    print("Model loaded")
    optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam.lr)
    print("Got optimizer")
    args = get_args(tier)
    print("Got args")
    testloader = get_testloader(hp, args)
    print("Got testloader")
    loss = compute_loss(args, model, testloader, criterion)
    print("Got loss")
    return loss
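
A hedged usage sketch for the loss helper above; the checkpoint directory and filename pattern are hypothetical, but follow the 'tier<N>_' naming that the parsing of chkpt_path expects.

import glob

for chkpt_path in sorted(glob.glob('chkpt/blizzard/*_tier*_*.pt')):  # hypothetical layout
    loss = get_checkpoint_loss(chkpt_path)
    print(chkpt_path, loss)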
Example #4
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = ModifiedGenerator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                        ratios=hp.model.generator_ratio, mult = hp.model.mult,
                        out_band = hp.model.out_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        audio = model.inference(mel)

        audio = audio.squeeze(0)  # drop the batch dimension
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.01)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length*10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE-1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

        out_path = args.input.replace('.npy', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
        write(out_path, hp.audio.sampling_rate, audio)
Example #5
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        for melpath in tqdm.tqdm(
                glob.glob(os.path.join(args.input_folder, '*.mel'))):
            mel = torch.load(melpath)
            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()

            out_path = melpath.replace(
                '.mel', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
            write(out_path, hp.audio.sampling_rate, audio)
Example #6
def main(cmd_args):

    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)

    args = parser.parse_args(cmd_args)

    hp = HParam(args.config)

    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = fs2.FeedForwardTransformer(idim, odim, hp)
    my_script_module = torch.jit.script(model)
    print("Scripting")
    my_script_module.save("{}/{}.pt".format(args.outdir, args.name))
    print("Script done")
    if args.trace:
        print("Tracing")
        model.eval()
        with torch.no_grad():
            my_trace_module = torch.jit.trace(
                model,
                torch.ones(50).to(dtype=torch.int64))
        my_trace_module.save("{}/trace_{}.pt".format(args.outdir, args.name))
        print("Trace Done")
Example #7
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()

    model.load_state_dict(checkpoint['model_g'])
    model.eval()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        audio = model(mel)
        # For multi-band inference
        print(audio.shape)
        audio = audio.squeeze(0)  # drop the batch dimension
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.1)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length * 10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

        out_path = args.input.replace(
            '.npy', '_hifi_GAN_epoch%04d.wav' % checkpoint['epoch'])
        write(out_path, hp.audio.sampling_rate, audio)
Example #8
def load_testset():
  # args = parse_train_args(['-c', './config/blizzard_compressed_experiments.yaml', '-n', 'blizzard_compressed_validation', '-t', str(tier), '-b', '1', '-s', 'TTS'])
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  
  dataset = []
  raw_data = None
  with open(os.path.join(hp.data.path, 'prompts.gui'), 'r') as f:
    lines = f.read().splitlines()
    filenames = lines[::3]
    sentences = lines[1::3]
    raw_data = list(zip(filenames, sentences))
  random.seed(123)
  random.shuffle(raw_data)
  raw_data = raw_data[int(0.95 * len(raw_data)):]
  
  for filename, sentence in tqdm(raw_data, total=len(raw_data)):
      wav_path = os.path.join(hp.data.path, 'wavn', filename + '.wav')
      length = get_length(wav_path, hp.audio.sr)
      if length < hp.audio.duration:
          dataset.append((wav_path, sentence))

  for i in range(len(dataset)):
    text = dataset[i][1]
    wav = read_wav_np(dataset[i][0], sample_rate=hp.audio.sr)
    filename = os.path.basename(dataset[i][0])
    yield filename, text, wav
Example #9
def reconstruct_audio(filename, tier_to_breakdown):
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    melgen = MelGen(hp)
    tierutil = TierUtil(hp)
    final_reconstruction = None

    # Verify that tier 2 is conditionally generated from just tier 1
    assert (tier_to_breakdown[2][0] == tier_to_breakdown[1][1]
            ).all(), "Tier 2 not created from Tier 1"

    for tier in range(2, 7):
        source = tier_to_breakdown[tier][0]
        target = tier_to_breakdown[tier][1]

        source_tensor = torch.unsqueeze(torch.from_numpy(source), 0)
        target_tensor = torch.unsqueeze(torch.from_numpy(target), 0)
        reconstructed_mel_tensor = tierutil.interleave(source_tensor,
                                                       target_tensor, tier + 1)
        reconstructed_mel = reconstructed_mel_tensor.numpy()[0]

        # Verify that interleaving the source and target of the current tier conditionally generates the source of the next tier
        if tier < 6:
            next_tier = tier_to_breakdown[tier + 1][0]
            assert (reconstructed_mel == next_tier).all(
            ), "Tier %d not created from Tier %d" % (tier + 1, tier)
        else:
            final_reconstruction = reconstructed_mel
    print('reconstructing audio...')
    reconstructed_audio = melgen.reconstruct_audio(final_reconstruction)
    melgen.save_audio('reconstructed_' + filename, reconstructed_audio)
Example #10
def main(cmd_args):
    """Run training."""
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)
    args = parser.parse_args(cmd_args)

    if os.path.exists(args.checkpoint_path):
        checkpoint = torch.load(args.checkpoint_path)
    else:
        print("Checkpoint not exixts")
        return None

    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint["hp_str"])

    validloader = loader.get_tts_dataset(hp.data.data_dir, 1, hp, True)
    print("Checkpoint : ", args.checkpoint_path)

    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = FeedForwardTransformer(idim, odim, hp)
    # os.makedirs(args.out, exist_ok=True)
    checkpoint = torch.load(args.checkpoint_path)
    model.load_state_dict(checkpoint["model"])

    evaluate(hp, validloader, model)
Example #11
def get_timestep(wav):
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  hop_length = hp.audio.hop_length
  frames = len(wav)
  timestep_goal = float(frames) / float(hop_length)
  final_timestep = 4
  while final_timestep < timestep_goal:
    final_timestep += 4
  return final_timestep-4
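
For context, an illustrative pairing with the MelNet helpers used elsewhere in this listing; the wav filename is hypothetical, read_wav_np is the loader used by load_testset and get_audio, and inference is the function shown in Example #12.

hp = HParam('./config/blizzard_compressed_experiments.yaml')
wav = read_wav_np('wavn/sample.wav', sample_rate=hp.audio.sr)  # hypothetical file
timestep = get_timestep(wav)  # snap the frame count down to a multiple of 4 so the tiers divide evenly
breakdown, melspec = inference('An example sentence.', timestep=timestep)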
Example #12
def inference(text, timestep=64):
  args = parse_inference_args(['-c', 'config/blizzard_compressed_experiments.yaml', '-p', 'config/inference.yaml', '-t', str(timestep), '-n', 'hw_blizzard_compressed', '-i', text])
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  inference_hp = HParam(args.infer_config)

  assert timestep % t_div[hp.model.tier] == 0, \
      "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], timestep)

  model = MelNet(hp, args, inference_hp).cuda()
  model.load_tiers()
  model.eval()

  with torch.no_grad():
      # generated = model.sample(args.input)
      breakdown, generated = sample_model_with_breakdown(model, args.input)

  melspec = generated[0].cpu().detach().numpy()
  return breakdown, melspec
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        required=True,
                        help="yaml file for config.")
    parser.add_argument('-p',
                        '--checkpoint_path',
                        type=str,
                        default=None,
                        help="path of checkpoint pt file for resuming")
    parser.add_argument(
        '-n',
        '--name',
        type=str,
        required=True,
        help="Name of the model. Used for both logging and saving chkpt.")
    args = parser.parse_args()

    hp = HParam(args.config)
    hp_str = yaml.dump(hp)
    args_str = yaml.dump(vars(args))

    pt_dir = os.path.join(hp.log.chkpt_dir, args.name)
    log_dir = os.path.join(hp.log.log_dir, args.name)
    os.makedirs(pt_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        handlers=[
                            logging.FileHandler(
                                os.path.join(
                                    log_dir,
                                    '%s-%d.log' % (args.name, time.time()))),
                            logging.StreamHandler()
                        ])
    logger = logging.getLogger()

    logger.info('Config by yaml file')
    logger.info(hp_str)
    logger.info('Command Line Config')
    logger.info(args_str)

    if hp.data.train == '' or hp.data.test == '':
        logger.error("train or test data directory cannot be empty.")
        raise Exception("Please specify directories of data in %s" %
                        args.config)

    writer = Writer(hp, log_dir)
    train_loader = create_dataloader(hp, args, DataloaderMode.train)
    test_loader = create_dataloader(hp, args, DataloaderMode.test)

    train(args, pt_dir, train_loader, test_loader, writer, logger, hp, hp_str)
Example #14
def get_audio():
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    file_list = glob.glob(os.path.join(hp.data.path, '**', hp.data.extension),
                          recursive=True)
    random.seed(123)
    random.shuffle(file_list)
    file_list = file_list[int(0.95 * len(file_list)):]
    for idx in range(len(file_list)):
        filename = os.path.basename(file_list[idx])
        wav = read_wav_np(file_list[idx], sample_rate=hp.audio.sr)
        yield filename, wav
Example #15
def deconstruct_audio(wav):
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  melgen = MelGen(hp)
  tierutil = TierUtil(hp)
  mel = melgen.get_normalized_mel(wav)
  tier_to_breakdown = {}
  for tier in range(1, 7):
    source, target = tierutil.cut_divide_tiers(mel, tier)
    print("Tier %d has source dims: %s, target dims %s" % (tier, source.shape, target.shape))
    tier_to_breakdown[tier] = (source, target)
  tier_to_breakdown[7] = (mel, mel)
  return tier_to_breakdown
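
A hedged round-trip check combining this with helpers from the other examples; get_audio and reconstruct_audio are the functions defined elsewhere in this listing, and one file is enough for a sanity check.

for filename, wav in get_audio():
    tier_to_breakdown = deconstruct_audio(wav)
    reconstruct_audio(filename, tier_to_breakdown)  # asserts tier-to-tier consistency, then writes audio
    break  # a single file is enough here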
Example #16
def run_inference(source, timestep, tier_to_breakdown):
    # First load in the model
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml', '-p', 'config/inference.yaml',
        '-t',
        str(timestep), '-n', 'test_tiers', '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()
    audio_lengths = torch.LongTensor([0]).cuda()
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
    ## Tier 2~N ##
    x = torch.unsqueeze(torch.from_numpy(source), 0)
    for tier in tqdm(
            range(model.hp.model.tier + 1 - TESTING_TIERS,
                  model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        # Save original source and inference source
        actual_source = tier_to_breakdown[tier][0]
        actual_target = tier_to_breakdown[tier][1]
        actual_interleaved = tier_to_breakdown[tier + 1][0]
        current_source = x
        save_image(x.detach().numpy()[0], 'tier_%d_inference_source' % tier)
        save_image(actual_source, 'tier_%d_actual_source' % tier)
        save_image(actual_target, 'tier_%d_actual_target' % tier)
        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        save_image(temp[0].cpu().detach().numpy(),
                   'tier_%d_inference_target' % tier)
        x = model.tierutil.interleave(x, temp, tier + 1)
        save_image(x.detach().numpy()[0],
                   'tier_%d_inference_interleaved' % tier)
        save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)
    reconstructed_mel_tensor = x.detach().numpy()
    return reconstructed_mel_tensor[0]
Example #17
def test_fastspeech():
    idim = len(valid_symbols)
    hp = HParam("configs/default.yaml")
    hp.train.ngpu = 0
    odim = hp.audio.num_mels
    model = FeedForwardTransformer(idim, odim, hp)
    x = torch.ones(2, 100).to(dtype=torch.int64)
    input_length = torch.tensor([100, 100])
    y = torch.ones(2, 100, 80)
    out_length = torch.tensor([100, 100])
    dur = torch.ones(2, 100)
    e = torch.ones(2, 100)
    p = torch.ones(2, 100)
    loss, report_dict = model(x, input_length, y, out_length, dur, e, p)
Example #18
def init(config, checkpoint_path, device="cuda"):
    checkpoint = torch.load(checkpoint_path)
    if config is not None:
        hp = HParam(config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels,
                      hp.model.n_residual_layers,
                      ratios=hp.model.generator_ratio,
                      mult=hp.model.mult,
                      out_band=hp.model.out_channels).to(device)
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)
    return hp, model
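
A hedged usage sketch for init; all paths are hypothetical, and model.inference is the same multi-band generator call the other VocGAN-style examples use.

import numpy as np
import torch

hp, model = init('config/default.yaml', 'chkpt/vocgan_0300.pt', device='cuda')  # hypothetical paths
with torch.no_grad():
    mel = torch.from_numpy(np.load('sample_mel.npy'))  # hypothetical mel dump
    if len(mel.shape) == 2:
        mel = mel.unsqueeze(0)
    audio = model.inference(mel.cuda())
# Trim, scale by MAX_WAV_VALUE and convert to int16 as in Example #4 before writing a wav file.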
Example #19
def main(args):
    args = {
        "config": 'config/config.yaml',
        "embedder_path": 'model/embedder.pt',
        "checkpoint_path": 'enhance_my_voice/chkpt_201000.pt',
        "mixed_file": 'utils/speakerA.wav',
        "reference_file": 'utils/speakerA.wav',
        "out_dir": 'output',
    }

    hp = HParam(args['config'])

    with torch.no_grad():
        model = VoiceFilter(hp).cuda()
        chkpt_model = torch.load(args['checkpoint_path'])['model']
        model.load_state_dict(chkpt_model)
        model.eval()

        embedder = SpeechEmbedder(hp).cuda()
        chkpt_embed = torch.load(args['embedder_path'])
        embedder.load_state_dict(chkpt_embed)
        embedder.eval()

        audio = Audio(hp)
        dvec_wav, _ = librosa.load(args['reference_file'], sr=16000)
        dvec_mel = audio.get_mel(dvec_wav)
        dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
        dvec = embedder(dvec_mel)
        dvec = dvec.unsqueeze(0)

        mixed_wav, _ = librosa.load(args['mixed_file'], sr=16000)
        mag, phase = audio.wav2spec(mixed_wav)
        mag = torch.from_numpy(mag).float().cuda()

        mag = mag.unsqueeze(0)
        mask = model(mag, dvec)
        est_mag = mag * mask

        est_mag = est_mag[0].cpu().detach().numpy()
        # est_wav = audio.spec2wav(est_mag, phase)

        # os.makedirs(args['out_dir'], exist_ok=True)
        # out_path = os.path.join(args['out_dir'], 'result.wav')
        # librosa.output.write_wav(out_path, est_wav, sr=16000)
        return audio.spec2wav(est_mag, phase)
Example #20
def main(cmd_args):
    """Run training."""
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)

    args = parser.parse_args(cmd_args)

    hp = HParam(args.config)
    with open(args.config, "r") as f:
        hp_str = "".join(f.readlines())

    # logging info
    os.makedirs(hp.train.log_dir, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(
                os.path.join(hp.train.log_dir,
                             "%s-%d.log" % (args.name, time.time()))),
            logging.StreamHandler(),
        ],
    )
    logger = logging.getLogger()

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    ngpu = hp.train.ngpu
    logger.info(f"ngpu: {ngpu}")

    # set random seed
    logger.info("random seed = %d" % hp.train.seed)
    random.seed(hp.train.seed)
    np.random.seed(hp.train.seed)

    vocoder = torch.hub.load("seungwonpark/melgan",
                             "melgan")  # load the vocoder for validation

    if hp.train.GTA:
        create_gta(args, hp, hp_str, logger)
    else:
        train(args, hp, hp_str, logger, vocoder)
Example #21
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = ModifiedGenerator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                        ratios=hp.model.generator_ratio, mult = hp.model.mult,
                        out_band = hp.model.out_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        zero = torch.full((1, 80, 10), -11.5129).to(mel.device)
        mel = torch.cat((mel, zero), dim=2)
        vocgan_trace = torch.jit.trace(model, mel)
        vocgan_trace.save("{}/vocgan_ex_female_en_{}_{}.pt".format(args.out, checkpoint['githash'], checkpoint['epoch']))
Example #22
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()

    model.load_state_dict(checkpoint['model_g'])
    model.eval()
    #model.remove_weight_norm()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()
        #zero = torch.full((1, 80, 10), -11.5129).to(mel.device)
        #mel = torch.cat((mel, zero), dim=2)
        hifigan_trace = torch.jit.trace(model, mel)
        #print(state_dict_g.keys())
        hifigan_trace.save("{}/hifigan_{}.pt".format(args.out, args.name))
Example #23
def main(args):
    """Run deocding."""
    para_mel = []
    parser = get_parser()
    args = parser.parse_args(args)

    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    print("Text : ", args.text)
    print("Checkpoint : ", args.checkpoint_path)
    if os.path.exists(args.checkpoint_path):
        checkpoint = torch.load(args.checkpoint_path)
    else:
        logging.info("Checkpoint not exixts")
        return None

    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint["hp_str"])

    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = FeedForwardTransformer(
        idim, odim, hp)  # torch.jit.load("./etc/fastspeech_scrip_new.pt")

    os.makedirs(args.out, exist_ok=True)
    if args.old_model:
        logging.info("\nSynthesis Session...\n")
        model.load_state_dict(checkpoint, strict=False)
    else:
        checkpoint = torch.load(args.checkpoint_path)
        model.load_state_dict(checkpoint["model"])

    text = process_paragraph(args.text)

    for i in range(0, len(text)):
        txt = preprocess(text[i])
        audio = synth(txt, model, hp)
        m = audio.T
        para_mel.append(m)

    m = torch.cat(para_mel, dim=1)
    np.save("mel.npy", m.cpu().numpy())
    plot_mel(m)

    if hp.train.melgan_vocoder:
        m = m.unsqueeze(0)
        print("Mel shape: ", m.shape)
        vocoder = torch.hub.load("seungwonpark/melgan", "melgan")
        vocoder.eval()
        if torch.cuda.is_available():
            vocoder = vocoder.cuda()
            mel = m.cuda()
        else:
            mel = m

        with torch.no_grad():
            wav = vocoder.inference(
                mel)  # mel ---> batch, num_mels, frames [1, 80, 234]
            wav = wav.cpu().float().numpy()
    else:
        stft = STFT(filter_length=1024, hop_length=256, win_length=1024)
        print(m.size())
        m = m.unsqueeze(0)
        wav = griffin_lim(m, stft, 30)
        wav = wav.cpu().numpy()
    save_path = "{}/test_tts.wav".format(args.out)
    write(save_path, hp.audio.sample_rate, wav.astype("int16"))
Example #24
import numpy as np
import os
from utils.util import get_files
from tqdm import tqdm
from utils.util import remove_outlier
from utils.hparams import HParam

if __name__ == "__main__":

    hp = HParam("./configs/default.yaml")

    min_e = []
    min_p = []
    max_e = []
    max_p = []
    nz_min_p = []
    nz_min_e = []

    energy_path = os.path.join(hp.data.data_dir, "energy")
    pitch_path = os.path.join(hp.data.data_dir, "pitch")
    mel_path = os.path.join(hp.data.data_dir, "mels")
    energy_files = get_files(energy_path, extension=".npy")
    pitch_files = get_files(pitch_path, extension=".npy")
    mel_files = get_files(mel_path, extension=".npy")

    assert len(energy_files) == len(pitch_files) == len(mel_files)

    energy_vecs = []
    for f in tqdm(energy_files):
        e = np.load(f)
        e = remove_outlier(e)
Example #25
  #   inference_breakdown[i][0]
  #   save_image('tier%d_inferred_breakdown_%s.png' % (i, filename), inference_breakdown[i][0])
  # save_image('final_inferred_%s.png' % filename, inferred)

  tier = 5
  source = breakdown[tier][0]
  print("Source tier 5 shape: %s" % str(source.shape))
  save_image('source_tier_%d_%s.png' % (tier, filename), breakdown[tier][0])
  inferred_source_6, inferred_5 = run_inference_on_tier(source, tier, text, timestep)
  print("inferred tier 5 target shape: %s" % str(inferred_5.shape))
  print("inferred tier 6 source shape: %s" % str(inferred_source_6.shape))
  tier = 6
  inferred_final, inferred_6 = run_inference_on_tier(inferred_source_6, tier, text, timestep)
  print("inferred tier 6 target shape: %s" % str(inferred_6.shape))
  print("inferred final shape: %s" % str(inferred_final.shape))
  print("original final shape: %s" % str(breakdown[tier+1][0].shape))
  save_image('target_tier_%d_%s.png' % (tier, filename), breakdown[tier][1])
  save_image('next_tier_%d_%s.png' % (tier, filename), breakdown[tier+1][0])
  save_image('inferred_tier_%d_%s.png' % (tier, filename), inferred_6)
  save_image('inferred_next_tier_%d_%s.png' % (tier, filename), inferred_final)

  # Save the actual audio
  hp = HParam('./config/blizzard_compressed_experiments.yaml')
  melgen = MelGen(hp)
  source_wav = melgen.reconstruct_audio(breakdown[tier+1][0])
  inference_wav = melgen.reconstruct_audio(inferred_final)
  melgen.save_audio('source_'+filename, source_wav)
  melgen.save_audio('inference_'+filename, inference_wav)

  break
  
Example #26
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True,
                        help="yaml file for configuration")
    parser.add_argument('-p', '--infer_config', type=str, required=True,
                        help="yaml file for inference configuration")
    parser.add_argument('-t', '--timestep', type=int, default=240,
                        help="timestep of mel-spectrogram to generate")
    parser.add_argument('-n', '--name', type=str, default="result", required=False,
                        help="Name for sample")
    parser.add_argument('-i', '--input', type=str, default=None, required=False,
                        help="Input for conditional generation, leave empty for unconditional")
    args = parser.parse_args()

    hp = HParam(args.config)
    infer_hp = HParam(args.infer_config)

    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        generated = model.sample(args.input)

    os.makedirs('temp', exist_ok=True)
    torch.save(generated, os.path.join('temp', args.name + '.pt'))
    spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
Example #27
from helpers.processor import Processor
from datasets.dataset import SpeechDataset

parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--config',
                    default=None,
                    type=str,
                    help='Config file path')
parser.add_argument('--compute',
                    action='store_true',
                    help='Pre-compute dataset statistics')
args = parser.parse_args()

hparams = HParam(args.config) \
        if args.config else HParam(osp.join(osp.abspath(os.getcwd()), 'config', 'default.yaml'))

datasets_path = hparams.data.datasets_path
dataset_file_url = \
    f'https://open-speech-data.oss-cn-hangzhou.aliyuncs.com/{hparams.data.dataset_dir}.tar.bz2'
dataset_file_name = osp.basename(dataset_file_url)
dataset_dir = dataset_file_name[:-8]
dataset_path = osp.join(datasets_path, dataset_dir)
wavfile_path = osp.join(dataset_path, "wavs")
melspec_path = osp.join(dataset_path, "mels")

if osp.isdir(melspec_path) and False:
    print("%s dataset folder already exists" % dataset_dir)
    sys.exit(0)
else:
Example #28
                        help="yaml file for configuration")
    parser.add_argument('-p',
                        '--checkpoint_path',
                        type=str,
                        default=None,
                        help="path of checkpoint pt file to resume training")
    parser.add_argument(
        '-n',
        '--name',
        type=str,
        required=True,
        help="name of the model for logging, saving checkpoint")
    #argv = ['-c', './config/mb_melgan.yaml', '-n', 'melgan-male', '-p', './checkpoints/mb_melgan_901be72_0600.pt']
    args = parser.parse_args()

    hp = HParam(args.config)
    with open(args.config, 'r') as f:
        hp_str = ''.join(f.readlines())

    pt_dir = os.path.join(hp.log.chkpt_dir, args.name)
    log_dir = os.path.join(hp.log.log_dir, args.name)
    os.makedirs(pt_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s',
                        handlers=[
                            logging.FileHandler(
                                os.path.join(
                                    log_dir,
                                    '%s-%d.log' % (args.name, time.time()))),
Example #29
                        help="Directory of VoxCeleb2 dataset, ends with 'aac'")
    parser.add_argument('-cu', '--current_corpus_dir', type=str, default=None,
                        help="Directory of currentCorpus dataset")
    parser.add_argument('-o', '--out_dir', type=str, required=True,
                        help="Directory of output training triplet")
    parser.add_argument('-p', '--process_num', type=int, default=None,
                        help='number of processes to run. default: cpu_count')
    parser.add_argument('--vad', type=int, default=0,
                        help='apply vad to wav file. yes(1) or no(0, default)')
    args = parser.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)    # Creates output directory
    os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True) # Creates train output directory
    os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True)  # Creates test output directory

    hp = HParam(args.config)  # hp contains the information from config.yaml

    cpu_num = cpu_count() if args.process_num is None else args.process_num

    if args.libri_dir is None and args.voxceleb_dir is None and args.current_corpus_dir is None:
        raise Exception("Please provide directory of data")

    if args.libri_dir is not None:
        # train_folders = all subfolders of train-clean-100
        train_folders = [x for x in glob.glob(os.path.join(args.libri_dir, 'train-clean-100', '*')) 
                            if os.path.isdir(x)] + \
                        [x for x in glob.glob(os.path.join(args.libri_dir, 'train-clean-360', '*'))
                            if os.path.isdir(x)]
                        # + \
                        #[x for x in glob.glob(os.path.join(args.libri_dir, 'train-other-500', '*'))
                        #    if os.path.isdir(x)]
Example #30
                        type=str,
                        help="Append to logdir name")
    parser.add_argument("--config",
                        default=None,
                        type=str,
                        help="Config file path")
    args = parser.parse_args()

    if torch.cuda.is_available():
        index = args.device if args.device else str(
            0 if gm is None else gm.auto_choice())
    else:
        index = 'cpu'
    device = select_device(index)

    hparams = HParam(args.config) \
        if args.config else HParam(osp.join(osp.abspath(os.getcwd()), "config", "default.yaml"))

    logdir = osp.join(hparams.trainer.logdir,
                      f"%s-%s" % (hparams.data.dataset, args.name))
    checkpoint = args.checkpoint or get_last_chkpt_path(logdir)

    normalizer = StandardNorm(hparams.audio.spec_mean, hparams.audio.spec_std)
    processor = TextProcessor(hparams.text)
    text2mel = ParallelText2Mel(hparams.parallel)
    text2mel.eval()

    synthesizer = Synthesizer(model=text2mel,
                              checkpoint=checkpoint,
                              processor=processor,
                              normalizer=normalizer,