Example #1
def init_model():
    print("init model!!!!")
    global tacotron2_model
    global waveglow_model
    global denoiser

    tacotron2_path = "outdir_finetune/checkpoint_62500"
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    sampling_rate = 22050
    denoiser_strength = 0.0
    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    tacotron2_model = load_model(hparams)
    tacotron2_model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = tacotron2_model.cuda().eval().half()

    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    for k in waveglow_model.convinv:
        k.float()
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow_model)
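A minimal usage sketch for the function above. The text frontend (text_to_sequence with english_cleaners) and sigma=0.666 are assumptions borrowed from the other examples on this page, not part of this snippet:

import numpy as np
import torch

init_model()
# encode text, run Tacotron 2, then vocode the postnet mel with WaveGlow
sequence = np.array(text_to_sequence("Hello world.", ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).cuda().long()
with torch.no_grad():
    _, mel_postnet, _, _ = tacotron2_model.inference(sequence)
    audio = waveglow_model.infer(mel_postnet.half(), sigma=0.666)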
Example #2
def test_overlay_first_samples():
    hparams = create_hparams()

    # test 100 random combinations
    for _ in range(100):
        hparams.batch_factor = random.randint(1, 32)
        hparams.horizon = random.randint(1, 10)
        subscale = Subscaler(hparams)
        batch_dim = random.randint(1, 2)

        lensrc = subscale.context_len * random.randint(1, 10)
        indices = torch.arange(lensrc).repeat(batch_dim, 1)
        pos = random.randint(0, 10)

        run_overlay(subscale, indices, pos)
Example #3
    def tacotron2_init(self):
        self.plot_wav_data = False
        # set parameters
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        # load tacotron2
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(TACOTRON_CHECKPOINT_FILE)['state_dict'])
        _ = self.model.cuda().eval().half()
        # load waveglow
        self.waveglow = torch.load(WAVEGLOW_CHECKPOINT_FILE)['model']
        self.waveglow.cuda().eval().half()
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
Example #4
def load_tts_vocoder_models(tacotron_checkpoint_path,
                            waveglow_checkpoint_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    _ = model.cuda().eval()

    waveglow = torch.load(waveglow_checkpoint_path)['model']
    waveglow.cuda().eval()
    #for k in waveglow.convinv:
    #    k.float()
    denoiser = Denoiser(waveglow)
    return model, waveglow, denoiser, hparams
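A hedged call sketch for the loader above; the checkpoint filenames are placeholders, and the text frontend and positional denoiser call are patterns borrowed from the other examples on this page:

import numpy as np
import torch

model, waveglow, denoiser, hparams = load_tts_vocoder_models(
    'tacotron2_statedict.pt', 'waveglow_256channels.pt')
sequence = np.array(text_to_sequence("Hello.", ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).cuda().long()
with torch.no_grad():
    _, mel_postnet, _, _ = model.inference(sequence)
    audio = waveglow.infer(mel_postnet, sigma=0.666)
    audio = denoiser(audio, 0.01)  # light denoising, strength is a guess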
Example #5
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()

    test_loader = DataLoader(testset,
                             num_workers=0,
                             shuffle=False,
                             sampler=None,
                             batch_size=1,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=collate_fn)
    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
            batch)
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
Example #6
def test_stack_flatten_parity():
    hparams = create_hparams()
    for _ in range(100):
        hparams.batch_factor = random.randint(1, 32)
        hparams.horizon = random.randint(1, 10)
        seq_len = random.randint(1, 10)
        n_channels = random.randint(1, 1000)
        subscale = Subscaler(hparams)

        batch_dim = random.randint(1, 16)
        tensor = torch.rand(
            [batch_dim, seq_len * subscale.context_len, n_channels])

        permuted = subscale.stack_substensors(tensor)
        orig = subscale.flatten_subtensors(permuted)
        assert (torch.eq(tensor, orig).all())
Example #7
def main(_):
    fh = FieldHandler(train_file_path=FLAGS.train_file_path,
                      category_columns=FLAGS.category_columns,
                      continuation_columns=FLAGS.continuation_columns)

    features, labels = transformation_data(file_path=FLAGS.train_file_path, field_hander=fh, label=FLAGS.label)

    # features, labels, files_dict = dataGenerate(FLAGS.train_file_path)
    hparams = create_hparams(fh.field_nums, fh.feature_nums)

    train_input_fn = create_train_input_fn(features,
                                           label=labels,
                                           batch_size=hparams.batch_size,
                                           num_epochs=hparams.epoches)

    
    if hparams.model == "fm":
        model_fn = create_model_fn(FM)
    elif hparams.model == "ffm":
        if hparams.use_deep:
            tf.logging.warning("\n\n>>>>>>>>>>> use ffm model, ignore --use_deep params. <<<<<<<<<<<<<<<\n")
        model_fn = create_model_fn(FFM)
    else:
        raise ValueError("model must be 'fm' or 'ffm'.")
    
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=FLAGS.model_path,
        params=hparams,
        config=tf.estimator.RunConfig(
            tf_random_seed=hparams.seed,
            log_step_count_steps=500
        )
    )

    show_dict = {
        "loss": "loss",
        "accuracy": "accuracy/value",
        "auc": "auc/value"
    }
   
    log_hook = tf.train.LoggingTensorHook(show_dict, every_n_iter=100)
    # estimator.train(input_fn=train_input_fn, hooks=[log_hook])

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[log_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=train_input_fn)  # NOTE: evaluates on the training input_fn
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #8
def main():
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = get_Tacotron2(hparams)
    waveglow = get_WaveGlow()

    # text = "Waveglow is really awesome!"
    texts = [
        "PRIH1NTIH0NG , IH0N TH AO1NLIY0 SEH1NS WIH1TH HHWIH1CH W AA1R AE1T PRIY0ZEH1NT KAH0NSER1ND , DIH1FER0Z FRAH1M MOW2ST IH1F NAA1T FRAH1M AH0L TH AA1RTS AE1ND KRAE1FTS REH2PRIH0ZEH1NTIH0D IH0N TH EH2KSAH0BIH1SHAH0N",
        "AE1ND DIH0TEY1LIH0NG PAH0LIY1S IH0N SAH0VIH1LYAH0N KLOW1DHZ TOW0 B SKAE1TER0D THRUW0AW1T TH SAY1ZAH0BAH0L KRAW1D .",
        "AY1 LAH1V YUW1 VEH1RIY0 MAH1CH",
        "SAY1AH0NTIH0STS AE1T TH SER1N LAE1BRAH0TAO2RIY0 SEY1 DHEY1 HHAE1V DIH0SKAH1VER0D AH0 NUW1 PAA1RTAH0KAH0L .",
        "PREH1ZIH0DAH0NT TRAH1MP MEH1T WIH1TH AH1DHER0 LIY1DER0Z AE1T TH GRUW1P AH1V TWEH1NTIY0 KAA1NFER0AH0NS .",
        "LEH1TS GOW1 AW2T TOW0 TH EH1RPAO2RT . TH PLEY1N LAE1NDAH0D TEH1N MIH1NAH0TS AH0GOW2 .",
        "IH0N BIY1IH0NG KAH0MPEH1RAH0TIH0VLIY0 MAA1DER0N .", "VIH1PKIH0D",
        "VIH1P KIH0D"
    ]

    if not os.path.exists("results"):
        os.mkdir("results")

    for text in texts:
        sequence = np.array(text_to_sequence(text,
                                             ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
            sequence)
        plot_data((mel_outputs.float().data.cpu().numpy()[0],
                   mel_outputs_postnet.float().data.cpu().numpy()[0],
                   alignments.float().data.cpu().numpy()[0].T), text[:10])

        #print("mel_out:", mel_outputs)
        #print("mel_out_postnet:", mel_outputs_postnet)
        #print("alignments:", alignments)

        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
            audio = audio * hparams.max_wav_value
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        write("results/{}_synthesis.wav".format(text), hparams.sampling_rate,
              audio)
        print("complete:", text)
Example #9
def main():
    parse = argparse.ArgumentParser()
    parse.add_argument(
        '-f',
        '--feature_file',
        type=str,
        default='E:/Research/Synthesis/BZNSYP/ttt/009915.feature.f32',
        help='features file to train')
    parse.add_argument(
        '-o',
        '--out_file',
        type=str,
        default='E:/Research/Synthesis/BZNSYP/ttt/009915.feature.f32.s16',
        help='file to write the synthesized output to')
    args = parse.parse_args()
    hparams = create_hparams()
    synthesis(args, hparams)
Example #10
def load_latest_model_from(location):

    files = [location + "/" + f for f in os.listdir(location)]
    newest_file = max(files, key=os.path.getctime)

    print("load model " + newest_file)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    hparam = hparams.create_hparams()

    model = WaveNetModel(hparam, device).to(device)
    if torch.cuda.is_available():
        states = torch.load(newest_file)
    else:
        states = torch.load(newest_file, map_location='cpu')
    model.load_state_dict(states['state_dict'])
    return model
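Usage is a one-liner; note that os.path.getctime picks the checkpoint with the newest filesystem timestamp, not the highest step number in the filename. The directory below is a placeholder:

model = load_latest_model_from('./checkpoints')
model.eval()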
Example #11
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec
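load_mel is typically used to prepare a reference mel spectrogram for inference; the path below is a placeholder:

mel = load_mel('data/reference.wav')  # shape (1, n_mel_channels, n_frames)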
Example #12
    def __init__(self, ckpt, wglw, n_speakers=123):
        print("[Loading Model]")
        self.ckpt = ckpt
        self.hparams = create_hparams()
        self.hparams.n_speakers = n_speakers
        self.stft = TacotronSTFT(self.hparams.filter_length,
                                 self.hparams.hop_length,
                                 self.hparams.win_length,
                                 self.hparams.n_mel_channels,
                                 self.hparams.sampling_rate,
                                 self.hparams.mel_fmin, self.hparams.mel_fmax)
        self.mellotron = load_model(self.hparams).cuda().eval()
        self.waveglow = torch.load(wglw)['model'].cuda().eval()
        self.denoiser = Denoiser(self.waveglow).cuda().eval()
        self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
        self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
        print('[Loaded Model]')
Example #13
def main(unused_arg):
    model_fn = model.create_model_fn(hp.create_hparams())

    estimator = tf.contrib.learn.Estimator(model_fn=model_fn,
                                           model_dir=FLAGS.model_dir,
                                           config=tf.contrib.learn.RunConfig())

    input_fn = input.create_input_fn([TEST_FILE_PATH],
                                     tf.contrib.learn.ModeKeys.EVAL,
                                     FLAGS.test_batch_size, 1)

    eval_metrics = metrics.create_evaluation_metrics()

    estimator.evaluate(input_fn=input_fn,
                       batch_size=FLAGS.test_batch_size,
                       metrics=eval_metrics,
                       steps=None)
Example #14
def main(text, checkpoint_path, path, name):
    #### Setup hparams
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    #### Load model from checkpoint
    model = get_model(hparams, checkpoint_path)

    #### Prepare text input
    sequence = get_input(get_pinyin(text))

    #### inference
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    #### tacotron result
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    write(
        os.path.join(path, name) + '_tacotron.wav', 16000,
        waveform[0].data.cpu().numpy())

    #### transform tacotron mel to wavenet mel
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    #### save
    np.save(
        os.path.join(path, name) + '_mel.npy',
        mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(
        os.path.join(path, name) + '_alig.npy',
        alignments.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '.npy', wavenet_mel)
Example #15
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1

    tacotron_model = model(create_hparams(), './output/checkpoint_20500')

    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in tqdm(f):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(
                executor.submit(
                    partial(_process_utterance, out_dir, index, wav_path, text,
                            tacotron_model)))
            index += 1
    return [future.result() for future in tqdm(futures)]
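A hedged invocation sketch, assuming an LJSpeech-style layout with metadata.csv and a wavs/ directory (paths and worker count are placeholders). Because the workers receive an already-loaded model, running this with a CUDA model generally requires the 'spawn' multiprocessing start method:

from tqdm import tqdm
results = build_from_path('LJSpeech-1.1', 'training_data', num_workers=4, tqdm=tqdm)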
Example #16
    def __init__(self, lang):
        self.language = lang
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        with open('config.json', 'r') as f:
            self.config = json.load(f)

        self.waveglow_path = self.config.get('model').get('waveglow')
        self.waveglow = torch.load(self.waveglow_path)['model']
        self.waveglow.cuda().eval().half()

        for m in self.waveglow.modules():
            if 'Conv' in str(type(m)):
                setattr(m, 'padding_mode', 'zeros')
                
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
        self.update_model(lang)
Example #17
def getAudio(text):
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    waveglow_path = os.path.join(__location__, 'waveglow_256channel.pt')
    waveglow = torch.load(waveglow_path, map_location='cpu')['model']
    waveglow.cpu().eval()
    for k in waveglow.convinv:
        k.float()
    #denoiser = Denoiser(waveglow)

    checkpoint_path = os.path.join(__location__, "checkpoint_9000")
    model = load_model(hparams)
    #print(model)
    state = torch.load(checkpoint_path, map_location='cpu')['state_dict']
    #print(state)
    model.load_state_dict(state)
    _ = model.cpu().eval()

    #text = "Bộ Y tế chỉ đạo Viện Vệ sinh dịch tễ và các địa phương điều tra dịch tễ các trường hợp có kết quả xét nghiệm dương tính, xác minh người tiếp xúc gần với bệnh nhân dương tính, khoanh vùng xử lý ổ dịch và cách ly theo dõi sức khỏe những người tiếp xúc."
    text = TTSnorm(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    #plot_data((mel_outputs.float().data.cpu().numpy()[0], mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T))


    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    #########
    # The denoiser step is omitted because it would have to run on a GPU
    #########

    text_hashed = abs(hash(text)) % (10 ** 8)
    sd.write("static/audio/" + str(text_hashed) + ".wav", audio[0].data.cpu().numpy(), 22050)
    return text
Example #18
    def __init__(self, model_choice):
        self.model_choice = model_choice
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        with open('config.json', 'r') as f:
            self.config = json.load(f)
        self.max_duration_s = self.config.get('max_duration_s')
        self.hparams.max_decoder_steps = int(86.0 * self.max_duration_s)

        self.waveglow = torch.load('models/waveglow',
                                   map_location=torch.device('cpu'))['model']
        self.waveglow.eval()

        for m in self.waveglow.modules():
            if 'Conv' in str(type(m)):
                setattr(m, 'padding_mode', 'zeros')

        for k in self.waveglow.convinv:
            k.float()
        #self.denoiser = Denoiser(self.waveglow)
        self.update_model(model_choice, self.max_duration_s)
Example #19
    def load(self, tacotron_model, waveglow_model):
        # setting
        self.project_name = 'tacotron2'
        sys.path.append(self.project_name)
        sys.path.append(join(self.project_name, 'waveglow/'))

        # initialize Tacotron2
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        self.hparams.max_decoder_steps = 1000
        self.hparams.fp16_run = True

        self.tacotron = Tacotron2(self.hparams)
        self.tacotron.load_state_dict(torch.load(tacotron_model)['state_dict'])
        _ = self.tacotron.cuda().eval()

        self.waveglow = torch.load(waveglow_model)['model']
        self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
        _ = self.waveglow.cuda().eval()
        for k in self.waveglow.convinv:
            k.float()
Example #20
def synth(models, text, out):
    hparams = create_hparams()

    checkpoint_path = models + '/tacotron2'
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.eval()

    waveglow_path = models + '/waveglow'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda()

    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    with torch.no_grad():
        audio = 32768.0 * waveglow.infer(mel_outputs_postnet, sigma=0.666)[0]

    audio = audio.cpu().numpy()
    audio = audio.astype('int16')
    write(out, 8000, audio)
Example #21
def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  #.half()

    sequence = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)

    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    #audio = audio.astype('int16')
    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)
    plot_alignment_to_numpy(
        alignments.squeeze().cpu().detach().numpy().T,
        os.path.join('samples', "{}_attention.png".format(out_filename)))
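A hedged call for the function above; the checkpoint path and text are placeholders (the snippet uses chinese_cleaners), a samples/ directory must already exist, and 60 Griffin-Lim iterations matches the count used in Example #14:

infer('outdir/checkpoint_15000', 60, '你好,世界。', 'demo')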
Example #22
def main(argv):
    args = utils.parse_args("Train a transformer model")
    utils.redirect_log_to_file(args.model_dir)

    hparams = create_hparams(args.model_dir, args.configs, initialize=True)
    utils.check_git_hash(args.model_dir)

    # Prepare data
    data.load_vocab(hparams)

    train_input_fn = data.InputPipeline(None, None, hparams.record_train_file,
                                        tf.estimator.ModeKeys.TRAIN, hparams)
    eval_input_fn = data.InputPipeline(None, None, hparams.record_eval_file,
                                       tf.estimator.ModeKeys.EVAL, hparams)

    # Training
    log_samples_hook = tf.train.LoggingTensorHook(
        ['targets', 'predictions'],
        at_end=True,
        formatter=tensors_to_string(hparams))

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=hparams.train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=hparams.eval_steps,
                                      hooks=[log_samples_hook])

    distribution = tf.contrib.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=args.model_dir,
        train_distribute=distribution,
        save_summary_steps=hparams.save_summary_steps,
        save_checkpoints_steps=hparams.save_checkpoints_steps,
        keep_checkpoint_max=hparams.n_checkpoints)
    estimator = tf.estimator.Estimator(model_fn=model.build_model_fn(hparams),
                                       config=run_config,
                                       model_dir=args.model_dir)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #23
def infer(checkpoint_path, waveglow_path, text, save_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    # checkpoint_path = "tacotron2_statedict.pt"
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    # waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    # denoiser = Denoiser(waveglow)

    # text = "Waveglow is really awesome!"
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T), save_path=save_path)

    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet.half(), sigma=0.666)
    # ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()
    print(audio.shape)

    write(os.path.join(save_path, 'test{}.wav'.format(1)),
          hparams.sampling_rate, audio)
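An example call, reusing the checkpoint names from the comments inside the function (both filenames and the output directory are assumptions; save_path must already exist):

infer('tacotron2_statedict.pt', 'waveglow_256channels.pt',
      'Waveglow is really awesome!', './outputs')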
Example #24
    def reload_model(self):
        TTmodel_fpath = self.get_current_TTmodel_dir()
        WGmodel_fpath = self.get_current_WGmodel_dir()
        # Setup hparams
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        # Load Tacotron 2 from checkpoint
        self.model = load_model(self.hparams, self.use_cuda)
        device = torch.device('cuda' if self.use_cuda else 'cpu')
        self.model.load_state_dict(
            torch.load(TTmodel_fpath, map_location=device)['state_dict'])
        if self.use_cuda:
            _ = self.model.cuda().eval().half()
        else:
            _ = self.model.eval()
        # Load WaveGlow for mel2audio synthesis and denoiser
        self.waveglow = torch.load(WGmodel_fpath, map_location=device)['model']
        self.waveglow.use_cuda = self.use_cuda
        if self.use_cuda:
            self.waveglow.cuda().eval().half()
        else:
            self.waveglow.eval()
        for k in self.waveglow.convinv:
            k.float()
Example #25
                        type=int,
                        default=0,
                        required=False,
                        help='rank of current gpu')
    parser.add_argument('--group_name',
                        type=str,
                        default='group_name',
                        required=False,
                        help='Distributed group name')
    parser.add_argument('--hparams',
                        type=str,
                        required=False,
                        help='comma separated name=value pairs')

    args = parser.parse_args()
    hparams = create_hparams(args.hparams)

    torch.backends.cudnn.enabled = hparams.cudnn_enabled
    torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

    print("FP16 Run:", hparams.fp16_run)
    print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
    print("Distributed Run:", hparams.distributed_run)
    print("cuDNN Enabled:", hparams.cudnn_enabled)
    print("cuDNN Benchmark:", hparams.cudnn_benchmark)
    print("Time warping: ", hparams.mel_time_warping)
    print("Freq warping: ", hparams.mel_freq_warping)

    # train(args.output_directory, args.log_directory, args.checkpoint_path,
    #       args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
    train("./check_point", "./logs", None, args.warm_start, 4, args.rank,
          args.group_name, hparams)
Example #26
from pathlib import Path
from functools import partial
from multiprocessing.pool import Pool
from matplotlib import pyplot as plt
from tqdm import tqdm
import collections as clt
import os
import re
import json
import numpy as np
import shutil

from data_utils import TextMelLoader
from hparams import create_hparams

hp = create_hparams()

metadata_path = None
text_mel_loader = None
output_dir = None


def format_index(index):
    return '{:06d}'.format(index)


def process_one(index, skip_existing=False):
    global text_mel_loader
    global metadata_path
    global output_dir
    if text_mel_loader is None:

Example #27
def run(output_dir, ckpt_path):

    model = load_model(hparams)
    checkpoint_dict = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams)

    model.eval()
    for batch in tqdm(train_loader):

        text, _, mel, _, _, _, fname = batch
        mel_pred, attn = model.inference((text.cuda(), mel.cuda()))

        output_fname = fname[0].replace('.wav', '-kkr2.mel')
        mel = mel_pred[0].data.cpu().numpy()
        np.save(output_fname, mel)


if __name__ == '__main__':

    output_dir = 'data-bin/mel_train-clean-100'
    ckpt_path = 'models/gst_tacotron_baseline_pretrained/checkpoint_45000'

    hparams = create_hparams()
    hparams.batch_size = 1
    run(output_dir, ckpt_path)
Example #28
        default="",
        required=False,
        help="gpu's indices for distributed run (separated by commas)")
    parser.add_argument("--gpu_idx",
                        type=int,
                        default=0,
                        required=False,
                        help="device index for the current run")
    parser.add_argument("--group_name",
                        type=str,
                        default="group_name",
                        required=False,
                        help="Distributed group name")
    args = parser.parse_args()

    hparams = create_hparams(args.hparams_path)
    hparams.path = args.hparams_path

    n_gpus = 0
    rank = 0

    if args.distributed_run:
        assert args.gpus_ranks
        gpus_ranks = {
            elem: i
            for i, elem in enumerate(
                int(elem) for elem in args.gpus_ranks.split(","))
        }
        n_gpus = len(gpus_ranks)
        rank = gpus_ranks[args.gpu_idx]
Example #29
tf.flags.DEFINE_string("test_file", "./data/test.tfrecords",
                       "Path of test data in TFRecords format")
tf.flags.DEFINE_string("model_dir", None,
                       "Directory to load model checkpoints from")
tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level")
tf.flags.DEFINE_integer("test_batch_size", 16, "Batch size for testing")
FLAGS = tf.flags.FLAGS

if not FLAGS.model_dir:
    print("You must specify a model directory")
    sys.exit(1)

tf.logging.set_verbosity(FLAGS.loglevel)

if __name__ == "__main__":
    hparams = hparams.create_hparams()
    model_fn = model.create_model_fn(hparams, model_impl=dual_encoder_model)
    estimator = tf.contrib.learn.Estimator(model_fn=model_fn,
                                           model_dir=FLAGS.model_dir,
                                           config=tf.contrib.learn.RunConfig())

    input_fn_test = inputs.create_input_fn(mode=tf.contrib.learn.ModeKeys.EVAL,
                                           input_files=[FLAGS.test_file],
                                           batch_size=FLAGS.test_batch_size,
                                           num_epochs=1)

    eval_metrics = metrics.create_evaluation_metrics()
    estimator.evaluate(input_fn=input_fn_test,
                       steps=None,
                       metrics=eval_metrics)
Example #30
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    hparams = create_hparams()
    for path in [args.train_log_dir]:
        if not tf.gfile.Exists(path):
            tf.gfile.MakeDirs(path)
    hparams_filename = os.path.join(args.train_log_dir, 'hparams.json')
    with tf.gfile.FastGFile(hparams_filename, 'w') as f:
        f.write(hparams.to_json())
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(args.task_id)):
            global_step = tf.train.get_or_create_global_step()
            colors, depths, labels, label_augs = get_dataset(
                args.dataset_dir, args.num_readers,
                args.num_preprocessing_threads, hparams)
            net, end_points = model(colors,
                                    depths,
                                    num_classes=3,
                                    num_channels=1000,
                                    is_training=True,
                                    global_pool=False,
                                    output_stride=16,
                                    spatial_squeeze=False,
                                    color_scope='color_tower',
                                    depth_scope='depth_tower',
                                    scope='arcnet')
            loss = create_loss(net, labels, hparams.lamb)
            # loss = create_loss_without_background(net, labels)
            learning_rate = hparams.learning_rate
            if hparams.lr_decay_step:
                learning_rate = tf.train.exponential_decay(
                    hparams.learning_rate,
                    tf.train.get_or_create_global_step(),
                    decay_steps=hparams.lr_decay_step,
                    decay_rate=hparams.lr_decay_rate,
                    staircase=True)
            tf.summary.scalar('Learning_rate', learning_rate)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            train_op = slim.learning.create_train_op(loss, optimizer)
            add_summary(colors,
                        depths,
                        labels,
                        end_points,
                        loss,
                        scope='arcnet')
            summary_op = tf.summary.merge_all()
            if not args.from_arcnet_checkpoint:
                color_variable_map, depth_variable_map = restore_from_classification_checkpoint(
                    color_scope='color_tower',
                    depth_scope='depth_tower',
                    model_name=hparams.model_name,
                    checkpoint_exclude_scopes=['arcnet'])
                color_saver = tf.train.Saver(color_variable_map)
                depth_saver = tf.train.Saver(depth_variable_map)

                def initializer_fn(sess):
                    color_saver.restore(
                        sess,
                        os.path.join(args.checkpoint_dir,
                                     hparams.model_name + '.ckpt'))
                    depth_saver.restore(
                        sess,
                        os.path.join(args.checkpoint_dir,
                                     hparams.model_name + '.ckpt'))
                    tf.logging.info('Successfully load pretrained checkpoint.')

                init_fn = initializer_fn
            else:
                variable_map = restore_map()
                init_saver = tf.train.Saver(variable_map)

                def initializer_fn(sess):
                    init_saver.restore(
                        sess, tf.train.latest_checkpoint(args.checkpoint_dir))
                    tf.logging.info('Successfully load pretrained checkpoint.')

                init_fn = initializer_fn
            session_config = tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False)
            session_config.gpu_options.allow_growth = True
            saver = tf.train.Saver(
                keep_checkpoint_every_n_hours=args.save_interval_secs,
                max_to_keep=100)

            slim.learning.train(train_op,
                                logdir=args.train_log_dir,
                                master=args.master,
                                global_step=global_step,
                                session_config=session_config,
                                init_fn=init_fn,
                                summary_op=summary_op,
                                number_of_steps=args.num_steps,
                                startup_delay_steps=15,
                                save_summaries_secs=args.save_summaries_steps,
                                saver=saver)