Code Example #1
File: web_app.py Project: jireh-father/tacotron2
def init_model():
    print("init model!!!!")
    global tacotron2_model
    global waveglow_model
    global denoiser

    tacotron2_path = "outdir_finetune/checkpoint_62500"
    #    tacotron2_path = "outdir_korean/checkpoint_8800"
    #    tacotron2_path = "../models/tacotron2/outdir_korean/checkpoint_25000"
    #    tacotron2_path = "../tacotron2-pytorch/outdir/checkpoint_15000"
    #    tacotron2_path = "../models/tacotron2/outdir_korean/checkpoint_15000"
    #    tacotron2_path = "outdir_lj_korean/checkpoint_5000"
    #    tacotron2_path = "outdir_longtrain/checkpoint_439500"
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    #   waveglow_path = "../waveglow/checkpoints/waveglow_335000"
    # waveglow_path = "../waveglow-fix/checkpoints_longtrain/waveglow_484000"
    sampling_rate = 22050
    denoiser_strength = 0.0
    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    tacotron2_model = load_model(hparams)
    tacotron2_model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = tacotron2_model.cuda().eval().half()

    # with open("waveglow/config.json") as f:
    #     data = f.read()
    # import json
    # config = json.loads(data)
    # waveglow_config = config["waveglow_config"]
    #
    # waveglow_model = glow.WaveGlow(**waveglow_config)
    #
    # checkpoint_dict = torch.load(waveglow_path, map_location='cpu')
    # model_for_loading = checkpoint_dict['model']
    # waveglow_model.load_state_dict(model_for_loading.state_dict())
    #
    # # waveglow_model.load_state_dict(torch.load(waveglow_path)['state_dict'])
    # waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    # waveglow_model.cuda().eval().half()

    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    for k in waveglow_model.convinv:
        k.float()
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow_model)
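
Note that init_model() only loads the networks into module-level globals; it performs no synthesis itself. Below is a minimal, hypothetical sketch of how those globals would then be used, following the same inference pattern as Code Examples #8 and #23 later on this page (the helper name and the 'english_cleaners' choice are assumptions, not part of the original project):

def synthesize(text, sigma=0.666):
    # Encode the input text and run the globals populated by init_model().
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()
    _, mel_outputs_postnet, _, _ = tacotron2_model.inference(sequence)
    with torch.no_grad():
        audio = waveglow_model.infer(mel_outputs_postnet, sigma=sigma)
    return audio[0].data.cpu().numpy()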
Code Example #2
def test_overlay_first_samples():
    hparams = create_hparams()

    # test 100 random combinations
    for _ in range(100):
        hparams.batch_factor = random.randint(1, 32)
        hparams.horizon = random.randint(1, 10)
        subscale = Subscaler(hparams)
        batch_dim = random.randint(1, 2)

        lensrc = subscale.context_len * random.randint(1, 10)
        indeces = torch.arange(lensrc).repeat(batch_dim, 1)
        pos = random.randint(0, 10)

        run_overlay(subscale, indeces, pos)
Code Example #3
    def tacotron2_init(self):
        self.plot_wav_data = False
        # set parameters
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        # load tacotron2
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(TACOTRON_CHECKPOINT_FILE)['state_dict'])
        _ = self.model.cuda().eval().half()
        # load waveglow
        self.waveglow = torch.load(WAVEGLOW_CHECKPOINT_FILE)['model']
        self.waveglow.cuda().eval().half()
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
Code Example #4
def load_tts_vocoder_models(tacotron_checkpoint_path,
                            waveglow_checkpoint_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    _ = model.cuda().eval()

    waveglow = torch.load(waveglow_checkpoint_path)['model']
    waveglow.cuda().eval()
    #for k in waveglow.convinv:
    #    k.float()
    denoiser = Denoiser(waveglow)
    return model, waveglow, denoiser, hparams
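
The loader above only returns the models; here is a short, hypothetical call site stitched together from the inference pattern used in the other examples on this page (the checkpoint paths, the sample text, the cleaner choice, and the denoiser strength of 0.01 are placeholders, not part of the original project):

model, waveglow, denoiser, hparams = load_tts_vocoder_models(
    'checkpoints/tacotron2_statedict.pt', 'checkpoints/waveglow_256channels.pt')
sequence = np.array(text_to_sequence("Hello world.", ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).cuda().long()
# Run the acoustic model, then vocode and denoise the result.
_, mel_outputs_postnet, _, _ = model.inference(sequence)
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    audio = denoiser(audio, 0.01)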
Code Example #5
File: inference.py Project: dodohow1011/waveglow_2
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()

    test_loader = DataLoader(testset,
                             num_workers=0,
                             shuffle=False,
                             sampler=None,
                             batch_size=1,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=collate_fn)
    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
            batch)
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))
        # mel = torch.autograd.Variable(mel.cuda())
        # mel = torch.unsqueeze(mel, 0)
        # mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
            '''if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE'''
        # audio = audio.squeeze()
        # mel = mel.cpu().numpy()
        # audio = audio.astype('int16')
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
Code Example #6
def test_stack_flatten_parity():
    hparams = create_hparams()
    for _ in range(100):
        hparams.batch_factor = random.randint(1, 32)
        hparams.horizon = random.randint(1, 10)
        seq_len = random.randint(1, 10)
        n_channels = random.randint(1, 1000)
        subscale = Subscaler(hparams)

        batch_dim = random.randint(1, 16)
        tensor = torch.rand(
            [batch_dim, seq_len * subscale.context_len, n_channels])

        permuted = subscale.stack_substensors(tensor)
        orig = subscale.flatten_subtensors(permuted)
        assert (torch.eq(tensor, orig).all())
Code Example #7
File: main.py Project: zbn123/FM-FFM
def main(_):
    fh = FieldHandler(train_file_path=FLAGS.train_file_path,
                      category_columns=FLAGS.category_columns,
                      continuation_columns=FLAGS.continuation_columns)

    features, labels = transformation_data(file_path=FLAGS.train_file_path, field_hander=fh, label=FLAGS.label)

    # features, labels, files_dict = dataGenerate(FLAGS.train_file_path)
    hparams = create_hparams(fh.field_nums, fh.feature_nums)

    train_input_fn = create_train_input_fn(features,
                                           label=labels,
                                           batch_size=hparams.batch_size,
                                           num_epochs=hparams.epoches)

    
    if hparams.model == "fm":
        model_fn = create_model_fn(FM)
    elif hparams.model == "ffm":
        if hparams.use_deep:
            tf.logging.warning("\n\n>>>>>>>>>>> use ffm model, ignore --use_deep params. <<<<<<<<<<<<<<<\n")
        model_fn = create_model_fn(FFM)
    else:
        raise ValueError("hparams.model must be 'fm' or 'ffm'.")
    
    estimator = tf.estimator.Estimator(
        model_fn = model_fn,
        model_dir=FLAGS.model_path,
        params=hparams,
        config=tf.estimator.RunConfig(
            tf_random_seed=hparams.seed,
            log_step_count_steps=500
        )
    )

    show_dict = {
        "loss":"loss",
        "accuracy":"accuracy/value",
        "auc":"auc/value"
    }
   
    log_hook = tf.train.LoggingTensorHook(show_dict, every_n_iter=100)
    # estimator.train(input_fn=train_input_fn, hooks=[log_hook])

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[log_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=train_input_fn, )
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Code Example #8
File: inference.py Project: zrb250/tacotron2
def main():
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = get_Tacotron2(hparams)
    waveglow = get_WaveGlow()

    # text = "Waveglow is really awesome!"
    texts = [
        "PRIH1NTIH0NG , IH0N TH AO1NLIY0 SEH1NS WIH1TH HHWIH1CH W AA1R AE1T PRIY0ZEH1NT KAH0NSER1ND , DIH1FER0Z FRAH1M MOW2ST IH1F NAA1T FRAH1M AH0L TH AA1RTS AE1ND KRAE1FTS REH2PRIH0ZEH1NTIH0D IH0N TH EH2KSAH0BIH1SHAH0N",
        "AE1ND DIH0TEY1LIH0NG PAH0LIY1S IH0N SAH0VIH1LYAH0N KLOW1DHZ TOW0 B SKAE1TER0D THRUW0AW1T TH SAY1ZAH0BAH0L KRAW1D .",
        "AY1 LAH1V YUW1 VEH1RIY0 MAH1CH",
        "SAY1AH0NTIH0STS AE1T TH SER1N LAE1BRAH0TAO2RIY0 SEY1 DHEY1 HHAE1V DIH0SKAH1VER0D AH0 NUW1 PAA1RTAH0KAH0L .",
        "PREH1ZIH0DAH0NT TRAH1MP MEH1T WIH1TH AH1DHER0 LIY1DER0Z AE1T TH GRUW1P AH1V TWEH1NTIY0 KAA1NFER0AH0NS .",
        "LEH1TS GOW1 AW2T TOW0 TH EH1RPAO2RT . TH PLEY1N LAE1NDAH0D TEH1N MIH1NAH0TS AH0GOW2 .",
        "IH0N BIY1IH0NG KAH0MPEH1RAH0TIH0VLIY0 MAA1DER0N .", "VIH1PKIH0D",
        "VIH1P KIH0D"
    ]

    if not os.path.exists("results"):
        os.mkdir("results")

    for text in texts:
        sequence = np.array(text_to_sequence(text,
                                             ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
            sequence)
        plot_data((mel_outputs.float().data.cpu().numpy()[0],
                   mel_outputs_postnet.float().data.cpu().numpy()[0],
                   alignments.float().data.cpu().numpy()[0].T), text[:10])

        #print("mel_out:", mel_outputs)
        #print("mel_out_postnet:", mel_outputs_postnet)
        #print("alignments:", alignments)

        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
            audio = audio * hparams.max_wav_value
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        write("results/{}_synthesis.wav".format(text), hparams.sampling_rate,
              audio)
        print("complete:", text)
Code Example #9
def main():
    parse = argparse.ArgumentParser()
    parse.add_argument(
        '-f',
        '--feature_file',
        type=str,
        default='E:/Research/Synthesis/BZNSYP/ttt/009915.feature.f32',
        help='features file to train')
    parse.add_argument(
        '-o',
        '--out_file',
        type=str,
        default='E:/Research/Synthesis/BZNSYP/ttt/009915.feature.f32.s16',
        help='features file to train')
    args = parse.parse_args()
    hparams = create_hparams()
    synthesis(args, hparams)
Code Example #10
def load_latest_model_from(location):

    files = [location + "/" + f for f in os.listdir(location)]
    newest_file = max(files, key=os.path.getctime)

    print("load model " + newest_file)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    hparam = hparams.create_hparams()

    model = WaveNetModel(hparam, device).to(device)
    if torch.cuda.is_available():
        states = torch.load(newest_file)
    else:
        states = torch.load(newest_file, map_location='cpu')
    model.load_state_dict(states['state_dict'])
    return model
Code Example #11
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec
Code Example #12
    def __init__(self, ckpt, wglw, n_speakers=123):
        print("[Loading Model]")
        self.ckpt = ckpt
        self.hparams = create_hparams()
        self.hparams.n_speakers = n_speakers
        self.stft = TacotronSTFT(self.hparams.filter_length,
                                 self.hparams.hop_length,
                                 self.hparams.win_length,
                                 self.hparams.n_mel_channels,
                                 self.hparams.sampling_rate,
                                 self.hparams.mel_fmin, self.hparams.mel_fmax)
        self.mellotron = load_model(self.hparams).cuda().eval()
        self.waveglow = torch.load(wglw)['model'].cuda().eval()
        self.denoiser = Denoiser(self.waveglow).cuda().eval()
        self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
        self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
        print('[Loaded Model]')
Code Example #13
def main(unused_arg):
    model_fn = model.create_model_fn(hp.create_hparams())

    estimator = tf.contrib.learn.Estimator(model_fn=model_fn,
                                           model_dir=FLAGS.model_dir,
                                           config=tf.contrib.learn.RunConfig())

    input_fn = input.create_input_fn([TEST_FILE_PATH],
                                     tf.contrib.learn.ModeKeys.EVAL,
                                     FLAGS.test_batch_size, 1)

    eval_metrics = metrics.create_evaluation_metrics()

    estimator.evaluate(input_fn=input_fn,
                       batch_size=FLAGS.test_batch_size,
                       metrics=eval_metrics,
                       steps=None)
Code Example #14
File: inference.py Project: q-hwang/tacotron2
def main(text, checkpoint_path, path, name):
    #### Setup hparams
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    #### Load model from checkpoint
    model = get_model(hparams, checkpoint_path)

    #### Prepare text input
    sequence = get_input(get_pinyin(text))

    #### inference
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    #### tacotron result
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    write(
        os.path.join(path, name) + '_tacotron.wav', 16000,
        waveform[0].data.cpu().numpy())

    #### transform tacotron mel to wavenet mel
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    #### save
    np.save(
        os.path.join(path, name) + '_mel.npy',
        mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(
        os.path.join(path, name) + '_alig.npy',
        alignments.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '.npy', wavenet_mel)
Code Example #15
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1

    tacotron_model = model(create_hparams(), './output/checkpoint_20500')

    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in tqdm(f):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(
                executor.submit(
                    partial(_process_utterance, out_dir, index, wav_path, text,
                            tacotron_model)))
            index += 1
    return [future.result() for future in tqdm(futures)]
Code Example #16
    def __init__(self, lang):
        self.language = lang
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        with open('config.json', 'r') as f:
            self.config = json.load(f)

        self.waveglow_path = self.config.get('model').get('waveglow')
        self.waveglow = torch.load(self.waveglow_path)['model']
        self.waveglow.cuda().eval().half()

        for m in self.waveglow.modules():
            if 'Conv' in str(type(m)):
                setattr(m, 'padding_mode', 'zeros')
                
        for k in self.waveglow.convinv:
            k.float()
        self.denoiser = Denoiser(self.waveglow)
        self.update_model(lang)
Code Example #17
def getAudio(text):
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    waveglow_path = os.path.join(__location__, 'waveglow_256channel.pt')
    waveglow = torch.load(waveglow_path,map_location='cpu')['model']
    waveglow.cpu().eval()
    for k in waveglow.convinv:
        k.float()
    #denoiser = Denoiser(waveglow)

    checkpoint_path = os.path.join(__location__, "checkpoint_9000")
    model = load_model(hparams)
    #print(model)
    state = torch.load(checkpoint_path,map_location='cpu')['state_dict']
    #print(state)
    model.load_state_dict(state)
    _ = model.cpu().eval()

    #text = "Bộ Y tế chỉ đạo Viện Vệ sinh dịch tễ và các địa phương điều tra dịch tễ các trường hợp có kết quả xét nghiệm dương tính, xác minh người tiếp xúc gần với bệnh nhân dương tính, khoanh vùng xử lý ổ dịch và cách ly theo dõi sức khỏe những người tiếp xúc."
    text = TTSnorm(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()



    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    #plot_data((mel_outputs.float().data.cpu().numpy()[0], mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T))


    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    #########
    # Denoiser step omitted here because it has to run on the GPU
    #########
        
    
    text_hashed=abs(hash(text)) % (10 ** 8)
    sd.write("static/audio/"+str(text_hashed)+'.wav',audio[0].data.cpu().numpy(), 22050)
    return text
Code Example #18
    def __init__(self, model_choice):
        self.model_choice = model_choice
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        with open('config.json', 'r') as f:
            self.config = json.load(f)
        self.max_duration_s = self.config.get('max_duration_s')
        self.hparams.max_decoder_steps = int(86.0 * self.max_duration_s)

        self.waveglow = torch.load('models/waveglow',
                                   map_location=torch.device('cpu'))['model']
        self.waveglow.eval()

        for m in self.waveglow.modules():
            if 'Conv' in str(type(m)):
                setattr(m, 'padding_mode', 'zeros')

        for k in self.waveglow.convinv:
            k.float()
        #self.denoiser = Denoiser(self.waveglow)
        self.update_model(model_choice, self.max_duration_s)
Code Example #19
File: synthesizer.py Project: t108318104/HW3
    def load(self, tacotron_model, waveglow_model):
        # setting
        self.project_name = 'tacotron2'
        sys.path.append(self.project_name)
        sys.path.append(join(self.project_name, 'waveglow/'))

        # initialize Tacotron2
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        self.hparams.max_decoder_steps = 1000
        self.hparams.fp16_run = True

        self.tacotron = Tacotron2(self.hparams)
        self.tacotron.load_state_dict(torch.load(tacotron_model)['state_dict'])
        _ = self.tacotron.cuda().eval()

        self.waveglow = torch.load(waveglow_model)['model']
        self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
        _ = self.waveglow.cuda().eval()
        for k in self.waveglow.convinv:
            k.float()
Code Example #20
def synth(models, text, out):
    hparams = create_hparams()

    checkpoint_path = models + '/tacotron2'
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.eval()

    waveglow_path = models + '/waveglow'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda()

    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    with torch.no_grad():
        audio = 32768.0 * waveglow.infer(mel_outputs_postnet, sigma=0.666)[0]

    audio = audio.cpu().numpy()
    audio = audio.astype('int16')
    write(out, 8000, audio)
Code Example #21
def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  #.half()

    sequence = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)

    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    #audio = audio.astype('int16')
    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)
    plot_alignment_to_numpy(
        alignments.squeeze().cpu().detach().numpy().T,
        os.path.join('samples', "{}_attention.png".format(out_filename)))
Code Example #22
def main(argv):
    args = utils.parse_args("Train a transformer model")
    utils.redirect_log_to_file(args.model_dir)

    hparams = create_hparams(args.model_dir, args.configs, initialize=True)
    utils.check_git_hash(args.model_dir)

    # Prepare data
    data.load_vocab(hparams)

    train_input_fn = data.InputPipeline(None, None, hparams.record_train_file,
                                        tf.estimator.ModeKeys.TRAIN, hparams)
    eval_input_fn = data.InputPipeline(None, None, hparams.record_eval_file,
                                       tf.estimator.ModeKeys.EVAL, hparams)

    # Training
    log_samples_hook = tf.train.LoggingTensorHook(
        ['targets', 'predictions'],
        at_end=True,
        formatter=tensors_to_string(hparams))

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=hparams.train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=hparams.eval_steps,
                                      hooks=[log_samples_hook])

    distribution = tf.contrib.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=args.model_dir,
        train_distribute=distribution,
        save_summary_steps=hparams.save_summary_steps,
        save_checkpoints_steps=hparams.save_checkpoints_steps,
        keep_checkpoint_max=hparams.n_checkpoints)
    estimator = tf.estimator.Estimator(model_fn=model.build_model_fn(hparams),
                                       config=run_config,
                                       model_dir=args.model_dir)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Code Example #23
def infer(checkpoint_path, waveglow_path, text, save_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    # checkpoint_path = "tacotron2_statedict.pt"
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()


    # waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    # denoiser = Denoiser(waveglow)

    # text = "Waveglow is really awesome!"
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T), save_path=save_path)

    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet.half(), sigma=0.666)
    # ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()
    print(audio.shape)

    write(os.path.join(save_path, 'test{}.wav'.format(1)),
          hparams.sampling_rate, audio)
Code Example #24
File: gui.py Project: lokkelvin2/tacotron2_GUI
    def reload_model(self):
        TTmodel_fpath = self.get_current_TTmodel_dir()
        WGmodel_fpath = self.get_current_WGmodel_dir()
        # Setup hparams
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 22050
        # Load Tacotron 2 from checkpoint
        self.model = load_model(self.hparams, self.use_cuda)
        device = torch.device('cuda' if self.use_cuda else 'cpu')
        self.model.load_state_dict(
            torch.load(TTmodel_fpath, map_location=device)['state_dict'])
        if self.use_cuda:
            _ = self.model.cuda().eval().half()
        else:
            _ = self.model.eval()
        # Load WaveGlow for mel2audio synthesis and denoiser
        self.waveglow = torch.load(WGmodel_fpath, map_location=device)['model']
        self.waveglow.use_cuda = self.use_cuda
        if self.use_cuda:
            self.waveglow.cuda().eval().half()
        else:
            self.waveglow.eval()
        for k in self.waveglow.convinv:
            k.float()
Code Example #25
                        type=int,
                        default=0,
                        required=False,
                        help='rank of current gpu')
    parser.add_argument('--group_name',
                        type=str,
                        default='group_name',
                        required=False,
                        help='Distributed group name')
    parser.add_argument('--hparams',
                        type=str,
                        required=False,
                        help='comma separated name=value pairs')

    args = parser.parse_args()
    hparams = create_hparams(args.hparams)

    torch.backends.cudnn.enabled = hparams.cudnn_enabled
    torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

    print("FP16 Run:", hparams.fp16_run)
    print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
    print("Distributed Run:", hparams.distributed_run)
    print("cuDNN Enabled:", hparams.cudnn_enabled)
    print("cuDNN Benchmark:", hparams.cudnn_benchmark)
    print("Time warping: ", hparams.mel_time_warping)
    print("Freq warping: ", hparams.mel_freq_warping)

    # train(args.output_directory, args.log_directory, args.checkpoint_path,
    #       args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
    train("./check_point", "./logs", None, args.warm_start, 4, args.rank,
Code Example #26
File: preprocess.py Project: zuiwanting/zhrtvc
from pathlib import Path
from functools import partial
from multiprocessing.pool import Pool
from matplotlib import pyplot as plt
from tqdm import tqdm
import collections as clt
import os
import re
import json
import numpy as np
import shutil

from data_utils import TextMelLoader
from hparams import create_hparams

hp = create_hparams()

metadata_path = None
text_mel_loader = None
output_dir = None


def format_index(index):
    return '{:06d}'.format(index)


def process_one(index, skip_existing=False):
    global text_mel_loader
    global metadata_path
    global output_dir
    if text_mel_loader is None:
Code Example #27

def run(output_dir, ckpt_path):

    model = load_model(hparams)
    checkpoint_dict = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams)

    model.eval()
    for batch in tqdm(train_loader):

        text, _, mel, _, _, _, fname = batch
        mel_pred, attn = model.inference((text.cuda(), mel.cuda()))

        output_fname = fname[0].replace('.wav', '-kkr2.mel')
        mel = mel_pred[0].data.cpu().numpy()
        np.save(output_fname, mel)


if __name__ == '__main__':

    output_dir = 'data-bin/mel_train-clean-100'
    ckpt_path = 'models/gst_tacotron_baseline_pretrained/checkpoint_45000'

    hparams = create_hparams()
    hparams.batch_size = 1
    run(output_dir, ckpt_path)
Code Example #28
        default="",
        required=False,
        help="gpu's indices for distributed run (separated by commas)")
    parser.add_argument("--gpu_idx",
                        type=int,
                        default=0,
                        required=False,
                        help="device index for the current run")
    parser.add_argument("--group_name",
                        type=str,
                        default="group_name",
                        required=False,
                        help="Distributed group name")
    args = parser.parse_args()

    hparams = create_hparams(args.hparams_path)
    hparams.path = args.hparams_path

    n_gpus = 0
    rank = 0

    if args.distributed_run:
        assert args.gpus_ranks
        gpus_ranks = {
            elem: i
            for i, elem in enumerate(
                int(elem) for elem in args.gpus_ranks.split(","))
        }
        n_gpus = len(gpus_ranks)
        rank = gpus_ranks[args.gpu_idx]
Code Example #29
File: test.py Project: arjunchatterjee196/chatbot
tf.flags.DEFINE_string("test_file", "./data/test.tfrecords",
                       "Path of test data in TFRecords format")
tf.flags.DEFINE_string("model_dir", None,
                       "Directory to load model checkpoints from")
tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level")
tf.flags.DEFINE_integer("test_batch_size", 16, "Batch size for testing")
FLAGS = tf.flags.FLAGS

if not FLAGS.model_dir:
    print("You must specify a model directory")
    sys.exit(1)

tf.logging.set_verbosity(FLAGS.loglevel)

if __name__ == "__main__":
    hparams = hparams.create_hparams()
    model_fn = model.create_model_fn(hparams, model_impl=dual_encoder_model)
    estimator = tf.contrib.learn.Estimator(model_fn=model_fn,
                                           model_dir=FLAGS.model_dir,
                                           config=tf.contrib.learn.RunConfig())

    input_fn_test = inputs.create_input_fn(mode=tf.contrib.learn.ModeKeys.EVAL,
                                           input_files=[FLAGS.test_file],
                                           batch_size=FLAGS.test_batch_size,
                                           num_epochs=1)

    eval_metrics = metrics.create_evaluation_metrics()
    estimator.evaluate(input_fn=input_fn_test,
                       steps=None,
                       metrics=eval_metrics)
Code Example #30
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    hparams = create_hparams()
    for path in [args.train_log_dir]:
        if not tf.gfile.Exists(path):
            tf.gfile.MakeDirs(path)
    hparams_filename = os.path.join(args.train_log_dir, 'hparams.json')
    with tf.gfile.FastGFile(hparams_filename, 'w') as f:
        f.write(hparams.to_json())
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(args.task_id)):
            global_step = tf.train.get_or_create_global_step()
            colors, depths, labels, label_augs = get_dataset(
                args.dataset_dir, args.num_readers,
                args.num_preprocessing_threads, hparams)
            net, end_points = model(colors,
                                    depths,
                                    num_classes=3,
                                    num_channels=1000,
                                    is_training=True,
                                    global_pool=False,
                                    output_stride=16,
                                    spatial_squeeze=False,
                                    color_scope='color_tower',
                                    depth_scope='depth_tower',
                                    scope='arcnet')
            loss = create_loss(net, labels, hparams.lamb)
            # loss = create_loss_without_background(net, labels)
            learning_rate = hparams.learning_rate
            if hparams.lr_decay_step:
                learning_rate = tf.train.exponential_decay(
                    hparams.learning_rate,
                    tf.train.get_or_create_global_step(),
                    decay_steps=hparams.lr_decay_step,
                    decay_rate=hparams.lr_decay_rate,
                    staircase=True)
            tf.summary.scalar('Learning_rate', learning_rate)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            train_op = slim.learning.create_train_op(loss, optimizer)
            add_summary(colors,
                        depths,
                        labels,
                        end_points,
                        loss,
                        scope='arcnet')
            summary_op = tf.summary.merge_all()
            if not args.from_arcnet_checkpoint:
                color_variable_map, depth_variable_map = restore_from_classification_checkpoint(
                    color_scope='color_tower',
                    depth_scope='depth_tower',
                    model_name=hparams.model_name,
                    checkpoint_exclude_scopes=['arcnet'])
                color_saver = tf.train.Saver(color_variable_map)
                depth_saver = tf.train.Saver(depth_variable_map)

                def initializer_fn(sess):
                    color_saver.restore(
                        sess,
                        os.path.join(args.checkpoint_dir,
                                     hparams.model_name + '.ckpt'))
                    depth_saver.restore(
                        sess,
                        os.path.join(args.checkpoint_dir,
                                     hparams.model_name + '.ckpt'))
                    tf.logging.info('Successfully load pretrained checkpoint.')

                init_fn = initializer_fn
            else:
                variable_map = restore_map()
                init_saver = tf.train.Saver(variable_map)

                def initializer_fn(sess):
                    init_saver.restore(
                        sess, tf.train.latest_checkpoint(args.checkpoint_dir))
                    tf.logging.info('Successfully load pretrained checkpoint.')

                init_fn = initializer_fn
            session_config = tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False)
            session_config.gpu_options.allow_growth = True
            saver = tf.train.Saver(
                keep_checkpoint_every_n_hours=args.save_interval_secs,
                max_to_keep=100)

            slim.learning.train(train_op,
                                logdir=args.train_log_dir,
                                master=args.master,
                                global_step=global_step,
                                session_config=session_config,
                                init_fn=init_fn,
                                summary_op=summary_op,
                                number_of_steps=args.num_steps,
                                startup_delay_steps=15,
                                save_summaries_secs=args.save_summaries_steps,
                                saver=saver)