Code Example #1
def main():
    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask',
                                  shape=(None, ),
                                  dtype=tf.int32)
        seq2seq_gtruth = tf.placeholder(name='seq2seq_gtruth',
                                        shape=(None, None, hp.seq2seq_dim),
                                        dtype=tf.float32)
        post_gtruth = tf.placeholder(name='post_gtruth',
                                     shape=(None, None, hp.post_dim),
                                     dtype=tf.float32)

    train_meta_path = pkl_train_path
    assert os.path.exists(train_meta_path), \
        '[!] Train meta does not exist! PATH: {}'.format(train_meta_path)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)

    with open(train_meta_path, 'rb') as f:
        train_meta = pkl.load(f)
        train_meta['reduction_rate'] = hp.reduction_rate

    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate

    train_model = Tacotron(inp=inp,
                           inp_mask=inp_mask,
                           seq2seq_gtruth=seq2seq_gtruth,
                           post_gtruth=post_gtruth,
                           hyper_params=hp,
                           training=True,
                           reuse=False)

    with tf.variable_scope('optimizer'):
        opt = tf.train.AdamOptimizer(
            train_model.exp_learning_rate_decay(0.001))
        # grad, var = zip(*opt.compute_gradients(train_model.loss))
        # with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        #     train_upd = opt.apply_gradients(zip(grad, var), global_step=train_model.global_step)

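        # Freeze the global style-token table for this run: zero its gradient
        # before apply_gradients so every other variable still trains normally.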
        grads_and_vars = opt.compute_gradients(train_model.loss)
        for i, (grad, var) in enumerate(grads_and_vars):
            if 'style_token:0' in var.name:
                grads_and_vars[i] = (grad * 0, var)
                print('freezing gradient for', var.name)
                break
        with tf.control_dependencies(tf.get_collection(
                tf.GraphKeys.UPDATE_OPS)):
            train_upd = opt.apply_gradients(
                grads_and_vars, global_step=train_model.global_step)

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(tb_logs_path):
        os.makedirs(tb_logs_path)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_model.sess = sess
        writer = tf.summary.FileWriter(tb_logs_path, filename_suffix='train')
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver(max_to_keep=20)
        train_model.saver = saver
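        # Placeholder plus assign op let the style-token table be overwritten
        # from outside the graph: random init below, per-style swaps during
        # the audio summaries.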
        ass_style_token = tf.placeholder(name="ass_style_token",
                                         shape=(1, hp.styles_kind,
                                                hp.style_dim),
                                         dtype=tf.float32)
        ass_opt = train_model.single_style_token.assign(ass_style_token)
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore, initializing all variables including style tokens:')
            # np.random.seed(1)
            init_style_token = np.random.uniform(low=-1,
                                                 high=1,
                                                 size=(1, hp.styles_kind,
                                                       hp.style_dim))
            print('style token init range:', np.max(init_style_token),
                  np.min(init_style_token))
            sess.run(ass_opt, feed_dict={ass_style_token: init_style_token})

        train_next_item = init_next_batch(tfrecord_train_path, 7001, 2000)
        # dev_next_item = init_next_batch(tfrecord_dev_path, 1, 2000)

        train_scalar_summary = train_model.get_scalar_summary('train')
        train_alpha_summary = train_model.get_alpha_summary('train', 2)
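        # dev loss and synthesized audio are computed in Python, so they are
        # fed back through placeholders purely to emit TensorBoard summaries.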
        dev_loss_holder = tf.placeholder(shape=(),
                                         dtype=tf.float32,
                                         name='dev_loss')
        dev_loss_summary = tf.summary.scalar('dev_loss_summary',
                                             dev_loss_holder)
        pred_audio_holder = tf.placeholder(shape=(None, None),
                                           dtype=tf.float32,
                                           name='pred_audio')
        pred_audio_summary = tf.summary.audio('pred_audio_summary',
                                              pred_audio_holder,
                                              sample_rate=hp.sample_rate,
                                              max_outputs=30)

        already_step_eval = sess.run(train_model.global_step)
        try:
            for cnt in tqdm.tqdm(
                    range(already_step_eval + 1, hp.max_global_steps + 10)):
                # print('now is', cnt)
                pre_time = time.time()
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = get_next_batch(
                    sess, train_next_item)
                # print('bug', batch_inp[0], 'len', batch_inp_mask[0], 'actual', batch_inp[0].shape)
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                    batch_inp, batch_inp_mask, batch_mel_gtruth,
                    batch_spec_gtruth, train_meta)
                # print(batch_mel_gtruth.shape[1], batch_inp[0][0])
                # print(batch_inp_mask)
                # print('look', batch_mel_gtruth[0], batch_spec_gtruth[0])
                train_time = time.time()
                # print('pre time:', train_time - pre_time)

                _, loss_eval, global_step_eval = sess.run(
                    [train_upd, train_model.loss, train_model.global_step],
                    feed_dict={
                        inp: batch_inp,
                        inp_mask: batch_inp_mask,
                        seq2seq_gtruth: batch_mel_gtruth,
                        post_gtruth: batch_spec_gtruth
                    })
                # print('step:', global_step_eval)

                if cnt % 50 == 0:
                    # if cnt % 5 == 0:
                    summary_str = sess.run(train_scalar_summary,
                                           feed_dict={
                                               inp: batch_inp,
                                               inp_mask: batch_inp_mask,
                                               seq2seq_gtruth:
                                               batch_mel_gtruth,
                                               post_gtruth: batch_spec_gtruth
                                           })
                    writer.add_summary(summary_str, global_step_eval)
                if cnt % 200 == 0:  # about one epoch
                    # if cnt % 10 == 0:  # about one epoch
                    summary_str = sess.run(train_alpha_summary,
                                           feed_dict={
                                               inp: batch_inp,
                                               inp_mask: batch_inp_mask,
                                               seq2seq_gtruth:
                                               batch_mel_gtruth,
                                               post_gtruth: batch_spec_gtruth
                                           })
                    writer.add_summary(summary_str, global_step_eval)
                    dev_loss = 0
                    dev_batches_per_epoch = 0
                    dev_next_item = init_next_batch(
                        tfrecord_dev_path, 1000,
                        1)  # use the last batch as a listening sample
                    while True:
                        try:
                            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = get_next_batch(
                                sess, dev_next_item)
                            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                                batch_inp, batch_inp_mask, batch_mel_gtruth,
                                batch_spec_gtruth, dev_meta)
                            _loss = sess.run(train_model.loss,
                                             feed_dict={
                                                 inp: batch_inp,
                                                 inp_mask: batch_inp_mask,
                                                 seq2seq_gtruth:
                                                 batch_mel_gtruth,
                                                 post_gtruth: batch_spec_gtruth
                                             })
                            dev_loss += _loss
                            dev_batches_per_epoch += 1
                        except tf.errors.OutOfRangeError:
                            dev_loss /= dev_batches_per_epoch
                            dev_loss_summary_str = sess.run(
                                dev_loss_summary,
                                feed_dict={dev_loss_holder: dev_loss})
                            writer.add_summary(dev_loss_summary_str,
                                               global_step_eval)
                            break
                if cnt % 2000 == 0:
                    # if cnt % 15 == 0:
                    train_model.save(save_path, global_step_eval)
                    all_pred_out = []
                    trained_style_token = sess.run(
                        train_model.single_style_token)
                    for style_no in range(11):
                        unique_style_token = get_style_token(
                            trained_style_token, style_no)
                        sess.run(
                            ass_opt,
                            feed_dict={ass_style_token: unique_style_token})
                        pred_out = sess.run(train_model.post_output,
                                            feed_dict={
                                                inp: batch_inp,
                                                inp_mask: batch_inp_mask,
                                                seq2seq_gtruth:
                                                batch_mel_gtruth,
                                                post_gtruth: batch_spec_gtruth
                                            })
                        pred_out = pred_out * train_meta[
                            "log_stftm_std"] + train_meta["log_stftm_mean"]
                        for audio_i in range(3):
                            pred_audio, exp_spec = audio.invert_spectrogram(
                                pred_out[audio_i], 1.2)
                            pred_audio = np.reshape(pred_audio,
                                                    (1, pred_audio.shape[-1]))
                            all_pred_out.append(pred_audio)
                    # Interleave so each utterance's clips are grouped across
                    # styles; only the first 30 of the 33 clips are kept,
                    # matching the summary's max_outputs=30.
                    inp_all_pred_out = []
                    for m in range(3):
                        for x in range(30):
                            if x % 3 == m:
                                inp_all_pred_out.append(all_pred_out[x])

                    all_pred_out = np.concatenate(inp_all_pred_out, axis=0)

                    pred_audio_summary_str = sess.run(
                        pred_audio_summary,
                        feed_dict={pred_audio_holder: all_pred_out})
                    writer.add_summary(pred_audio_summary_str,
                                       global_step_eval)
                    sess.run(ass_opt,
                             feed_dict={ass_style_token: trained_style_token})

                post_time = time.time()

                # print('train time:', post_time - train_time)

        except Exception as e:
            print('Training stopped:', str(e))
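The audio-summary loop above calls a get_style_token helper that is not shown
in this listing. A minimal sketch of what it plausibly does, assuming that an
in-range style_no keeps only the selected trained token (zeroing the rest) and
that an out-of-range style_no returns the table unchanged as a reference
condition; these assumptions are mine, not the author's code:

import numpy as np

def get_style_token(trained_style_token, style_no):
    # trained_style_token: ndarray of shape (1, styles_kind, style_dim).
    # Assumed behavior: isolate one token; out-of-range means "all tokens".
    if style_no >= trained_style_token.shape[1]:
        return trained_style_token
    unique = np.zeros_like(trained_style_token)
    unique[0, style_no, :] = trained_style_token[0, style_no, :]
    return unique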
Code Example #2
def main():

    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask',
                                  shape=(None, ),
                                  dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps',
                                           shape=(),
                                           dtype=tf.int32)
        ctr_flag = tf.placeholder(name='ctr_flag', shape=(), dtype=tf.int32)
        style_attention = tf.placeholder(name='style_att',
                                         shape=(None, 10),
                                         dtype=tf.float32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)

    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate
    print(dev_meta.keys())
    dev_char_map = dev_meta['char_map']

    txt = [
        "She glanced at his newspaper, then stopped and stared.",
        "I think you'll have to marry Count Paris.",
        "My house is the best of all!"
    ]
    # print('**', txt[0][0])
    max_txt_len = 0
    for i in range(len(txt)):
        max_txt_len = max(max_txt_len, len(txt[i]))
    txt_inp = []
    for i in range(len(txt)):
        txt_inp_a = []
        for j in range(len(txt[i])):
            # print('---:', txt[i][j])
            txt_inp_a.append(dev_char_map[txt[i][j]])
        for j in range(len(txt[i]), max_txt_len):
            txt_inp_a.append(0)
        txt_inp.append(txt_inp_a)
    txt_inp = np.asarray(txt_inp)

    # print(txt_inp)
    txt_mask = []
    for i in range(len(txt)):
        txt_mask.append(len(txt[i]))
    txt_mask = np.asarray(txt_mask)
    # print(txt_mask)

    model = Tacotron(inp,
                     inp_mask,
                     decode_time_steps,
                     ctr_flag,
                     style_attention,
                     hyper_params=hp)

    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        ckpt = tf.train.get_checkpoint_state(save_path)

        saver = tf.train.Saver(max_to_keep=20)
        model.saver = saver
        ass_style_token = tf.placeholder(name="ass_style_token",
                                         shape=(1, hp.styles_kind,
                                                hp.style_dim),
                                         dtype=tf.float32)
        ass_opt = model.single_style_token.assign(ass_style_token)
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore, initializing all variables')

        # no control: ctr_flag=0, the model uses its learned style weighting
        unique_style_attention = np.zeros([len(txt_inp), 10], dtype=np.float32)
        pred_out = sess.run(model.post_output,
                            feed_dict={
                                inp: txt_inp,
                                inp_mask: txt_mask,
                                decode_time_steps: 60,
                                ctr_flag: 0,
                                style_attention: unique_style_attention
                            })
        pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta[
            "log_stftm_mean"]
        for j in range(len(txt_inp)):
            pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
            wav_folder = os.path.join(generate_path, data_name)
            if not os.path.exists(wav_folder):
                os.makedirs(wav_folder)
            siowav.write(
                os.path.join(wav_folder, "audio%d_style_%d.wav" % (j, 100)),
                hp.sample_rate, pred_audio)
        # control on, all-zero style attention
        unique_style_attention = np.zeros([len(txt_inp), 10], dtype=np.float32)
        pred_out = sess.run(model.post_output,
                            feed_dict={
                                inp: txt_inp,
                                inp_mask: txt_mask,
                                decode_time_steps: 60,
                                ctr_flag: 1,
                                style_attention: unique_style_attention
                            })
        pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta[
            "log_stftm_mean"]
        for j in range(len(txt_inp)):
            pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
            wav_folder = os.path.join(generate_path, data_name)
            if not os.path.exists(wav_folder):
                os.makedirs(wav_folder)
            siowav.write(
                os.path.join(wav_folder, "audio%d_style_%d.wav" % (j, 200)),
                hp.sample_rate, pred_audio)

        # control on, one-hot attention on each single style token
        for i in range(10):
            unique_style_attention = np.zeros([len(txt_inp), 10],
                                              dtype=np.float32)
            for j in range(len(txt_inp)):
                unique_style_attention[j][i] = 1
            pred_out = sess.run(model.post_output,
                                feed_dict={
                                    inp: txt_inp,
                                    inp_mask: txt_mask,
                                    decode_time_steps: 60,
                                    ctr_flag: 1,
                                    style_attention: unique_style_attention
                                })
            pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta[
                "log_stftm_mean"]
            for j in range(len(txt_inp)):
                pred_audio, exp_spec = audio.invert_spectrogram(
                    pred_out[j], 1.2)
                wav_folder = os.path.join(generate_path, data_name)
                if not os.path.exists(wav_folder):
                    os.makedirs(wav_folder)
                siowav.write(
                    os.path.join(wav_folder, "audio%d_style_%d.wav" % (j, i)),
                    hp.sample_rate, pred_audio)
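Inside the Tacotron graph, ctr_flag presumably switches between the model's
own learned style attention and the externally supplied style_attention
matrix. A minimal sketch of that gating under this assumption (the tensor
names here are hypothetical, not the repo's):

import tensorflow as tf

def select_style_attention(ctr_flag, learned_att, external_att):
    # ctr_flag: scalar int32; 0 = use the learned attention weights,
    # nonzero = use the weights fed in through the placeholder.
    return tf.cond(tf.equal(ctr_flag, 0),
                   lambda: learned_att,
                   lambda: external_att)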
Code Example #3
def main():

    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask',
                                  shape=(None, ),
                                  dtype=tf.int32)
        inp_id = tf.placeholder(name='inp_id', shape=(None, ), dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps',
                                           shape=(),
                                           dtype=tf.int32)
        # seq2seq_gtruth = tf.placeholder(name='seq2seq_gtruth', shape=(None, None, hp.seq2seq_dim), dtype=tf.float32)
        # post_gtruth = tf.placeholder(name='post_gtruth', shape=(None, None, hp.post_dim), dtype=tf.float32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)

    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate
    print(dev_meta.keys())
    dev_char_map = dev_meta['char_map']

    # txt = ["She glanced at his newspaper, then stopped and stared.",
    #        "I think you'll have to marry Count Paris.",
    #        "My house is the best of all!"]

    txt = [
        "She glanced at his newspaper, then stopped and stared.",
        "I think you'll have to marry Count Paris.",
        "My house is the best of all!"
    ]

    # print('**', txt[0][0])
    max_txt_len = 0
    for i in range(len(txt)):
        max_txt_len = max(max_txt_len, len(txt[i]))
    txt_inp = []
    for i in range(len(txt)):
        txt_inp_a = []
        for j in range(len(txt[i])):
            # print('---:', txt[i][j])
            txt_inp_a.append(dev_char_map[txt[i][j]])
        for j in range(len(txt[i]), max_txt_len):
            txt_inp_a.append(0)
        txt_inp.append(txt_inp_a)
    txt_inp = np.asarray(txt_inp)

    # print(txt_inp)
    txt_mask = []
    for i in range(len(txt)):
        txt_mask.append(len(txt[i]))
    txt_mask = np.asarray(txt_mask)
    # print(txt_mask)

    model = Tacotron(inp, inp_mask, inp_id, decode_time_steps, hyper_params=hp)

    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        ckpt = tf.train.get_checkpoint_state(save_path)

        saver = tf.train.Saver(max_to_keep=20)
        model.saver = saver
        # ass_style_token = tf.placeholder(name="ass_style_token", shape=(1, hp.styles_kind, hp.style_dim),
        #                                  dtype=tf.float32)
        # ass_opt = model.single_style_token.assign(ass_style_token)
        # ass_inp_att = tf.placeholder(name="ass_inp_att", shape=(None, hp.styles_kind),
        #                              dtype=tf.float32)
        # att_ass_opt = model.inp_att.assign(ass_inp_att)
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore, initializing all variables')

        wav_folder = os.path.join(generate_path, data_name)

        with open('selected_data_10.pkl', 'rb') as f:
            selected_data = pkl.load(f)
        # print(selected_data['batch_id'], selected_data['batch_key'])
        # return
        print('no ctr')
        for id_no in range(len(selected_data['batch_id'])):
            now_id_no = selected_data['batch_id'][id_no]
            for i in range(len(selected_data['batch_id'])):
                pred_out = sess.run(model.post_output,
                                    feed_dict={
                                        inp: [selected_data['batch_inp'][i]],
                                        inp_mask:
                                        [selected_data['batch_inp_mask'][i]],
                                        inp_id: [now_id_no],
                                        decode_time_steps:
                                        60
                                    })
                pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta[
                    "log_stftm_mean"]
                for j in range(1):
                    pred_audio, exp_spec = audio.invert_spectrogram(
                        pred_out[j], 1.2)
                    # wav_folder = os.path.join(generate_path, data_name)
                    if not os.path.exists(wav_folder):
                        os.makedirs(wav_folder)
                    siowav.write(
                        os.path.join(
                            wav_folder, "%s_audio%d_style_%d.wav" %
                            (selected_data['batch_char_txt'][i][:3],
                             selected_data['batch_id'][i], now_id_no)),
                        hp.sample_rate, pred_audio)

            pred_out = sess.run(model.post_output,
                                feed_dict={
                                    inp: txt_inp,
                                    inp_mask: txt_mask,
                                    inp_id: np.full(len(txt), now_id_no, dtype=np.int32),
                                    decode_time_steps: 60,
                                })

            pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta[
                "log_stftm_mean"]
            for j in range(len(txt)):
                pred_audio, exp_spec = audio.invert_spectrogram(
                    pred_out[j], 1.2)
                # wav_folder = os.path.join(generate_path, data_name)
                if not os.path.exists(wav_folder):
                    os.makedirs(wav_folder)
                siowav.write(
                    os.path.join(wav_folder,
                                 "test_audio%d_style_%d.wav" % (j, now_id_no)),
                    hp.sample_rate, pred_audio)
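The schema of selected_data_10.pkl is only implied by the keys accessed above.
A toy pickle with that inferred layout (values are illustrative, not real
data):

import pickle as pkl

selected_data = {
    'batch_id': [3, 7],                     # style/speaker ids per utterance
    'batch_inp': [[12, 5, 9], [4, 4, 8]],   # encoded character sequences
    'batch_inp_mask': [3, 3],               # sequence lengths
    'batch_char_txt': ['abc', 'dde'],       # source text strings
    'batch_key': ['utt_0003', 'utt_0007'],  # utterance keys
}
with open('selected_data_10.pkl', 'wb') as f:
    pkl.dump(selected_data, f)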
Code Example #4
def main():

    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask', shape=(None,), dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps', shape=(), dtype=tf.int32)
        style_attention = tf.placeholder(name='style_att', shape=(None, 10), dtype=tf.float32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)

    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate

    model = Tacotron(inp, inp_mask, decode_time_steps, style_attention, hyper_params=hp)

    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        ckpt = tf.train.get_checkpoint_state(save_path)

        saver = tf.train.Saver(max_to_keep=20)
        model.saver = saver
        ass_style_token = tf.placeholder(name="ass_style_token", shape=(1, hp.styles_kind, hp.style_dim),
                                         dtype=tf.float32)
        ass_opt = model.single_style_token.assign(ass_style_token)
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore, initializing all variables')

        # train_next_item = init_next_batch(tfrecord_train_path)
        dev_next_item = init_next_batch(tfrecord_dev_path, 600, 1)

        train_scalar_summary = model.get_scalar_summary('train')
        train_alpha_summary = model.get_alpha_summary('train', 2)
        random_num = random.randint(1, 10000000)

        for dev_i in range(dev_batches_per_epoch):
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, batch_char_txt = get_next_batch(
                sess, dev_next_item)
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth,
                dev_meta)
            # batch_char_txt = batch_char_txt.decode()
            for var in batch_char_txt:
                print(var.decode())

            # print(batch_char_txt)
            for style_no in range(72):
                unique_style_attention = get_style_attention(style_no)
                pred_out = sess.run(
                    model.post_output,
                    feed_dict={
                        inp: batch_inp,
                        inp_mask: batch_inp_mask,
                        decode_time_steps:
                        batch_mel_gtruth.shape[1] // hp.reduction_rate + 1,
                        style_attention: unique_style_attention
                    })
                pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta[
                    "log_stftm_mean"]

                for audio_i in range(12):
                    pred_audio, exp_spec = audio.invert_spectrogram(
                        pred_out[audio_i], 1.2)
                    wav_folder = os.path.join(generate_path,
                                              "audio_%d" % (audio_i))
                    if not os.path.exists(wav_folder):
                        os.makedirs(wav_folder)
                    siowav.write(
                        os.path.join(wav_folder, "a%d_style_%d.wav" %
                                     (random_num, style_no)),
                        hp.sample_rate, pred_audio)

                # all_pred_out.append(pred_audio)

            # all_pred_out = np.concatenate(all_pred_out, axis=0)
            break
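The get_style_attention helper used above (mapping style_no 0..71 to a
(batch, 10) weight matrix) is not included in this listing. A minimal
stand-in covering only the one-hot cases; the real mapping over 72 values
is unknown, and the batch size here is an assumption:

import numpy as np

def get_style_attention(style_no, batch_size=12, n_tokens=10):
    # One-hot attention on a single style token; style_no beyond n_tokens
    # wraps around, a placeholder for the real 72-way mapping.
    att = np.zeros([batch_size, n_tokens], dtype=np.float32)
    att[:, style_no % n_tokens] = 1.0
    return att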
Code Example #5
def main():

    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask',
                                  shape=(None, ),
                                  dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps',
                                           shape=(),
                                           dtype=tf.int32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)

    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate

    model = Tacotron(inp, inp_mask, decode_time_steps, hyper_params=hp)

    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver()
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore, initializing all variables')

        # train_next_item = init_next_batch(tfrecord_train_path)
        dev_next_item = init_next_batch(tfrecord_dev_path)

        with open(pkl_dev_path, "rb") as f:
            dev_stats = pkl.load(f)

        for dev_i in range(dev_batches_per_epoch):
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = get_next_batch(
                sess, dev_next_item)
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth,
                dev_meta)
            pred_out, alpha_out = sess.run(
                [model.post_output, model.alpha_output],
                feed_dict={
                    inp: batch_inp,
                    inp_mask: batch_inp_mask,
                    decode_time_steps: 100
                })
            all_pred_out = []
            # generate general voice
            pred_out = pred_out * dev_stats["log_stftm_std"] + dev_stats[
                "log_stftm_mean"]
            for audio_i in range(8):
                pred_audio, exp_spec = audio.invert_spectrogram(
                    pred_out[audio_i], 1.2)
                siowav.write(
                    os.path.join(generate_path, "random%d.wav" % (audio_i)),
                    hp.sample_rate, pred_audio)
                # pred_audio = np.reshape(pred_audio, (1, pred_audio.shape[-1]))

                # all_pred_out.append(pred_audio)

            # all_pred_out = np.concatenate(all_pred_out, axis=0)
            break
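All five examples reconstruct waveforms with audio.invert_spectrogram(log_spec, 1.2);
that module is not part of this listing. A typical implementation exponentiates
the log-magnitude spectrogram, sharpens it with the power factor, and runs
Griffin-Lim. A minimal librosa sketch under those assumptions (frame parameters
are placeholders, not the repo's actual settings):

import numpy as np
import librosa

def invert_spectrogram(log_stftm, power=1.2, n_fft=2048, hop=200, n_iter=50):
    # log_stftm: (time, freq) log-magnitude STFT; transpose for librosa.
    spec = np.exp(log_stftm.T) ** power
    angles = np.exp(2j * np.pi * np.random.rand(*spec.shape))
    for _ in range(n_iter):  # Griffin-Lim phase estimation
        wav = librosa.istft(spec * angles, hop_length=hop)
        angles = np.exp(1j * np.angle(
            librosa.stft(wav, n_fft=n_fft, hop_length=hop)))
    wav = librosa.istft(spec * angles, hop_length=hop)
    return wav, spec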