def main():
    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask', shape=(None,), dtype=tf.int32)
        seq2seq_gtruth = tf.placeholder(name='seq2seq_gtruth', shape=(None, None, hp.seq2seq_dim), dtype=tf.float32)
        post_gtruth = tf.placeholder(name='post_gtruth', shape=(None, None, hp.post_dim), dtype=tf.float32)

    train_meta_path = pkl_train_path
    assert os.path.exists(train_meta_path), \
        '[!] Train meta does not exist! PATH: {}'.format(train_meta_path)
    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)

    with open(train_meta_path, 'rb') as f:
        train_meta = pkl.load(f)
        train_meta['reduction_rate'] = hp.reduction_rate
    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate

    train_model = Tacotron(inp=inp, inp_mask=inp_mask,
                           seq2seq_gtruth=seq2seq_gtruth, post_gtruth=post_gtruth,
                           hyper_params=hp, training=True, reuse=False)

    with tf.variable_scope('optimizer'):
        opt = tf.train.AdamOptimizer(train_model.exp_learning_rate_decay(0.001))
        grads_and_vars = opt.compute_gradients(train_model.loss)
        # Freeze the style-token embedding by zeroing its gradient.
        for i, (grad, var) in enumerate(grads_and_vars):
            if var.name.find('style_token:0') != -1:
                grads_and_vars[i] = (grad * 0, var)
                print('zeroing gradient for', var.name)
                break
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            train_upd = opt.apply_gradients(grads_and_vars, global_step=train_model.global_step)

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(tb_logs_path):
        os.makedirs(tb_logs_path)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        train_model.sess = sess
        writer = tf.summary.FileWriter(tb_logs_path, filename_suffix='train')
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver(max_to_keep=20)
        train_model.saver = saver

        # Op for overwriting the style-token variable from a numpy array.
        ass_style_token = tf.placeholder(name='ass_style_token',
                                         shape=(1, hp.styles_kind, hp.style_dim),
                                         dtype=tf.float32)
        ass_opt = train_model.single_style_token.assign(ass_style_token)

        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore; initializing all variables, including style tokens')
            init_style_token = np.random.uniform(low=-1, high=1,
                                                 size=(1, hp.styles_kind, hp.style_dim))
            print('random style-token range:', np.max(init_style_token), np.min(init_style_token))
            sess.run(ass_opt, feed_dict={ass_style_token: init_style_token})

        train_next_item = init_next_batch(tfrecord_train_path, 7001, 2000)

        train_scalar_summary = train_model.get_scalar_summary('train')
        train_alpha_summary = train_model.get_alpha_summary('train', 2)
        dev_loss_holder = tf.placeholder(shape=(), dtype=tf.float32, name='dev_loss')
        dev_loss_summary = tf.summary.scalar('dev_loss_summary', dev_loss_holder)
        pred_audio_holder = tf.placeholder(shape=(None, None), dtype=tf.float32, name='pred_audio')
        pred_audio_summary = tf.summary.audio('pred_audio_summary', pred_audio_holder,
                                              sample_rate=hp.sample_rate, max_outputs=30)

        already_step_eval = sess.run(train_model.global_step)
        try:
            for cnt in tqdm.tqdm(range(already_step_eval + 1, hp.max_global_steps + 10)):
                pre_time = time.time()
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = get_next_batch(sess, train_next_item)
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                    batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, train_meta)
                train_time = time.time()

                _, loss_eval, global_step_eval = sess.run(
                    [train_upd, train_model.loss, train_model.global_step],
                    feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                               seq2seq_gtruth: batch_mel_gtruth, post_gtruth: batch_spec_gtruth})

                if cnt % 50 == 0:
                    summary_str = sess.run(train_scalar_summary,
                                           feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                                                      seq2seq_gtruth: batch_mel_gtruth,
                                                      post_gtruth: batch_spec_gtruth})
                    writer.add_summary(summary_str, global_step_eval)

                if cnt % 200 == 0:  # roughly one epoch
                    summary_str = sess.run(train_alpha_summary,
                                           feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                                                      seq2seq_gtruth: batch_mel_gtruth,
                                                      post_gtruth: batch_spec_gtruth})
                    writer.add_summary(summary_str, global_step_eval)

                    # Evaluate dev loss over the whole dev set; the last dev batch
                    # is reused below for the listening check.
                    dev_loss = 0
                    dev_batches_per_epoch = 0
                    dev_next_item = init_next_batch(tfrecord_dev_path, 1000, 1)
                    while True:
                        try:
                            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = get_next_batch(sess, dev_next_item)
                            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, dev_meta)
                            _loss = sess.run(train_model.loss,
                                             feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                                                        seq2seq_gtruth: batch_mel_gtruth,
                                                        post_gtruth: batch_spec_gtruth})
                            dev_loss += _loss
                            dev_batches_per_epoch += 1
                        except tf.errors.OutOfRangeError:
                            dev_loss /= dev_batches_per_epoch
                            dev_loss_summary_str = sess.run(dev_loss_summary,
                                                            feed_dict={dev_loss_holder: dev_loss})
                            writer.add_summary(dev_loss_summary_str, global_step_eval)
                            break

                if cnt % 2000 == 0:
                    train_model.save(save_path, global_step_eval)
                    all_pred_out = []
                    trained_style_token = sess.run(train_model.single_style_token)
                    # Synthesize the last dev batch with each style token in turn.
                    for style_no in range(11):
                        unique_style_token = get_style_token(trained_style_token, style_no)
                        sess.run(ass_opt, feed_dict={ass_style_token: unique_style_token})
                        pred_out = sess.run(train_model.post_output,
                                            feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                                                       seq2seq_gtruth: batch_mel_gtruth,
                                                       post_gtruth: batch_spec_gtruth})
                        pred_out = pred_out * train_meta["log_stftm_std"] + train_meta["log_stftm_mean"]
                        for audio_i in range(3):
                            pred_audio, exp_spec = audio.invert_spectrogram(pred_out[audio_i], 1.2)
                            pred_audio = np.reshape(pred_audio, (1, pred_audio.shape[-1]))
                            all_pred_out.append(pred_audio)

                    # Reorder so that all styles of the same utterance are grouped together.
                    inp_all_pred_out = []
                    for m in range(3):
                        for x in range(30):
                            if x % 3 == m:
                                inp_all_pred_out.append(all_pred_out[x])
                    all_pred_out = np.concatenate(inp_all_pred_out, axis=0)

                    pred_audio_summary_str = sess.run(pred_audio_summary,
                                                      feed_dict={pred_audio_holder: all_pred_out})
                    writer.add_summary(pred_audio_summary_str, global_step_eval)
                    # Restore the trained style tokens after the per-token sweep.
                    sess.run(ass_opt, feed_dict={ass_style_token: trained_style_token})

                post_time = time.time()
        except Exception as e:
            print('Training stopped:', str(e))
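
# The training loop above calls get_style_token() to isolate one style token at a
# time when writing audio summaries; its definition is not shown in this section.
# The sketch below is an assumption about its behavior, not the repo's actual code:
# it keeps only the selected token (index style_no) and zeroes the others, and
# returns the full trained token set for the last index of the range(11) sweep.
def get_style_token(trained_style_token, style_no):
    """Hypothetical helper: select a single style token for synthesis."""
    if style_no >= trained_style_token.shape[1]:
        # Last index in the sweep: use the trained tokens unchanged.
        return trained_style_token
    unique_style_token = np.zeros_like(trained_style_token)
    unique_style_token[:, style_no, :] = trained_style_token[:, style_no, :]
    return unique_style_token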
def main():
    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask', shape=(None,), dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps', shape=(), dtype=tf.int32)
        ctr_flag = tf.placeholder(name='ctr_flag', shape=(), dtype=tf.int32)
        style_attention = tf.placeholder(name='style_att', shape=(None, 10), dtype=tf.float32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)
    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate
    print(dev_meta.keys())
    dev_char_map = dev_meta['char_map']

    txt = [
        "She glanced at his newspaper, then stopped and stared.",
        "I think you'll have to marry Count Paris.",
        "My house is the best of all!"
    ]

    # Convert the sentences to zero-padded character-id sequences.
    max_txt_len = max(len(t) for t in txt)
    txt_inp = []
    for i in range(len(txt)):
        txt_inp_a = [dev_char_map[ch] for ch in txt[i]]
        txt_inp_a += [0] * (max_txt_len - len(txt[i]))
        txt_inp.append(txt_inp_a)
    txt_inp = np.asarray(txt_inp)
    txt_mask = np.asarray([len(t) for t in txt])

    model = Tacotron(inp, inp_mask, decode_time_steps, ctr_flag, style_attention, hyper_params=hp)
    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)

    if not os.path.exists(generate_path):
        os.makedirs(generate_path)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver(max_to_keep=20)
        model.saver = saver
        ass_style_token = tf.placeholder(name='ass_style_token',
                                         shape=(1, hp.styles_kind, hp.style_dim),
                                         dtype=tf.float32)
        ass_opt = model.single_style_token.assign(ass_style_token)
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore; initializing all variables')

        wav_folder = os.path.join(generate_path, data_name)
        if not os.path.exists(wav_folder):
            os.makedirs(wav_folder)

        # 1) No control: let the model infer the style attention itself (tag 100).
        unique_style_attention = np.zeros([len(txt_inp), 10], dtype=np.float32)
        pred_out = sess.run(model.post_output,
                            feed_dict={inp: txt_inp, inp_mask: txt_mask,
                                       decode_time_steps: 60, ctr_flag: 0,
                                       style_attention: unique_style_attention})
        pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta["log_stftm_mean"]
        for j in range(len(txt_inp)):
            pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
            siowav.write(os.path.join(wav_folder, "audio%d_style_%d.wav" % (j, 100)),
                         hp.sample_rate, pred_audio)

        # 2) Controlled, but with all-zero style attention (tag 200).
        unique_style_attention = np.zeros([len(txt_inp), 10], dtype=np.float32)
        pred_out = sess.run(model.post_output,
                            feed_dict={inp: txt_inp, inp_mask: txt_mask,
                                       decode_time_steps: 60, ctr_flag: 1,
                                       style_attention: unique_style_attention})
        pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta["log_stftm_mean"]
        for j in range(len(txt_inp)):
            pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
            siowav.write(os.path.join(wav_folder, "audio%d_style_%d.wav" % (j, 200)),
                         hp.sample_rate, pred_audio)

        # 3) Controlled, one-hot attention on each style token in turn.
        for i in range(10):
            unique_style_attention = np.zeros([len(txt_inp), 10], dtype=np.float32)
            for j in range(len(txt_inp)):
                unique_style_attention[j][i] = 1
            pred_out = sess.run(model.post_output,
                                feed_dict={inp: txt_inp, inp_mask: txt_mask,
                                           decode_time_steps: 60, ctr_flag: 1,
                                           style_attention: unique_style_attention})
            pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta["log_stftm_mean"]
            for j in range(len(txt_inp)):
                pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
                siowav.write(os.path.join(wav_folder, "audio%d_style_%d.wav" % (j, i)),
                             hp.sample_rate, pred_audio)
def main():
    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask', shape=(None,), dtype=tf.int32)
        inp_id = tf.placeholder(name='inp_id', shape=(None,), dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps', shape=(), dtype=tf.int32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)
    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate
    print(dev_meta.keys())
    dev_char_map = dev_meta['char_map']

    txt = [
        "She glanced at his newspaper, then stopped and stared.",
        "I think you'll have to marry Count Paris.",
        "My house is the best of all!"
    ]

    # Convert the sentences to zero-padded character-id sequences.
    max_txt_len = max(len(t) for t in txt)
    txt_inp = []
    for i in range(len(txt)):
        txt_inp_a = [dev_char_map[ch] for ch in txt[i]]
        txt_inp_a += [0] * (max_txt_len - len(txt[i]))
        txt_inp.append(txt_inp_a)
    txt_inp = np.asarray(txt_inp)
    txt_mask = np.asarray([len(t) for t in txt])

    model = Tacotron(inp, inp_mask, inp_id, decode_time_steps, hyper_params=hp)
    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)

    if not os.path.exists(generate_path):
        os.makedirs(generate_path)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver(max_to_keep=20)
        model.saver = saver
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore; initializing all variables')

        wav_folder = os.path.join(generate_path, data_name)
        if not os.path.exists(wav_folder):
            os.makedirs(wav_folder)

        with open('selected_data_10.pkl', 'rb') as f:
            selected_data = pkl.load(f)

        print('no ctr')
        for id_no in range(len(selected_data['batch_id'])):
            now_id_no = selected_data['batch_id'][id_no]

            # Re-synthesize each selected dev utterance with this reference id.
            for i in range(len(selected_data['batch_id'])):
                pred_out = sess.run(model.post_output,
                                    feed_dict={inp: [selected_data['batch_inp'][i]],
                                               inp_mask: [selected_data['batch_inp_mask'][i]],
                                               inp_id: [now_id_no],
                                               decode_time_steps: 60})
                pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta["log_stftm_mean"]
                for j in range(1):
                    pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
                    siowav.write(os.path.join(wav_folder,
                                              "%s_audio%d_style_%d.wav" %
                                              (selected_data['batch_char_txt'][i][:3],
                                               selected_data['batch_id'][i], now_id_no)),
                                 hp.sample_rate, pred_audio)

            # Also synthesize the fixed test sentences with this reference id.
            pred_out = sess.run(model.post_output,
                                feed_dict={inp: txt_inp, inp_mask: txt_mask,
                                           inp_id: np.ones(len(txt), dtype=np.int32) * now_id_no,
                                           decode_time_steps: 60})
            pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta["log_stftm_mean"]
            for j in range(len(txt)):
                pred_audio, exp_spec = audio.invert_spectrogram(pred_out[j], 1.2)
                siowav.write(os.path.join(wav_folder,
                                          "test_audio%d_style_%d.wav" % (j, now_id_no)),
                             hp.sample_rate, pred_audio)
def main():
    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask', shape=(None,), dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps', shape=(), dtype=tf.int32)
        style_attention = tf.placeholder(name='style_att', shape=(None, 10), dtype=tf.float32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)
    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate

    model = Tacotron(inp, inp_mask, decode_time_steps, style_attention, hyper_params=hp)
    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)

    if not os.path.exists(generate_path):
        os.makedirs(generate_path)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver(max_to_keep=20)
        model.saver = saver
        ass_style_token = tf.placeholder(name='ass_style_token',
                                         shape=(1, hp.styles_kind, hp.style_dim),
                                         dtype=tf.float32)
        ass_opt = model.single_style_token.assign(ass_style_token)
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore; initializing all variables')

        dev_next_item = init_next_batch(tfrecord_dev_path, 600, 1)
        train_scalar_summary = model.get_scalar_summary('train')
        train_alpha_summary = model.get_alpha_summary('train', 2)
        random_num = random.randint(1, 10000000)

        for dev_i in range(dev_batches_per_epoch):
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, batch_char_txt = get_next_batch(sess, dev_next_item)
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, dev_meta)
            for var in batch_char_txt:
                print(var.decode())

            # Sweep over 72 predefined style-attention settings.
            for style_no in range(72):
                unique_style_attention = get_style_attention(style_no)
                pred_out = sess.run(model.post_output,
                                    feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                                               decode_time_steps: batch_mel_gtruth.shape[1] // hp.reduction_rate + 1,
                                               style_attention: unique_style_attention})
                pred_out = pred_out * dev_meta["log_stftm_std"] + dev_meta["log_stftm_mean"]
                for audio_i in range(12):
                    pred_audio, exp_spec = audio.invert_spectrogram(pred_out[audio_i], 1.2)
                    wav_folder = os.path.join(generate_path, "audio_%d" % audio_i)
                    if not os.path.exists(wav_folder):
                        os.makedirs(wav_folder)
                    siowav.write(os.path.join(wav_folder, "a%d_style_%d.wav" % (random_num, style_no)),
                                 hp.sample_rate, pred_audio)
            break  # only the first dev batch is used
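
# get_style_attention() above is defined elsewhere in the repo; the mapping it uses
# to produce 72 different settings is not shown in this section. The sketch below is
# only an assumption that illustrates its contract: return a (batch_size, 10) float32
# matrix of style-attention weights, one row per utterance in the batch (here simply
# a one-hot weight on a single token, with hp.batch_size assumed as the batch size).
def get_style_attention(style_no, batch_size=hp.batch_size):
    """Hypothetical helper: broadcast one style-attention row over the batch."""
    attention = np.zeros([batch_size, 10], dtype=np.float32)
    attention[:, style_no % 10] = 1.0
    return attention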
def main():
    with tf.variable_scope('data'):
        inp = tf.placeholder(name='inp', shape=(None, None), dtype=tf.int32)
        inp_mask = tf.placeholder(name='inp_mask', shape=(None,), dtype=tf.int32)
        decode_time_steps = tf.placeholder(name='decode_time_steps', shape=(), dtype=tf.int32)

    dev_meta_path = pkl_dev_path
    assert os.path.exists(dev_meta_path), \
        '[!] Dev meta does not exist! PATH: {}'.format(dev_meta_path)
    with open(dev_meta_path, 'rb') as f:
        dev_meta = pkl.load(f)
        dev_meta['reduction_rate'] = hp.reduction_rate

    model = Tacotron(inp, inp_mask, decode_time_steps, hyper_params=hp)
    dev_batches_per_epoch = math.ceil(len(dev_meta['key_lst']) / hp.batch_size)

    if not os.path.exists(generate_path):
        os.makedirs(generate_path)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        model.sess = sess
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        ckpt = tf.train.get_checkpoint_state(save_path)
        saver = tf.train.Saver()
        if ckpt:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(save_path, ckpt_name))
            print('restore path:', ckpt_name)
        else:
            print('no restore; initializing all variables')

        dev_next_item = init_next_batch(tfrecord_dev_path)
        with open(pkl_dev_path, 'rb') as f:
            dev_stats = pkl.load(f)

        for dev_i in range(dev_batches_per_epoch):
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = get_next_batch(sess, dev_next_item)
            batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth = post_next_batch(
                batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, dev_meta)
            pred_out, alpha_out = sess.run([model.post_output, model.alpha_output],
                                           feed_dict={inp: batch_inp, inp_mask: batch_inp_mask,
                                                      decode_time_steps: 100})

            # Denormalize the predicted spectrograms and invert them to waveforms.
            pred_out = pred_out * dev_stats["log_stftm_std"] + dev_stats["log_stftm_mean"]
            for audio_i in range(8):
                pred_audio, exp_spec = audio.invert_spectrogram(pred_out[audio_i], 1.2)
                siowav.write(os.path.join(generate_path, "random%d.wav" % audio_i),
                             hp.sample_rate, pred_audio)
            break  # only the first dev batch is used
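
# post_next_batch() is used by every script above but is defined elsewhere in the
# repo. Judging from the inverse transform applied to pred_out (x * log_stftm_std +
# log_stftm_mean) and from meta['reduction_rate'], it presumably normalizes the
# ground-truth spectrograms and pads the time axis to a multiple of the reduction
# rate. The sketch below is an assumption, not the repo's implementation; the mel
# statistics key names ('log_melspc_mean'/'log_melspc_std') are guesses.
def post_next_batch(batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth, meta):
    """Hypothetical helper: normalize targets and pad time to the reduction rate."""
    r = meta['reduction_rate']
    pad = (-batch_mel_gtruth.shape[1]) % r  # frames needed to reach a multiple of r
    if pad:
        batch_mel_gtruth = np.pad(batch_mel_gtruth, ((0, 0), (0, pad), (0, 0)), mode='constant')
        batch_spec_gtruth = np.pad(batch_spec_gtruth, ((0, 0), (0, pad), (0, 0)), mode='constant')
    batch_mel_gtruth = (batch_mel_gtruth - meta['log_melspc_mean']) / meta['log_melspc_std']
    batch_spec_gtruth = (batch_spec_gtruth - meta['log_stftm_mean']) / meta['log_stftm_std']
    return batch_inp, batch_inp_mask, batch_mel_gtruth, batch_spec_gtruth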