def main():
    logdir, ckpt = os.path.split(args.checkpoint)

    arch = tf.gfile.Glob(os.path.join(logdir, 'architecture*.json'))[0]  # should only be 1 file
    with open(arch) as fp:
        arch = json.load(fp)

    normalizer = Tanhize(
        xmax=np.fromfile('./etc/xmax.npf'),
        xmin=np.fromfile('./etc/xmin.npf'),
    )

    features = read_whole_features(args.file_pattern.format(args.src))

    x = normalizer.forward_process(features['sp'])
    x = nh_to_nchw(x)
    y_s = features['speaker']

    y_t_id = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0]], dtype=tf.int64)

    machine = MODEL(arch)

    z = machine.encode(x)
    x_t = machine.decode(z, y_t)  # NOTE: the API yields NHWC format
    x_t = tf.squeeze(x_t)
    x_t = normalizer.backward_process(x_t)

    # For sanity check (validation)
    x_s = machine.decode(z, y_s)
    x_s = tf.squeeze(x_s)
    x_s = normalizer.backward_process(x_s)

    f0_s = features['f0']
    f0_t = convert_f0(f0_s, args.src, args.trg)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir, ckpt=ckpt)
        while True:
            try:
                feat, f0, sp = sess.run(
                    [features, f0_t, x_t],
                    feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])})
                feat.update({'sp': sp, 'f0': f0})
                y = pw2wav(feat)
                oFilename = make_output_wav_name(output_dir, feat['filename'])
                sf.write(oFilename, y, FS)
            except tf.errors.OutOfRangeError:
                # The input pipeline raises OutOfRangeError once every file
                # has been consumed; stop converting then.
                break
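# `Tanhize` (defined elsewhere in this repo) is assumed to be a per-dimension
# min/max normalizer: `forward_process` squashes spectral features into a
# bounded range for a tanh-activated decoder and `backward_process` inverts it.
# A minimal NumPy sketch under that assumption -- the exact output range and
# clipping behaviour of the actual class are not confirmed here.
import numpy as np

class TanhizeSketch(object):
    def __init__(self, xmax, xmin):
        self.xmax = np.asarray(xmax, dtype=np.float32)
        self.xmin = np.asarray(xmin, dtype=np.float32)
        self.xscale = self.xmax - self.xmin

    def forward_process(self, x):
        # Map each dimension into [0, 1] (clipped), then into [-1, 1].
        x = np.clip((x - self.xmin) / self.xscale, 0., 1.)
        return x * 2. - 1.

    def backward_process(self, x):
        # Invert: [-1, 1] -> [0, 1] -> original feature range.
        return (x * .5 + .5) * self.xscale + self.xmin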
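# `convert_f0` (used above, defined elsewhere) is assumed to follow the common
# log-Gaussian normalized F0 transformation: match the source speaker's log-F0
# mean/std to the target speaker's, leaving unvoiced frames untouched.  A NumPy
# sketch under that assumption; the per-speaker statistics arguments are
# hypothetical and not necessarily how the repository stores them.
import numpy as np

def convert_f0_sketch(f0, src_stats, trg_stats):
    """f0 in Hz (0 for unvoiced frames); *_stats are (mean, std) of log-F0."""
    mu_s, std_s = src_stats
    mu_t, std_t = trg_stats
    f0 = np.asarray(f0, dtype=np.float64)
    out = np.zeros_like(f0)
    voiced = f0 > 0
    lf0 = (np.log(f0[voiced]) - mu_s) / std_s * std_t + mu_t  # match target log-F0 stats
    out[voiced] = np.exp(lf0)
    return out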
def main(unused_args):
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    arch = tf.gfile.Glob(join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch)

    net = VQVAE(arch)
    data = ByteWavWholeReader(
        speaker_list=txt2list(args.speaker_list),
        filenames=tf.gfile.Glob(args.file_pattern))

    ZH = net.encode(data.x, args.mode)

    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)
        sess.run(tf.global_variables_initializer())
        load(saver, sess, args.logdir, ckpt=args.ckpt)

        hist = np.zeros([arch['num_exemplar']], dtype=np.int64)
        counter = 1
        while True:
            try:
                z_ids = sess.run(ZH)
                print('\rNum of processed files: {:d}'.format(counter), end='')
                counter += 1
                for i in z_ids[0]:  # bz = 1
                    hist[i] += 1
            except tf.errors.OutOfRangeError:
                print()
                break

        with open('histogram.npf', 'wb') as fp:
            hist.tofile(fp)

        plt.figure(figsize=[10, 2])
        plt.plot(np.log10(hist + 1), '.')
        plt.xlim([0, arch['num_exemplar'] - 1])
        plt.ylabel('log-frequency')
        plt.xlabel('exemplar index')
        plt.savefig('histogram.png')
        plt.close()
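# Note on the dump above: `ndarray.tofile` writes raw bytes with no dtype or
# shape header, so the histogram must be read back with the matching dtype,
# e.g. (assuming NumPy is imported as `np`):
#
#     hist = np.fromfile('histogram.npf', dtype=np.int64)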
def main(unused_args=None):
    # args(sys.argv)
    if args.model is None:
        raise ValueError(
            '\n You MUST specify `model`.'
            '\n Use `python convert.py --help` to see applicable options.')

    module = import_module(args.module, package=None)
    MODEL = getattr(module, args.model)

    FS = 16000
    with open(args.speaker_list) as fp:
        SPEAKERS = [l.strip() for l in fp.readlines()]

    logdir, ckpt = os.path.split(args.checkpoint)
    if 'VAE' in logdir:
        _path_to_arch, _ = os.path.split(logdir)
    else:
        _path_to_arch = logdir
    arch = tf.gfile.Glob(os.path.join(_path_to_arch, 'architecture*.json'))
    if len(arch) != 1:
        print('WARNING: expected exactly one architecture file, found {:d}!'.format(len(arch)))
    arch = arch[0]
    with open(arch) as fp:
        arch = json.load(fp)

    normalizer = Tanhize(
        xmax=np.fromfile('./etc/{}_xmax.npf'.format(args.corpus_name)),
        xmin=np.fromfile('./etc/{}_xmin.npf'.format(args.corpus_name)),
    )

    features = read_whole_features(args.file_pattern.format(args.src))

    x = normalizer.forward_process(features['sp'])
    x = nh_to_nhwc(x)
    y_s = features['speaker']

    y_t_id = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0]], dtype=tf.int64)

    f0_t = features['f0']
    # f0_t = convert_f0(f0_s, args.src, args.trg)
    # f0_s_convert = tf.cast(f0_s, dtype=tf.int64)
    f0_t_convert = tf.cast(f0_t, dtype=tf.int64)

    machine = MODEL(arch, is_training=False)

    z = machine.encode(x)
    x_t = machine.decode(z, y_t, f0_t_convert)  # NOTE: the API yields NHWC format
    x_t = tf.squeeze(x_t)
    x_t = normalizer.backward_process(x_t)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir, ckpt=ckpt)
        print()
        while True:
            try:
                s_time = time.perf_counter()
                feat, f0, sp = sess.run(
                    [features, f0_t, x_t],
                    feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])})
                feat.update({'sp': sp, 'f0': f0})
                y = pw2wav(feat)
                oFilename = make_output_wav_name(output_dir, feat['filename'])
                print('\rProcessing {}'.format(oFilename), end='')
                e_time = time.perf_counter()
                print('\nTime_sp: {}\n'.format(e_time - s_time))
                sf.write(oFilename, y, FS)
            except tf.errors.OutOfRangeError:
                # All input files consumed.
                break
            except KeyboardInterrupt:
                break
        print()
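# `pw2wav` (used above, defined elsewhere) is assumed to resynthesize audio
# from WORLD vocoder features ('sp', 'ap', 'f0') carried in `feat`.  A minimal
# sketch using the real `pyworld` package; any log compression or energy
# normalization the repository applies to 'sp' before synthesis is not
# reproduced here.
import numpy as np
import pyworld as pw

def pw2wav_sketch(feat, fs=16000):
    """Resynthesize a waveform; sp/ap have shape (T, n_bins), f0 has shape (T,)."""
    sp = np.ascontiguousarray(feat['sp'], dtype=np.float64)
    ap = np.ascontiguousarray(feat['ap'], dtype=np.float64)
    f0 = np.ascontiguousarray(feat['f0'], dtype=np.float64)
    return pw.synthesize(f0, sp, ap, fs)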
def main(unused_args):
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    speaker_list = txt2list(args.speaker_list)
    arch = tf.gfile.Glob(os.path.join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch)

    net = VQVAE(arch)

    # They start roughly at the same position but end very differently (3 is longest).
    filenames = [
        'dataset/VCTK/tfr/p227/p227_363.tfr',
        # 'dataset/VCTK/tfr/p240/p240_341.tfr',
        # 'dataset/VCTK/tfr/p243/p243_359.tfr',
        'dataset/VCTK/tfr/p225/p225_001.tfr',
    ]
    data = ByteWavWholeReader(speaker_list, filenames)

    X = tf.placeholder(dtype=tf.int64, shape=[None, None])
    Y = tf.placeholder(dtype=tf.int64, shape=[None])
    ZH = net.encode(X, args.mode)
    XH = net.generate(X, ZH, Y)

    # XWAV = mu_law_decode(X)
    # XBIN = tf.contrib.ffmpeg.encode_audio(XWAV, 'wav', arch['fs'])

    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)

    logdir = get_default_logdir(args.logdir)
    tf.gfile.MkDir(logdir)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)

        results = []
        for _ in filenames:
            result = sess.run({'x': data.x, 'y': data.y})
            results.append(result)

        length_input = net.n_padding() + 1  # same as padding + 1
        ini = 15149 - length_input
        end = 42285

        # Zero-pad short utterances up to `end` samples, then crop all to [ini, end).
        for i in range(len(results)):
            x = results[i]['x']
            if x.shape[-1] < end:
                x = np.concatenate(
                    [x, x[0, 0] + np.zeros([1, end - x.shape[-1]])], -1)
            results[i]['x'] = x[:, ini:end]

        x_source = np.concatenate(
            [results[0]['x'], results[0]['x'], results[1]['x'], results[1]['x']], 0)
        B = x_source.shape[0]
        y_input = np.concatenate(
            [results[0]['y'], results[1]['y'], results[1]['y'], results[0]['y']], 0)

        length_target = x_source.shape[1] - length_input

        while True:
            sess.run(tf.global_variables_initializer())
            load(saver, sess, args.logdir, ckpt=args.ckpt)

            z_blend = sess.run(ZH, feed_dict={X: x_source})
            x_input = x_source[:, :length_input]
            z_input = z_blend[:, :length_input, :]

            # Autoregressive generation, one sample at a time.
            try:
                x_gen = np.zeros([B, length_target], dtype=np.int64)
                for i in range(length_target):
                    xh = sess.run(XH, feed_dict={X: x_input, ZH: z_input, Y: y_input})
                    z_input = z_blend[:, i + 1: i + 1 + length_input, :]
                    x_input[:, :-1] = x_input[:, 1:]
                    x_input[:, -1] = xh[:, -1]
                    x_gen[:, i] = xh[:, -1]
                    print('\rGenerating {:5d}/{:5d}... x={:3d}'.format(
                        i + 1, length_target, xh[0, -1]), end='', flush=True)
            except KeyboardInterrupt:
                print('Interrupted by the user.')
            finally:
                print()

            x_wav = mu_law_decode(x_gen)
            for i in range(x_wav.shape[0]):
                x_1ch = np.expand_dims(x_wav[i], -1)
                # x_bin = sess.run(XBIN, feed_dict={X: x_1ch})
                librosa.output.write_wav('testwav-{}.wav'.format(i), x_1ch, arch['fs'])
                # with open(os.path.join(logdir, 'testwav-{}.wav'.format(i)), 'wb') as fp:
                #     fp.write(x_bin)

            # Periodic generation: sleep, then reload the (possibly newer) checkpoint.
            if args.period > 0:
                try:
                    print('Sleep for a while')
                    sleep(args.period * 60)
                    logdir = get_default_logdir(args.logdir)
                    tf.gfile.MkDir(logdir)
                except KeyboardInterrupt:
                    print('Stop periodic gen.')
                    break
                finally:
                    print('all finished')
            else:
                break
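# `mu_law_decode` (used above, defined elsewhere) is assumed to invert the
# standard WaveNet-style 8-bit mu-law companding.  A NumPy sketch of that
# inverse transform; the channel count (256) is an assumption.
import numpy as np

def mu_law_decode_sketch(x, quantization_channels=256):
    """Map integer class ids in [0, channels) back to samples in [-1, 1]."""
    mu = quantization_channels - 1
    y = 2.0 * np.asarray(x, dtype=np.float64) / mu - 1.0      # ids -> [-1, 1]
    return np.sign(y) * ((1.0 + mu) ** np.abs(y) - 1.0) / mu  # inverse companding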
def main(unused_args=None):
    if args.model is None:
        raise ValueError(
            '\n You MUST specify `model`.'
            '\n Use `python convert.py --help` to see applicable options.')

    module = import_module(args.module_original, package=None)
    MODEL = getattr(module, args.model)

    FS = 16000
    with open(args.speaker_list) as fp:
        SPEAKERS = [l.strip() for l in fp.readlines()]

    # F0 model checkpoint and architecture.
    logdir_f0, ckpt_f0_cwt = os.path.split(args.checkpoint_f0_cwt)
    if 'VAE' in logdir_f0:
        _path_to_arch, _ = os.path.split(logdir_f0)
    else:
        _path_to_arch = logdir_f0
    arch_f0 = tf.gfile.Glob(os.path.join(_path_to_arch, 'architecture*.json'))
    if len(arch_f0) != 1:
        print('WARNING: expected exactly one architecture file, found {:d}!'.format(len(arch_f0)))
    arch_f0 = arch_f0[0]
    with open(arch_f0) as fp:
        arch_f0 = json.load(fp)

    features = read_whole_features(args.file_pattern.format(args.src))

    f0_cwt = features['lf0_cwt_norm']
    f0_cwt = nh_to_nhwc(f0_cwt)
    y_s_f0 = features['speaker']

    y_t_id_f0 = tf.placeholder(dtype=tf.int64, shape=[1])
    y_t_f0 = y_t_id_f0 * tf.ones(shape=[tf.shape(f0_cwt)[0]], dtype=tf.int64)

    if not os.path.isdir('./f0_results'):
        os.mkdir('./f0_results')

    # Convert F0 (CWT-decomposed, normalized log-F0).
    machine_f0 = MODEL(arch_f0, is_training=False)
    z_f0 = machine_f0.encode(f0_cwt)
    f0_cwt_t = machine_f0.decode(z_f0, y_t_f0)  # NOTE: the API yields NHWC format
    f0_cwt_t = tf.squeeze(f0_cwt_t)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir_f0, ckpt=ckpt_f0_cwt)
        print()
        while True:
            try:
                feat, lf0_cwt = sess.run(
                    [features, f0_cwt_t],
                    feed_dict={y_t_id_f0: np.asarray([SPEAKERS.index(args.trg)])})
                feat.update({'lf0_cwt': lf0_cwt})
                feats = dic2npy(feat)
                oFilename = make_output_bin_name(output_dir, feat['filename'])
                with open(join('./f0_results', '{}.bin'.format(oFilename)), 'wb') as fp:
                    fp.write(feats.tostring())
            except tf.errors.OutOfRangeError:
                # All input files consumed.
                break
            except KeyboardInterrupt:
                break
        print()
def main():
    '''
    Note:
        1. The input is rescaled to [-1, 1] (img_reader: rtype)
    '''
    dirs = validate_log_dirs(args)

    coord = tf.train.Coordinator()

    with open(args.architecture) as f:
        arch = json.load(f)

    imgs, info = img_reader(
        datadir=args.datadir,
        img_dims=arch['hwc'],
        batch_size=args.batch_size,
        rtype='tanh')

    machine = VAEGAN(arch, is_training=True)

    loss = machine.loss(imgs)
    xh = machine.sample(args.batch_size)

    x_interp = machine.interpolate(imgs[0], imgs[1], N_INTERP)

    opt_d, opt_g, opt_e = get_optimization_ops(loss, args, arch['mode'])

    # # ========== For embedding =============
    # h, w, c = arch['hwc']
    # img4em = tf.Variable(
    #     np.reshape(
    #         np.fromfile(SPRITE_NUMPY_FILE, np.float32),
    #         [N_VISUALIZE, h, w, c]),
    #     name='emb_input_img')
    # codes = machine.encode(img4em)
    # em_var = tf.Variable(
    #     tf.zeros((N_VISUALIZE, arch['z_dim'])),
    #     name='embeddings')
    # # ======================================

    writer = tf.train.SummaryWriter(dirs['logdir'])
    writer.add_graph(tf.get_default_graph())
    summary_op = tf.merge_all_summaries()

    with open(os.path.join(dirs['logdir'], args.architecture), 'w') as f:
        json.dump(arch, f)

    if args.gpu_cfg:
        with open(args.gpu_cfg) as f:
            cfg = json.load(f)
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg['per_process_gpu_memory_fraction'])
        session_conf = tf.ConfigProto(
            allow_soft_placement=cfg['allow_soft_placement'],
            log_device_placement=cfg['log_device_placement'],
            inter_op_parallelism_threads=cfg['inter_op_parallelism_threads'],
            intra_op_parallelism_threads=cfg['intra_op_parallelism_threads'],
            gpu_options=gpu_options)
        sess = tf.Session(config=session_conf)
    else:
        sess = tf.Session()

    init = tf.global_variables_initializer()
    sess.run(init)

    saver = tf.train.Saver()  # tf.global_variables()

    try:
        saved_global_step = load(saver, sess, dirs['restore_from'])
        if saved_global_step is None:
            saved_global_step = -1
    except:
        print("Something went wrong while restoring checkpoints!")
        raise

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # # ========== For embedding =============
    # ass_op = tf.assign(em_var, codes['mu'], name='X/em_var')
    # config = projector.ProjectorConfig()
    # embedding = config.embeddings.add()
    # embedding.tensor_name = em_var.name
    # print(em_var.name, em_var.get_shape())
    # embedding.sprite.image_path = PATH_TO_SPRITE_IMAGE
    # embedding.sprite.single_image_dim.extend([w, h])
    # embedding.metadata_path = PATH_TO_LABEL
    # projector.visualize_embeddings(writer, config)
    # # =====================================

    # ========== Actual training loop ==========
    try:
        n_iter_per_epoch = info['n_files'] // args.batch_size
        time_i = time.time()
        step = 0
        for ep in range(args.n_epoch):
            for it in range(n_iter_per_epoch):
                _, l_df, l_dr = sess.run([opt_d, loss['D_fake'], loss['D_real']])

                # Update G twice
                _, l_g = sess.run([opt_g, loss['G_fake']])
                _, l_g = sess.run([opt_g, loss['G_fake']])

                if arch['mode'] == 'VAE-GAN':
                    _, l_e, l_dis = sess.run([opt_e, loss['KL(z)'], loss['Dis']])

                # Message
                msg = 'Epoch [{:3d}/{:3d}] '.format(ep + 1, args.n_epoch) \
                    + '[{:4d}/{:4d}] '.format(it + 1, n_iter_per_epoch) \
                    + 'd_loss={:6.3f}+{:6.3f}, '.format(l_df, l_dr) \
                    + 'g_loss={:5.2f}, '.format(l_g)
                if arch['mode'] == 'VAE-GAN':
                    msg += 'KLD={:6.3f}, DIS={:6.3f}, '.format(l_e, l_dis)
                msg += 'T={:.2f}'.format(time.time() - time_i)
                print(msg)
                # writer.add_summary(summary, step)

                # Demo/Output
                if it % (n_iter_per_epoch // 1) == 0:
                    summary = sess.run(summary_op)
                    writer.add_summary(summary, step)

                    if arch['mode'] == 'VAE-GAN':
                        visualize_interpolation(
                            sess, x_interp,
                            filename=os.path.join(
                                dirs['logdir'],
                                'test-Ep{:03d}-It{:04d}.png'.format(ep, it)))
                        # sess.run(ass_op)

                    visualize_random_samples(
                        sess, xh,
                        filename=os.path.join(
                            dirs['logdir'],
                            'test-Ep{:03d}-It{:04d}-dc.png'.format(ep, it)))

                    save(saver, sess, dirs['logdir'], step)
                step += 1

    except KeyboardInterrupt:
        print()

    finally:
        save(saver, sess, dirs['logdir'], step)
        coord.request_stop()
        coord.join(threads)
def train(self, data):
    hyperp = self.arch['training']

    loss = self.loss(data.x, data.y)
    opt = self._optimize(loss)

    Z = self._Enc(self._D2A(data.x))
    update_encoding = tf.assign(self.encodings, self.encoding_placeholder)

    K, D = self.arch['num_exemplar'], self.arch['dim_exemplar']
    z_emp = tf.placeholder(tf.float32, [K, D], 'z_emp')
    init_z_emb = tf.assign(self.z_emb, z_emp)

    ema = opt['ema']
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))

    scaffold = tf.train.Scaffold(
        local_init_op=tf.group(
            tf.local_variables_initializer(),
            data.iterator.initializer,
            tf.tables_initializer()))

    with tf.train.MonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=self.arch['logdir'],
            save_checkpoint_secs=360,
            save_summaries_secs=120,
            config=sess_config,
    ) as sess:
        dummy_path = self._make_dummy_tsv()
        visualize_embeddings(
            logdir=self.arch['logdir'],
            var_list=[self.y_emb, self.encodings],
            tsv_list=['etc/speakers_label.tsv', dummy_path],
        )

        if self.arch['restore_from']:
            load(saver, sess, self.arch['restore_from'], ckpt=self.arch['ckpt'])

        # ========== Initialize exemplars with Enc output ==========
        multiplier = 100
        exe = np.zeros([0, D])
        while exe.shape[0] < K * multiplier:
            z = sess.run(Z)
            exe = np.concatenate([exe, np.reshape(z, [-1, D])], 0)
        exe = np.unique(exe, axis=0)  # `np.unique` returns a new array; keep it
        np.random.shuffle(exe)
        sess.run(init_z_emb, feed_dict={z_emp: exe[:K, :]})

        # ========== Main training loop ==========
        maxIter = hyperp['maxIter']
        for it in range(maxIter):
            sess.run(opt['trn'])

            if it % hyperp['refresh_freq'] == 1:
                self._get_and_update_encodings(sess, Z, data.y, update_encoding)

            fetches = {'l': loss['reconst']}
            results = sess.run(fetches)
            print('\rIter {:5d}: loss = {:.4e}'.format(it, results['l']), end='')
        print()