def main(args):
    """Build and train the spatial audio generation network.

    Restores hyper-parameters from a previous run when --resume is set,
    constructs the data feeder, model, losses and optimizer, then runs the
    training loop with periodic logging and checkpointing.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    print('\n' + '=' * 30 + ' ARGUMENTS ' + '=' * 30)
    sys.stdout.flush()
    if args.resume:
        # When resuming, the architecture/sampling params must match the
        # saved run, so they override whatever was passed on the CLI.
        params = myutils.load_params(args.model_dir)
        args.encoders = params.encoders
        args.separation = params.separation
        args.ambi_order = params.ambi_order
        args.audio_rate = params.audio_rate
        args.video_rate = params.video_rate
        args.context = params.context
        args.sample_dur = params.sample_dur
    else:
        myutils.save_params(args)
    myutils.print_params(args)

    # Feeder
    min_t = min([args.context, args.sample_dur, 1. / args.video_rate])
    args.video_rate = int(1. / min_t)
    with tf.device('/cpu:0'), tf.variable_scope('feeder'):
        feeder = Feeder(args.db_dir,
                        subset_fn=args.subset_fn,
                        ambi_order=args.ambi_order,
                        audio_rate=args.audio_rate,
                        video_rate=args.video_rate,
                        context=args.context,
                        duration=args.sample_dur,
                        return_video=VIDEO in args.encoders,
                        img_prep=myutils.img_prep_fcn(),
                        return_flow=FLOW in args.encoders,
                        frame_size=(224, 448),
                        queue_size=args.batch_size * 5,
                        n_threads=4,
                        for_eval=False)
        batches = feeder.dequeue(args.batch_size)
        ambix_batch = batches['ambix']
        video_batch = batches['video'] if 'video' in args.encoders else None
        flow_batch = batches['flow'] if 'flow' in args.encoders else None
        audio_mask_batch = batches['audio_mask']

        t = int(args.audio_rate * args.sample_dur)
        # BUG FIX: floor division keeps the slice index an int under Python 3
        # (plain "/" would yield a float and break tensor slicing).
        ss = int(args.audio_rate * args.context) // 2
        n_chann_in = args.ambi_order ** 2
        audio_input = ambix_batch[:, :, :n_chann_in]
        audio_target = ambix_batch[:, ss:ss + t, n_chann_in:]

    print('\n' + '=' * 20 + ' MODEL ' + '=' * 20)
    sys.stdout.flush()
    with tf.device('/gpu:0'):
        # Model
        num_sep = args.num_sep_tracks if args.separation != NO_SEPARATION else 1
        params = SptAudioGenParams(sep_num_tracks=num_sep,
                                   ctx_feats_fc_units=args.context_units,
                                   loc_fc_units=args.loc_units,
                                   sep_freq_mask_fc_units=args.freq_mask_units,
                                   sep_fft_window=args.fft_window)
        model = SptAudioGen(ambi_order=args.ambi_order,
                            audio_rate=args.audio_rate,
                            video_rate=args.video_rate,
                            context=args.context,
                            sample_duration=args.sample_dur,
                            encoders=args.encoders,
                            separation=args.separation,
                            params=params)
        ambix_pred = model.inference_ops(audio=audio_input,
                                         video=video_batch,
                                         flow=flow_batch,
                                         is_training=True)

        # Losses and evaluation metrics
        print(audio_mask_batch)
        with tf.variable_scope('metrics'):
            metrics_t, _, _, _, _ = model.evaluation_ops(
                ambix_pred, audio_target, audio_input[:, ss:ss + t],
                mask_channels=audio_mask_batch[:, args.ambi_order ** 2:])

        step_t = tf.Variable(0, trainable=False, name='step')
        with tf.variable_scope('loss'):
            loss_t = model.loss_ops(metrics_t, step_t)
            losses_t = {l: loss_t[l] for l in loss_t}
            regularizers = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            if regularizers and 'regularization' in losses_t:
                losses_t['regularization'] = tf.add_n(regularizers)
            # list(...) so tf.add_n works on Python 3 dict views too.
            losses_t['total_loss'] = tf.add_n(list(losses_t.values()))

        # Optimizer
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # BUG FIX: the original used `with A and B:` which evaluates the
        # boolean expression and enters ONLY tf.control_dependencies —
        # the 'optimization' variable scope was silently dropped. A comma
        # enters both context managers.
        with tf.variable_scope('optimization'), \
                tf.control_dependencies(update_ops):
            train_op, lr_t = myutils.optimize(
                losses_t['total_loss'], step_t, args)

        # Initialization
        rest_ops = model.init_ops
        init_op = [tf.global_variables_initializer(),
                   tf.local_variables_initializer()]
        saver = tf.train.Saver(max_to_keep=1)

        # Tensorboard
        metrics_t['training_loss'] = losses_t['total_loss']
        metrics_t['queue'] = feeder.queue_state
        metrics_t['lr'] = lr_t
        myutils.add_scalar_summaries(list(metrics_t.values()),
                                     list(metrics_t.keys()))
        summary_ops = tf.summary.merge(
            tf.get_collection(tf.GraphKeys.SUMMARIES))
        summary_writer = tf.summary.FileWriter(args.model_dir, flush_secs=30)
        # summary_writer.add_graph(tf.get_default_graph())

    print('\n' + '=' * 30 + ' VARIABLES ' + '=' * 30)
    model_vars = tf.global_variables()
    import numpy as np
    for v in model_vars:
        if 'Adam' in v.op.name.split('/')[-1]:
            continue
        print(' * {:50s} | {:20s} | {:7s} | {:10s}'.format(
            v.op.name, str(v.get_shape()), str(np.prod(v.get_shape())),
            str(v.dtype)))

    print('\n' + '=' * 30 + ' TRAINING ' + '=' * 30)
    sys.stdout.flush()
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        print('Initializing network...')
        sess.run(init_op)
        if rest_ops:
            sess.run(rest_ops)

        print('Initializing data feeders...')
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(sess, coord)
        feeder.start_threads(sess)
        # Freeze the graph so accidental op creation in the loop fails fast.
        tf.get_default_graph().finalize()

        # Restore model
        init_step = 0
        if args.resume:
            print('Restoring previously saved model...')
            ckpt = tf.train.latest_checkpoint(args.model_dir)
            if ckpt:
                saver.restore(sess, ckpt)
                init_step = sess.run(step_t)

        try:
            print('Start training...')
            duration = deque(maxlen=20)
            for step in range(init_step, args.n_iters):
                start_time = time.time()
                if step % 20 != 0:
                    sess.run(train_op)
                else:
                    # Every 20 steps also fetch summaries and all metrics.
                    outs = sess.run(
                        [train_op, summary_ops, losses_t['total_loss']] +
                        list(losses_t.values()) + list(metrics_t.values()))
                    if math.isnan(outs[2]):
                        raise ValueError(
                            'Training produced a NaN metric or loss.')
                duration.append(time.time() - start_time)

                if step % 20 == 0:
                    # Print progress to terminal and tensorboard
                    myutils.print_stats(
                        outs[3:],
                        list(losses_t.keys()) + list(metrics_t.keys()),
                        args.batch_size, duration, step, tag='TRAIN')
                    summary_writer.add_summary(outs[1], step)
                    sys.stdout.flush()

                if step % 5000 == 0 and step != 0:
                    # Save checkpoint
                    saver.save(sess, args.model_dir + '/model.ckpt',
                               global_step=step_t)
                    print('=' * 60 + '\nCheckpoint saved\n' + '=' * 60)
        except Exception as e:
            print(str(e))
        finally:
            # NOTE(review): the original finally-suite is truncated in this
            # chunk (likely feeder/coordinator shutdown) — kept as a no-op
            # so the handler stays syntactically valid; restore the original
            # cleanup when the full source is available.
            pass
class WaveGlow():
    """TF-1.x WaveGlow-style flow vocoder.

    Builds both the training graph (forward flow, negative log-likelihood
    losses) and the inference graph (reverse flow from Gaussian noise),
    and provides train / save / load / sample / infer entry points.
    Relies on module-level ``args`` and the project helpers ``mydeconv1d``,
    ``conv_afclayer``, ``Feeder``, ``buff``, ``writewav`` and ``mel_iter``.
    """

    def __init__(self, sess):
        self.sess = sess
        self.build_model()

    def build_model(self):
        """Construct placeholders, the forward flow, losses and the
        reverse (sampling) path."""
        self.global_step = tf.get_variable('global_step',
                                           initializer=0,
                                           dtype=tf.int32,
                                           trainable=False)
        # Mel conditions: (batch, n_mel, frames); wavs are pre-squeezed into
        # (batch, squeeze_size, time / squeeze_size) groups.
        self.mels = tf.placeholder(tf.float32, [None, args.n_mel, None])
        self.wavs = tf.placeholder(
            tf.float32,
            [None, args.squeeze_size,
             args.wav_time_step // args.squeeze_size])
        self.placeholders = [self.mels, self.wavs]

        # Upsample mels to the audio rate, then regroup channels so the
        # condition tensor matches the squeezed waveform layout.
        self.conditions = mydeconv1d(self.mels,
                                     args.n_mel,
                                     filter_size=args.step_per_mel * 4,
                                     stride=args.step_per_mel,
                                     scope='upsample',
                                     reuse=False)
        self.conditions = tf.transpose(self.conditions, perm=[0, 2, 1])
        self.conditions = tf.reshape(self.conditions, [
            -1,
            tf.shape(self.conditions)[1] // args.squeeze_size,
            args.squeeze_size * args.n_mel
        ])
        self.conditions = tf.transpose(self.conditions, perm=[0, 2, 1])

        # Forward pass through the affine-coupling flow stack, emitting
        # "early outputs" every few flows (multi-scale, as in WaveGlow).
        self.z = []
        self.layer = self.wavs
        self.logdets, self.logss = 0, 0
        for i in range(args.n_flows):
            self.layer, logs, logdet = conv_afclayer(self.layer,
                                                     self.conditions,
                                                     reverse=False,
                                                     scope='afc_' + str(i + 1),
                                                     reuse=False)
            self.logdets += logdet
            self.logss += logs
            if (i + 1) % args.early_output_every == 0 \
                    and (i + 1) != args.n_flows:
                self.z.append(self.layer[:, :args.early_output_size])
                self.layer = self.layer[:, args.early_output_size:]
        self.z.append(self.layer)
        self.z = tf.concat(self.z, axis=1)

        # Negative log-likelihood, normalized per element.
        total_size = tf.cast(tf.size(self.z), tf.float32)
        self.logdet_loss = -tf.reduce_sum(self.logdets) / total_size
        self.logs_loss = -tf.reduce_sum(self.logss) / total_size
        self.prior_loss = tf.reduce_sum(
            self.z ** 2 / (2 * args.sigma ** 2)) / total_size
        self.loss = self.prior_loss + self.logs_loss + self.logdet_loss

        self.t_vars = tf.trainable_variables()
        # print ([v.name for v in self.t_vars])
        self.numpara = 0
        for var in self.t_vars:
            varshape = var.get_shape().as_list()
            self.numpara += np.prod(varshape)
        print("Total number of parameters: %r" % (self.numpara))

        ########INFERENCE#########
        # Run the flow in reverse, starting from truncated-normal noise and
        # re-injecting noise for each early-output split.
        self.output = tf.truncated_normal([
            tf.shape(self.conditions)[0], args.output_remain,
            tf.shape(self.conditions)[2]
        ], dtype=tf.float32)
        for i in reversed(range(args.n_flows)):
            if (i + 1) % args.early_output_every == 0 \
                    and (i + 1) != args.n_flows:
                self.newz = tf.truncated_normal([
                    tf.shape(self.conditions)[0], args.early_output_size,
                    tf.shape(self.conditions)[2]
                ], stddev=args.infer_sigma)
                self.output = tf.concat([self.newz, self.output], axis=1)
            self.output = conv_afclayer(self.output,
                                        self.conditions,
                                        reverse=True,
                                        scope='afc_' + str(i + 1),
                                        reuse=tf.AUTO_REUSE)
        ########INFERENCE#########

    def train(self):
        """Run the training loop: optimizer setup, checkpoint restore,
        summaries, and the epoch/batch iteration."""
        coord = tf.train.Coordinator()
        self.data_generator = Feeder(coord, args)
        self.validation_data_generator = self.data_generator

        # Decayed LR, clamped to [1e-7, 1e-3].
        self.lr = tf.train.exponential_decay(args.lr, self.global_step,
                                             args.lr_decay_steps,
                                             args.lr_decay_rate)
        self.lr = tf.minimum(tf.maximum(self.lr, 0.0000001), 0.001)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.grad = self.optimizer.compute_gradients(self.loss,
                                                     var_list=self.t_vars)
        self.op = self.optimizer.apply_gradients(
            self.grad, global_step=self.global_step)

        varset = list(set(tf.global_variables()) | set(tf.local_variables()))
        self.saver = tf.train.Saver(var_list=varset, max_to_keep=5)
        num_batch = self.data_generator.n_examples // args.batch_size

        # Try to restore; fall back to fresh initialization.
        do_initialize = True
        if args.loading_path:
            if self.load():
                start_epoch = self.global_step.eval() // num_batch
                do_initialize = False
            else:
                # BUG FIX: this message was a string literal broken across a
                # physical newline in the original (a syntax error).
                print("Error Loading Model! Training From Initial State...")
        if do_initialize:
            init_op = tf.global_variables_initializer()
            start_epoch = 0
            self.sess.run(init_op)

        self.writer = tf.summary.FileWriter(args.summary_dir, None)
        with tf.name_scope("summaries"):
            self.s_logdet_loss = tf.summary.scalar('logdet_loss',
                                                   self.logdet_loss)
            self.s_logs_loss = tf.summary.scalar('logs_loss', self.logs_loss)
            self.s_prior_loss = tf.summary.scalar('prior_loss',
                                                  self.prior_loss)
            self.s_loss = tf.summary.scalar('total_loss', self.loss)
            self.merged = tf.summary.merge([
                self.s_logdet_loss, self.s_logs_loss, self.s_prior_loss,
                self.s_loss
            ])

        # self.procs = self.data_generator.start_enqueue()
        # self.val_procs = self.validation_data_generator.start_enqueue()
        # self.procs += self.val_procs
        self.data_generator.start(self.sess)
        self.sample(0)
        try:
            for epoch in range(start_epoch, args.epoch):
                clr = self.sess.run(self.lr)
                print("Current learning rate: %.6e" % clr)
                loss_names = [
                    "Total Loss", "LogS Loss", "LogDet Loss", "Prior Loss"
                ]
                buffers = buff(loss_names)
                for batch in tqdm(range(num_batch)):
                    input_data = self.data_generator.dequeue()
                    feed_dict = dict(zip(self.placeholders, input_data))
                    _, loss, logs_loss, logdet_loss, prior_loss, summary, \
                        step = self.sess.run(
                            [
                                self.op, self.loss, self.logs_loss,
                                self.logdet_loss, self.prior_loss,
                                self.merged, self.global_step
                            ],
                            feed_dict=feed_dict)
                    self.gate_add_summary(summary, step)
                    buffers.put([loss, logs_loss, logdet_loss, prior_loss],
                                [0, 1, 2, 3])
                    if (batch + 1) % args.display_step == 0:
                        buffers.printout([epoch + 1, batch + 1, num_batch])
                if (epoch + 1) % args.saving_epoch == 0 and args.saving_path:
                    try:
                        self.save(epoch + 1)
                    except Exception:
                        print("Failed saving model, maybe no space left...")
                        traceback.print_exc()
                if (epoch + 1) % args.sample_epoch == 0 and args.sampling_path:
                    self.sample(epoch + 1)
        except KeyboardInterrupt:
            print("KeyboardInterrupt")
        except Exception:
            traceback.print_exc()
        finally:
            # Feeder processes used to be terminated here:
            # for x in self.procs:
            #     x.terminate()
            pass

    def save(self, epoch):
        """Checkpoint the session under args.saving_path."""
        name = 'Model_Epoch_' + str(epoch)
        saving_path = os.path.join(args.saving_path, name)
        print("Saving Model to %r" % saving_path)
        step = self.sess.run(self.global_step)
        self.saver.save(self.sess, saving_path, global_step=step)

    def load(self):
        """Restore the latest checkpoint from args.loading_path.

        Returns:
            True on success, False when no checkpoint was found.
        """
        ckpt = tf.train.get_checkpoint_state(args.loading_path)
        if ckpt and ckpt.model_checkpoint_path:
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            print("Loading Model From %r" %
                  os.path.join(args.loading_path, ckpt_name))
            self.saver.restore(self.sess,
                               os.path.join(args.loading_path, ckpt_name))
            return True
        return False

    def sample(self, epoch):
        """Synthesize args.sample_num waveforms and write them as WAVs."""
        print("Sampling to %r" % args.sampling_path)
        try:
            for i in tqdm(range(args.sample_num)):
                name = 'Epoch_%r-%r.wav' % (epoch, i + 1)
                outpath = os.path.join(args.sampling_path, name)
                print('Sampling to %r ...' % outpath)
                mels = self.validation_data_generator.dequeue()
                output = self.sess.run(self.output,
                                       feed_dict={self.mels: [mels]})
                output = np.transpose(output[0])
                output = np.reshape(output, [-1])
                writewav(outpath, output)
        except KeyboardInterrupt:
            print("KeyboardInterrupt")
        except Exception:
            # BUG FIX: the original except-body was only a docstring, which
            # silently swallowed every sampling failure; at least report it.
            traceback.print_exc()

    def infer(self):
        """Run offline inference over mel files in args.infer_mel_dir."""
        print("Performing Inference from %r" % args.infer_mel_dir)
        self.saver = tf.train.Saver()
        try:
            if self.load():
                step = self.global_step.eval()
            else:
                print('Error loading model at inference state!')
                raise RuntimeError('Failed to load model checkpoint.')
            i = 0
            for mels in mel_iter(args.infer_mel_dir, size=1):
                outputs = self.sess.run(self.output,
                                        feed_dict={self.mels: mels})
                for output in outputs:
                    name = 'Infer_Step_%r-%r.wav' % (step, i + 1)
                    outpath = os.path.join(args.infer_path, name)
                    print('Synthesizing to %r ...' % outpath)
                    output = np.transpose(output)
                    output = np.reshape(output, [-1])
                    writewav(outpath, output)
                    i += 1
                # Cap the number of synthesized files.
                if i > 32:
                    break
        except KeyboardInterrupt:
            print("KeyboardInterrupt")

    def gate_add_summary(self, summary, step):
        """Best-effort summary write; never let logging kill training."""
        try:
            self.writer.add_summary(summary, step)
        except Exception:
            print("Failed adding summary, maybe no space left...")
def main(args):
    """Evaluate a trained spatial-audio model and write per-sample metrics.

    Loads the training hyper-parameters from args.model_dir, rebuilds the
    model graph, restores the latest checkpoint, then streams the eval set
    computing STFT/LSD/MSE/SNR/EMD-style metrics, periodically dumping a
    detailed per-sample report to eval-detailed.txt.
    """
    eval_fn = os.path.join(args.model_dir, 'eval-detailed.txt')
    assert os.path.exists(args.model_dir), 'Model dir does not exist.'
    assert args.overwrite or not os.path.exists(eval_fn), \
        'Evaluation file already exists.'
    os.environ["CUDA_VISIBLE_DEVICES"] = "%d" % args.gpu

    # BUG FIX: this function used Python-2-only constructs (print
    # statements, dict.iteritems(), true-division slice index) while the
    # rest of the file uses print(); rewritten as Py2/Py3-compatible code.
    print('\n' + '=' * 30 + ' ARGUMENTS ' + '=' * 30)
    params = myutils.load_params(args.model_dir)
    for k, v in params.__dict__.items():
        print('TRAIN | {}: {}'.format(k, v))
    for k, v in args.__dict__.items():
        print('EVAL | {}: {}'.format(k, v))
    sys.stdout.flush()

    # Evaluate on short chunks to bound EMD cost.
    DURATION = 0.1
    BATCH_SIZE = 16
    with tf.device('/cpu:0'), tf.variable_scope('feeder'):
        feeder = Feeder(params.db_dir,
                        subset_fn=args.subset_fn,
                        ambi_order=params.ambi_order,
                        audio_rate=params.audio_rate,
                        video_rate=params.video_rate,
                        context=params.context,
                        duration=DURATION,
                        return_video=VIDEO in params.encoders,
                        img_prep=myutils.img_prep_fcn(),
                        return_flow=FLOW in params.encoders,
                        frame_size=(224, 448),
                        queue_size=BATCH_SIZE * 5,
                        n_threads=4,
                        for_eval=True)
        batches = feeder.dequeue(BATCH_SIZE)
        ambix_batch = batches['ambix']
        video_batch = batches['video'] if VIDEO in params.encoders else None
        flow_batch = batches['flow'] if FLOW in params.encoders else None
        audio_mask_batch = batches['audio_mask']

        # Floor division keeps the slice index an int under Python 3.
        ss = int(params.audio_rate * params.context) // 2
        t = int(params.audio_rate * DURATION)
        audio_input = ambix_batch[:, :, :params.ambi_order ** 2]
        audio_target = ambix_batch[:, ss:ss + t, params.ambi_order ** 2:]

    print('\n' + '=' * 20 + ' MODEL ' + '=' * 20)
    sys.stdout.flush()
    with tf.device('/gpu:0'):
        # Model
        num_sep = params.num_sep_tracks \
            if params.separation != NO_SEPARATION else 1
        net_params = SptAudioGenParams(
            sep_num_tracks=num_sep,
            ctx_feats_fc_units=params.context_units,
            loc_fc_units=params.loc_units,
            sep_freq_mask_fc_units=params.freq_mask_units,
            sep_fft_window=params.fft_window)
        model = SptAudioGen(ambi_order=params.ambi_order,
                            audio_rate=params.audio_rate,
                            video_rate=params.video_rate,
                            context=params.context,
                            sample_duration=DURATION,
                            encoders=params.encoders,
                            separation=params.separation,
                            params=net_params)

        # Inference
        pred_t = model.inference_ops(audio=audio_input,
                                     video=video_batch,
                                     flow=flow_batch,
                                     is_training=False)

        # Losses and evaluation metrics
        with tf.variable_scope('metrics'):
            w_t = audio_input[:, ss:ss + t]
            _, stft_dist_ps, lsd_ps, mse_ps, snr_ps = model.evaluation_ops(
                pred_t, audio_target, w_t,
                mask_channels=audio_mask_batch[:, params.ambi_order ** 2:])

        # Loader (metric-scope variables are not part of the checkpoint)
        vars2save = [v for v in tf.global_variables()
                     if not v.op.name.startswith('metrics')]
        saver = tf.train.Saver(vars2save)

    print('\n' + '=' * 30 + ' VARIABLES ' + '=' * 30)
    model_vars = tf.global_variables()
    import numpy as np
    for v in model_vars:
        if 'Adam' in v.op.name.split('/')[-1]:
            continue
        print(' * {:50s} | {:20s} | {:7s} | {:10s}'.format(
            v.op.name, str(v.get_shape()), str(np.prod(v.get_shape())),
            str(v.dtype)))

    print('\n' + '=' * 30 + ' EVALUATION ' + '=' * 30)
    sys.stdout.flush()
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        print('Loading model...')
        sess.run(model.init_ops)
        saver.restore(sess, tf.train.latest_checkpoint(args.model_dir))

        print('Initializing data feeders...')
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(sess, coord)
        feeder.start_threads(sess)

        all_metrics = [
            'amplitude/predicted', 'amplitude/gt', 'mse/avg', 'mse/X',
            'mse/Y', 'mse/Z', 'stft/avg', 'stft/X', 'stft/Y', 'stft/Z',
            'lsd/avg', 'lsd/X', 'lsd/Y', 'lsd/Z', 'mel_lsd/avg', 'mel_lsd/X',
            'mel_lsd/Y', 'mel_lsd/Z', 'snr/avg', 'snr/X', 'snr/Y', 'snr/Z',
            'env_mse/avg', 'env_mse/X', 'env_mse/Y', 'env_mse/Z', 'emd/dir',
            'emd/dir2'
        ]
        metrics = OrderedDict([(key, []) for key in all_metrics])
        sample_ids = []
        telapsed = deque(maxlen=20)

        print('Start evaluation...')
        it = -1
        # run_options = tf.RunOptions(timeout_in_ms=60*1000)
        while True:
            it += 1
            if feeder.done(sess):
                break
            start_time = time.time()
            outs = sess.run([
                batches['id'], audio_mask_batch, w_t, audio_target, pred_t,
                stft_dist_ps, lsd_ps, mse_ps, snr_ps
            ])
            video_id, layout, mono, gt, pred = outs[:5]
            # Prepend the mono channel and mask out inactive channels.
            gt_m = np.concatenate((mono, gt), axis=2) \
                * layout[:, np.newaxis, :]
            pred_m = np.concatenate((mono, pred), axis=2) \
                * layout[:, np.newaxis, :]
            stft_dist, lsd, mse, snr = outs[5:]

            _env_time = 0.
            _emd_time = 0.
            _pow_time = 0.
            _lsd_time = 0.
            # NOTE(review): assumes every dequeued batch holds exactly
            # BATCH_SIZE samples — confirm the feeder pads the last batch.
            for smp in range(BATCH_SIZE):
                metrics['stft/avg'].append(np.mean(stft_dist[smp]))
                for i, ch in zip(range(3), 'YZX'):
                    metrics['stft/' + ch].append(stft_dist[smp, i])

                metrics['lsd/avg'].append(np.mean(lsd[smp]))
                for i, ch in zip(range(3), 'YZX'):
                    metrics['lsd/' + ch].append(lsd[smp, i])

                metrics['mse/avg'].append(np.mean(mse[smp]))
                for i, ch in zip(range(3), 'YZX'):
                    metrics['mse/' + ch].append(mse[smp, i])

                metrics['snr/avg'].append(np.nanmean(snr[smp]))
                for i, ch in zip(range(3), 'YZX'):
                    metrics['snr/' + ch].append(snr[smp, i])

                # Compute Mel LSD distance
                _t = time.time()
                mel_lsd = myutils.compute_lsd_dist(pred[smp], gt[smp],
                                                   params.audio_rate)
                metrics['mel_lsd/avg'].append(np.mean(mel_lsd))
                for i, ch in zip(range(3), 'YZX'):
                    metrics['mel_lsd/' + ch].append(mel_lsd[i])
                _lsd_time += (time.time() - _t)

                # Compute envelope distances
                _t = time.time()
                env_mse = myutils.compute_envelope_dist(pred[smp], gt[smp])
                metrics['env_mse/avg'].append(np.mean(env_mse))
                for i, ch in zip(range(3), 'YZX'):
                    metrics['env_mse/' + ch].append(env_mse[i])
                _env_time += (time.time() - _t)

                # Compute EMD (for speed, only over first 0.1s of every 1s)
                _t = time.time()
                emd_dir, emd_dir2 = ambix_emd(pred_m[smp], gt_m[smp],
                                              model.snd_rate, ang_res=30)
                metrics['emd/dir'].append(emd_dir)
                metrics['emd/dir2'].append(emd_dir2)
                _emd_time += (time.time() - _t)

                # Compute chunk power
                _t = time.time()
                metrics['amplitude/gt'].append(np.abs(gt[smp]).max())
                metrics['amplitude/predicted'].append(
                    np.abs(pred[smp]).max())
                _pow_time += (time.time() - _t)

                sample_ids.append(video_id[smp])
            telapsed.append(time.time() - start_time)
            # print('\nTotal:', telapsed[-1])
            # print('Env:', _env_time)
            # print('LSD:', _lsd_time)
            # print('EMD:', _emd_time)
            # print('POW:', _pow_time)

            if it % 100 == 0:
                # Store evaluation metrics
                with open(eval_fn, 'w') as f:
                    f.write('SampleID | {}\n'.format(
                        ' '.join(metrics.keys())))
                    for smp in range(len(sample_ids)):
                        f.write('{} | {}\n'.format(
                            sample_ids[smp],
                            ' '.join([str(metrics[key][smp])
                                      for key in metrics])))
            if it % 5 == 0:
                stats = OrderedDict([(m, np.mean(metrics[m]))
                                     for m in all_metrics])
                myutils.print_stats(list(stats.values()), list(stats.keys()),
                                    BATCH_SIZE, telapsed, it, tag='EVAL')
                sys.stdout.flush()

        # Print progress
        stats = OrderedDict([(m, np.mean(metrics[m])) for m in all_metrics])
        myutils.print_stats(list(stats.values()), list(stats.keys()),
                            BATCH_SIZE, telapsed, it, tag='EVAL')
        sys.stdout.flush()

        with open(eval_fn, 'w') as f:
            f.write('SampleID | {}\n'.format(' '.join(metrics.keys())))
            for smp in range(len(sample_ids)):
                f.write('{} | {}\n'.format(
                    sample_ids[smp],
                    ' '.join([str(metrics[key][smp]) for key in metrics])))

        print('\n' + '#' * 60)
        print('End of evaluation.')