def evaluate_model_preds(params_path, mode):
    """
    Given a log directory, generates predictions and returns loss metrics for the full dataset.
    mode - indicates 'val/train _text2mel/ssrn' and toggles behaviour accordingly.
    """
    params = Params(params_path)
    print('Running predictions with model from: {}'.format(params_path))
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use all GPUs available
    params.dict['Qbatch'] = 2  # hacky - reusing batching from Supervisor
    params.dict['num_threads'] = 12
    params.dict['num_buckets'] = 2  # simplifying overkill queue params
    params.dict['batch_size'] = 64
    params.dict['attention_mode'] = 'guided'  # gives an estimate of attention monotonicity

    g = ModelGraph(params, mode)
    logger = g.logger
    total_loss_avg, L1_loss_avg, CE_loss_avg, att_loss_avg = 0.0, 0.0, 0.0, 0.0

    sv = tf.train.Supervisor(logdir=params.log_dir, summary_op=None)
    with sv.managed_session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
            if 'text2mel' in mode:
                loss_out, L1_out, CE_out, att_out = sess.run([g.loss, g.L1_loss, g.CE_loss, g.att_loss])
            elif 'ssrn' in mode:
                loss_out, L1_out, CE_out = sess.run([g.loss, g.L1_loss, g.CE_loss])
                att_out = 0.0
            total_loss_avg += loss_out / g.num_batch
            L1_loss_avg += L1_out / g.num_batch
            CE_loss_avg += CE_out / g.num_batch
            att_loss_avg += att_out / g.num_batch
            if step % 20 == 0:
                logger.info('Prediction loss: {:.4f}, L1: {:.4f}, CE: {:.4f}, Att: {:.4f}'.format(
                    loss_out, L1_out, CE_out, att_out))

    logger.info('Completed predictions: Avg loss: {:.4f}, L1: {:.4f}, CE: {:.4f}, Att: {:.4f}'.format(
        total_loss_avg, L1_loss_avg, CE_loss_avg, att_loss_avg))
    tf.reset_default_graph()  # clean up in case of multiple function calls
    return total_loss_avg, L1_loss_avg, CE_loss_avg, att_loss_avg
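# Hypothetical usage sketch (the ssrn run directory and the exact mode strings are
# placeholders/assumptions, not taken from this repo): compare validation losses of
# the two model stages after training.
if __name__ == '__main__':
    t2m_metrics = evaluate_model_preds('./runs/hindi-text2melM4/params.json', 'val_text2mel')
    ssrn_metrics = evaluate_model_preds('./runs/hindi-ssrn/params.json', 'val_ssrn')
    print('text2mel (loss, L1, CE, att): {}'.format(t2m_metrics))
    print('ssrn     (loss, L1, CE, att): {}'.format(ssrn_metrics))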
it = dataset.make_one_shot_iterator()
idx, mel, mag, mel_mask = it.get_next()
print(idx)
print(mel)
print(mel_mask)
with tf.Session() as sess:
    print(sess.run([idx, mel, mel_mask]))


def test_get_batch_prepro(tfrecord_path, params):
    logger = set_logger("./test.log")
    batch, init_op, nb_train, nb_val = get_batch_prepro(tfrecord_path, params, logger)
    mel, mel_mask = batch['mels'], batch['mels_mask']
    s1, s2 = tf.reduce_mean(mel), tf.reduce_sum(mel * mel_mask) / tf.reduce_sum(mel_mask)
    with tf.Session() as sess:
        sess.run(init_op)
        batch_dict = sess.run(batch)
        print("Mean: {}, with masking: {}".format(*sess.run([s1, s2])))


# test functions
if __name__ == '__main__':
    tfrecord_path = "../data/indic-tts-hindi/hindi-female/train.tfrecord"
    params = Params("./runs/hindi-text2melM4/params.json")
    # test_parse_tfrecord(tfrecord_path, params)
    test_get_batch_prepro(tfrecord_path, params)
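# Added toy illustration of the masked-mean check in test_get_batch_prepro above
# (a framework-free sketch, not part of the original tests): with zero padding,
# the plain mean is diluted by padded frames while the masked mean is not.
def test_masked_mean_toy():
    import numpy as np
    mel = np.array([[1.0, 2.0], [3.0, 4.0], [0.0, 0.0]])   # last frame is padding
    mask = np.array([[1.0, 1.0], [1.0, 1.0], [0.0, 0.0]])
    print("Mean: {}".format(mel.mean()))                                   # 1.666..., diluted by padding
    print("Masked mean: {}".format((mel * mask).sum() / mask.sum()))       # 2.5, padding ignored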
import argparse
import os
import pdb

import tensorflow as tf
from tqdm import tqdm

from src.graph import ModelGraph
from src.utils import Params

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('params', help="Path to params.json file containing different hyperparameters")
    parser.add_argument('mode', help="Indicate which model to train. Options: train_text2mel, train_ssrn")
    parser.add_argument('--gpu', type=int, default=0, help="GPU to train on if multiple available")
    args = parser.parse_args()

    params = Params(args.params)
    print('Running a training run with params from: {}'.format(args.params))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)  # default: use a single GPU

    g = ModelGraph(params, args.mode)
    logger = g.logger

    sv = tf.train.Supervisor(logdir=params.log_dir, save_model_secs=0, global_step=g.global_step)
    with sv.managed_session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        while True:
            for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                _, global_step, loss_out, L1_out, CE_out = sess.run(
                    [g.train_op, g.global_step, g.loss, g.L1_loss, g.CE_loss])
                if global_step % 50 == 0:
                    logger.info('Training loss at step {}: {:.4f}, L1: {:.4f}, CE: {:.4f}'.format(
                        global_step, loss_out, L1_out, CE_out))
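# Added sketch (an assumption, not code from this repo): with save_model_secs=0 the
# Supervisor does not checkpoint in the background, so a manual save such as this
# helper, called every `every_n` steps inside the loop above, is presumed to exist
# elsewhere in the project.
def maybe_save_checkpoint(sv, sess, log_dir, global_step, every_n=1000):
    """Manually checkpoint the supervised session every `every_n` global steps."""
    if global_step % every_n == 0:
        sv.saver.save(sess, os.path.join(log_dir, 'model.ckpt'), global_step=global_step)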
def synthesize(m1_dir, m2_dir, sample_dir, n_iter=150, test_data=None, lines=None, ref_db=30):
    # NOTE: currently passes all input sentences as one batch

    # Initialize params
    params1 = Params(os.path.join(m1_dir, 'params.json'))
    params2 = Params(os.path.join(m2_dir, 'params.json'))
    params = params1
    if test_data is not None:
        params.dict['test_data'] = test_data  # override test data with what was passed in
    params.dict['n_iter'] = n_iter
    params.dict['ref_db'] = ref_db  # output volume

    # Load text as int arrays
    if lines is None:  # toggle between reading a test file and text passed via the function call
        input_arr = load_data(params, 'synthesize')
    else:
        input_arr = load_data(params, 'demo', lines)
    n_batch = input_arr.shape[0]
    params.dict['batch_size'] = n_batch

    # Create empty output arrays
    output_mel = np.zeros((n_batch, params.max_T, params.F))
    output_mag = np.zeros((n_batch, params.max_T, params.Fo))
    # Create flags indicating if and where each input in the batch has stopped
    stop_flags = np.array([False] * n_batch)
    stop_idxs = np.zeros((n_batch, ), dtype=int)
    # last_attended = np.zeros((n_batch,))

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use a single GPU
    g = SynthesizeGraph(params)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        restore_checkpoints(sess, m1_dir, m2_dir)

        ## Step 1: Pre-compute the text encoding (K, V) = TextEncBlock(character sequence)
        K, V = sess.run([g.K_pre, g.V_pre], {g.transcripts: input_arr})

        ## Step 2: Iterate over t: q_t = AudioEncBlock(S_{:t}), r_t = Attention(q_t, K, V), S_t = AudioDecBlock(q_t, r_t)
        # TODO: Fix constrained monotonic attention
        for t in range(1, params.max_T):
            if all(stop_flags):
                break  # end of audio for all inputs in batch
            print('Mel frame {}/{}'.format(t + 1, params.max_T), end='\r')
            # Optimization: fixed-width window over previously generated frames
            slice_window = max(0, t - 100)  # TODO: fix hardcoded value
            prev_slice = output_mel[:, slice_window:t, :]
            model_preds, stop_preds = sess.run([g.Y, g.YStoplogit],
                                               {g.K_inp: K, g.V_inp: V, g.S: prev_slice})
            output_mel[:, t, :] = model_preds[:, -1, :]
            track_stop_preds(stop_preds, stop_idxs, stop_flags, t)

            # Monotonic constrained attention softmax
            # model_preds, attn_out = sess.run([g.Yhat, g.A],
            #     {g.S: prev_slice, g.transcripts: input_arr, g.last_attended: last_attended})
            # last_attended += np.argmax(attn_out[:, -1, :], axis=1)  # slice out the last time frame and move the attention window forward
            # last_attended = np.clip(last_attended, a_min=0, a_max=text_lengths - params.attn_window_size)
            # output_mag, attn_out = sess.run([g.Zhat, g.A],
            #     {g.S: output_mel, g.transcripts: input_arr, g.last_attended: last_attended})

        ## Step 3: Process the complete utterance and invert Z = SSRN(Y_{:T})
        # print("Truncating. Stop idxs: {}".format(stop_idxs))
        output_mel = output_mel[:, :max(stop_idxs), :]  # truncate mels evenly

        # Convert to magnitude spectrograms
        output_mag, attn_out = sess.run([g.Zhat, g.A],
                                        {g.K_inp: K, g.V_inp: V, g.S: output_mel})
        output_mag_list = [output_mag[i, :stop_idxs[i] * params.reduction_factor, :]
                           for i in range(n_batch)]  # truncate mags individually

    # Griffin-Lim inversion is relatively slow, hence parallelized
    print("Magnitude spectrograms generated, inverting ..")
    pool_args = {}
    pool_args['sample_dir'] = sample_dir
    pool_args['params'] = params
    pool_input_list = [(output_mag_list[i], i, pool_args) for i in range(n_batch)]
    with Pool(cpu_count()) as p:
        p.map(invert_mag, pool_input_list)

    for i in range(n_batch):
        fname = os.path.join(sample_dir, 'sample_{}'.format(i))
        print('Saving plots for sample: {}/{}'.format(i + 1, n_batch))
        plt.imsave(fname + '_mel.png', output_mel[i].T, cmap='gray')
        plt.imsave(fname + '_mag.png', output_mag_list[i].T, cmap='gray')
        plt.imsave(fname + '_attn.png', attn_out[i].T, cmap='gray')

    tf.reset_default_graph()
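# Hypothetical usage (the ssrn run directory, sample directory and demo line are
# placeholders, not taken from this repo): synthesize from a trained Text2Mel
# checkpoint dir (m1_dir) and SSRN checkpoint dir (m2_dir).
if __name__ == '__main__':
    synthesize(m1_dir='./runs/hindi-text2melM4',
               m2_dir='./runs/hindi-ssrn',
               sample_dir='./samples',
               n_iter=150,
               lines=['namaste duniya'])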
                    default=0, help="GPU to train on if multiple available")
parser.add_argument('--chkp',
                    help="(For direct transfer learning) path to checkpoint dir to be restored")
parser.add_argument('--restore-vars',
                    help="Variable scope (regex) used to restore variables from CHKP",
                    default='TextEnc|AudioEnc|AudioDec')
parser.add_argument('--train-vars',
                    help="Variable scope (regex) of variables updated in training",
                    default='InputEmbed|TextEnc|AudioEnc|AudioDec')
args = parser.parse_args()

params = Params(args.params)
print('Running a training run with params from: {}'.format(args.params))
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)  # default: use a single GPU

# Add trainable variables to params
params.dict['trainable_vars'] = args.train_vars

# Parse mode and set up the graph
gs = tf.train.get_or_create_global_step()
if args.mode == 'train_text2mel':
    g = Text2MelTrainGraph(params)
elif args.mode == 'train_ssrn':
    g = SSRNTrainGraph(params)
elif args.mode == 'train_unsupervised':
    g = UnsupervisedTrainGraph(params)
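# Added sketch of how the scope strings above can drive a transfer-learning restore
# (an assumption about code living elsewhere in this script, which is not shown here):
# tf.get_collection(key, scope) keeps variables whose names match the scope string as
# a regex, so 'TextEnc|AudioEnc|AudioDec' selects exactly those three blocks.
def restore_scoped_vars(sess, chkp_dir, scope_regex):
    """Restore only variables whose names match scope_regex from the latest checkpoint in chkp_dir."""
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_regex)
    tf.train.Saver(var_list=var_list).restore(sess, tf.train.latest_checkpoint(chkp_dir))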
parser = argparse.ArgumentParser()
parser.add_argument('checkpoint_dir', help="Path to directory with checkpoints to modify")
parser.add_argument('--restore_scope', help="Variable scope of variables to restore")
parser.add_argument('--exclude_scope',
                    help="Variable scope of new variables to exclude in the first restore, "
                         "which are then randomly initialized and saved")
parser.add_argument('--restore_dir', help="Path to directory to restore from")
args = parser.parse_args()

checkpoint_path = tf.train.latest_checkpoint(args.checkpoint_dir)
restore_path = tf.train.latest_checkpoint(args.restore_dir) if args.restore_dir else checkpoint_path
params = Params(os.path.join(args.checkpoint_dir, 'params.json'))

gs = tf.train.get_or_create_global_step()
g = ModelGraph(params, 'train_text2mel')

save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
# pdb.set_trace()  # debugging breakpoint, left disabled
restore_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, args.restore_scope)
exclude_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, args.exclude_scope)
restore_vars = [var for var in restore_vars if var not in exclude_vars]

restorer = tf.train.Saver(restore_vars)
saver = tf.train.Saver(save_vars)
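# Added sketch of how the two savers above would typically be used (an assumption,
# since the remainder of the script is not shown): initialize everything so the new
# and excluded variables get fresh values, overwrite the matching variables from the
# restore checkpoint, then write a combined checkpoint under a new (hypothetical) name.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())   # new/excluded vars start from random init
    restorer.restore(sess, restore_path)          # matching vars overwritten from checkpoint
    saver.save(sess, os.path.join(args.checkpoint_dir, 'model_combined.ckpt'))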
def main(argv):
    """ Main driver/runner of the 3D U-Net model. """
    # -------------------------------------------------------------------------
    # setup
    # -------------------------------------------------------------------------
    # set the random seed for the whole graph for reproducible experiments
    tf.set_random_seed(42)

    # load the parameters from the model's json file as a dict
    args = arg_parser(argv)
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path).dict

    # check mode
    modes = ['train', 'train_eval', 'eval', 'predict']
    assert args.mode in modes, "mode has to be one of %s" % ','.join(modes)

    # create logger, add loss and IOU to logging
    logger = set_logger(os.path.join(args.model_dir, 'train.log'))

    # -------------------------------------------------------------------------
    # create model
    # -------------------------------------------------------------------------
    model = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=params,
        config=tf.estimator.RunConfig(log_step_count_steps=params['display_steps'])
    )

    # -------------------------------------------------------------------------
    # train
    # -------------------------------------------------------------------------
    if args.mode in ['train_eval', 'train']:
        model.train(input_fn=lambda: input_fn(True, params),
                    max_steps=params['max_train_steps'])

    # -------------------------------------------------------------------------
    # evaluate
    # -------------------------------------------------------------------------
    if args.mode in ['train_eval', 'eval']:
        model.evaluate(input_fn=lambda: input_fn(False, params))

    # -------------------------------------------------------------------------
    # predict
    # -------------------------------------------------------------------------
    if args.mode == 'predict':
        predictions = model.predict(input_fn=lambda: input_fn(False, params))

        # extract predictions; only save predicted classes, not probabilities
        to_save = dict()
        for i, y_pred in enumerate(predictions):
            if i in args.pred_ix:
                logger.info('Predicting patient: %d.' % i)
                to_save[i] = y_pred

        # save them with pickle to the model dir
        pred_file = os.path.join(args.model_dir, 'preds.npy')
        pickle.dump(to_save, open(pred_file, "wb"))
        logger.info('Predictions saved to: %s.' % pred_file)
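# Minimal sketch of an input_fn compatible with the calls above (the shapes, dtypes
# and the dummy generator are placeholders, not taken from this repo): tf.estimator
# expects the input_fn to return a (features, labels) pair, or a tf.data.Dataset
# that yields one.
def input_fn_sketch(training, params):
    import numpy as np

    def _toy_volumes():
        # stand-in loader: yields one dummy (image, label) volume pair
        yield (np.zeros((32, 32, 32, 1), np.float32), np.zeros((32, 32, 32), np.int32))

    ds = tf.data.Dataset.from_generator(
        _toy_volumes, (tf.float32, tf.int32), ((32, 32, 32, 1), (32, 32, 32)))
    if training:
        ds = ds.shuffle(buffer_size=8).repeat()
    ds = ds.batch(2)  # placeholder batch size; the real value presumably lives in params
    return ds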
def synthesize(m1_dir, m2_dir, sample_dir, n_iter=150, test_data=None, lines=None, ref_db=30):
    # NOTE: currently passes all input sentences as one batch

    # Initialize graph, path to model checkpoints
    params1 = Params(os.path.join(m1_dir, 'params.json'))
    params2 = Params(os.path.join(m2_dir, 'params.json'))
    params = params1
    if test_data is not None:
        params.dict['test_data'] = test_data  # override test data with what was passed in
    params.dict['n_iter'] = n_iter
    params.dict['ref_db'] = ref_db  # output volume

    if lines is None:  # toggle between reading a test file and text passed via the function call
        input_arr = load_data(params, 'synthesize')
    else:
        input_arr = load_data(params, 'demo', lines)  # shape (1, len)

    n_batch, max_text_len = input_arr.shape[0], input_arr.shape[1]
    text_lengths = np.zeros((n_batch, ))
    for i in range(n_batch):
        for j in range(max_text_len - 1, -1, -1):
            if input_arr[i, j] != 0:
                text_lengths[i] = j
                break
    params.dict['batch_size'] = n_batch

    output_mel = np.zeros((n_batch, params.max_T, params.F))
    output_mag = np.zeros((n_batch, params.max_T, params.Fo))
    last_attended = np.zeros((n_batch, ))

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use a single GPU
    g = ModelGraph(params, 'synthesize')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Load saved models
        text2mel_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          'TextEnc|AudioEnc|AudioDec|InputEmbed')
        ssrn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN')
        saver1 = tf.train.Saver(var_list=text2mel_vars)
        saver1.restore(sess, tf.train.latest_checkpoint(m1_dir))
        print("Text2Mel Restored!")
        saver2 = tf.train.Saver(var_list=ssrn_vars)
        saver2.restore(sess, tf.train.latest_checkpoint(m2_dir))
        print("SSRN Restored!")

        n_samples = output_mag.shape[0]

        # Generate all the mel frames
        # TODO: Fix constrained monotonic attention
        for i in range(1, params.max_T):
            print(last_attended)
            print('Mel frame {}/{}'.format(i + 1, params.max_T), end='\r')
            prev_slice = output_mel[:, :i, :]
            model_out, attn_out = sess.run([g.Yhat, g.A],
                                           {g.S: prev_slice,
                                            g.transcripts: input_arr,
                                            g.last_attended: last_attended})
            # Slice out the last time frame and move the attention window forward
            last_attended += np.argmax(attn_out[:, -1, :], axis=1)
            last_attended = np.clip(last_attended, a_min=0,
                                    a_max=text_lengths - params.attn_window_size)
            output_mel[:, i, :] = model_out[:, -1, :]

        # Convert to magnitude spectrograms
        output_mag, attn_out = sess.run([g.Zhat, g.A],
                                        {g.S: output_mel,
                                         g.transcripts: input_arr,
                                         g.last_attended: last_attended})

    print("Magnitude spectrograms generated, inverting ..")
    pool_args = {}
    pool_args['sample_dir'] = sample_dir
    pool_args['params'] = params
    mags_list = [(output_mag[i], i, pool_args) for i in range(n_samples)]
    # Griffin-Lim inversion is relatively slow, hence parallelized
    with Pool(cpu_count()) as p:
        p.map(invert_mag, mags_list)

    for i in range(n_samples):
        fname = os.path.join(sample_dir, 'sample_{}'.format(i))
        print('Saving plots for sample: {}/{}'.format(i + 1, n_samples))
        plt.imsave(fname + '_mel.png', output_mel[i].T, cmap='gray')
        plt.imsave(fname + '_mag.png', output_mag[i].T, cmap='gray')
        plt.imsave(fname + '_attn.png', attn_out[i].T, cmap='gray')

    tf.reset_default_graph()
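# Added sketch: a vectorized equivalent of the nested text-length loop above
# (it assumes, as that loop does, that id 0 is the padding token). Returns, per row,
# the index of the last non-zero character id, or 0 if the row is all padding.
def last_nonzero_index(input_arr):
    nonzero = input_arr != 0
    last = input_arr.shape[1] - 1 - np.argmax(nonzero[:, ::-1], axis=1)
    return np.where(nonzero.any(axis=1), last, 0)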
    sys.stdout.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('params_path',
                        help="Path to params.json file containing DSP hyperparameters")
    parser.add_argument('input_path', help="Path to folder containing .wav files")
    parser.add_argument('csv_path', help="Path to file with metadata: text, wav filename")
    parser.add_argument('output_path',
                        help="Path to output folder that will contain mels, mags folders")
    parser.add_argument('--mode', default='tfrecord',
                        help="Format to save processed data files: npy/tfrecord (default)")
    args = parser.parse_args()

    params, output_path = Params(args.params_path), args.output_path
    if args.mode == 'npy':
        process_to_npy(params, args.input_path, args.csv_path, args.output_path)
    elif args.mode == 'tfrecord':
        process_to_tfrecord(params, args.input_path, args.csv_path, args.output_path)
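# Hypothetical direct call, equivalent to running this script with --mode tfrecord
# (the wav-folder and csv paths below are placeholders, not taken from this repo):
#
#   params = Params("./runs/hindi-text2melM4/params.json")
#   process_to_tfrecord(params,
#                       "../data/indic-tts-hindi/hindi-female/wav",
#                       "../data/indic-tts-hindi/hindi-female/metadata.csv",
#                       "../data/indic-tts-hindi/hindi-female/")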