Example #1
def evaluate_model_preds(params_path, mode):
    """
    Given a path to a params.json file (whose log_dir holds the checkpoints), generates
    predictions and returns loss metrics for the full dataset.
    mode - one of 'train'/'val' combined with '_text2mel'/'_ssrn'; toggles which losses are fetched.
    """

    params = Params(params_path)
    print('Running predictions with model from: {}'.format(params_path))
    #os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use all GPUs available
    params.dict['Qbatch'] = 2      # hacky - reusing batching from Supervisor
    params.dict['num_threads'] = 12
    params.dict['num_buckets'] = 2 # simplifying overkill queue params
    params.dict['batch_size'] = 64
    params.dict['attention_mode'] = 'guided' # gives an estimate of attention monotonicity
    g = ModelGraph(params, mode)
    logger = g.logger
    total_loss_avg, L1_loss_avg, CE_loss_avg, att_loss_avg = 0.0, 0.0, 0.0, 0.0

    sv = tf.train.Supervisor(logdir=params.log_dir, summary_op=None)
    with sv.managed_session() as sess:

        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        for batch_idx in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
            if 'text2mel' in mode:
                loss_out, L1_out, CE_out, att_out = sess.run([g.loss, g.L1_loss, g.CE_loss, g.att_loss])
            elif 'ssrn' in mode:
                loss_out, L1_out, CE_out = sess.run([g.loss, g.L1_loss, g.CE_loss])
                att_out = 0.0
            else:
                raise ValueError('Unknown mode: {}'.format(mode))

            total_loss_avg += loss_out/g.num_batch
            L1_loss_avg += L1_out/g.num_batch
            CE_loss_avg += CE_out/g.num_batch
            att_loss_avg += att_out/g.num_batch

            if batch_idx % 20 == 0:
                logger.info('Prediction loss: {:.4f}, L1: {:.4f}, CE: {:.4f}, Att: {:.4f}'.format(
                    loss_out, L1_out, CE_out, att_out))

    logger.info('Completed predictions: Avg loss: {:.4f}, L1: {:.4f}, CE: {:.4f}, Att: {:.4f}'.format(
                total_loss_avg, L1_loss_avg, CE_loss_avg, att_loss_avg))
    tf.reset_default_graph() # clean up in case of multiple function calls

    return total_loss_avg, L1_loss_avg, CE_loss_avg, att_loss_avg
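
A hypothetical call (the params path is borrowed from Example #2; the mode is illustrative):

loss, l1, ce, att = evaluate_model_preds('./runs/hindi-text2melM4/params.json',
                                         'val_text2mel')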
Example #2
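def test_parse_tfrecord(tfrecord_path, params):
    # assumed signature, inferred from the commented-out call under __main__ below;
    # the original snippet begins mid-function, after `dataset` has been built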
    it = dataset.make_one_shot_iterator()
    idx, mel, mag, mel_mask = it.get_next()

    print(idx)
    print(mel)
    print(mel_mask)

    with tf.Session() as sess:
        print(sess.run([idx, mel, mel_mask]))

def test_get_batch_prepro(tfrecord_path,params):

    logger = set_logger("./test.log")
    batch, init_op, nb_train, nb_val = get_batch_prepro(tfrecord_path, params, logger)

    mel, mel_mask = batch['mels'], batch['mels_mask']
    s1, s2 = tf.reduce_mean(mel), tf.reduce_sum(mel*mel_mask)/tf.reduce_sum(mel_mask)
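    # e.g. with mel = [[1, 2], [3, 0]] (last frame is padding) and
    # mask = [[1, 1], [1, 0]]: the plain mean s1 is 1.5, while the masked
    # mean s2 = sum(mel*mask)/sum(mask) = 2.0, ignoring the padded zeros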

    with tf.Session() as sess:
        sess.run(init_op)
        batch_dict = sess.run(batch)
        print("Mean: {}, with masking: {}".format(*sess.run([s1,s2])))

# test functions
if __name__ == '__main__':
    tfrecord_path = "../data/indic-tts-hindi/hindi-female/train.tfrecord"
    params = Params("./runs/hindi-text2melM4/params.json")

    # test_parse_tfrecord(tfrecord_path, params)
    test_get_batch_prepro(tfrecord_path, params)
Example #3
import argparse
import os
import pdb

import tensorflow as tf
from tqdm import tqdm

from src.graph import ModelGraph
from src.utils import Params

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('params', help="Path to params.json file containing different hyperparameters")
    parser.add_argument('mode', help="Indicate which model to train. Options: train_text2mel, train_ssrn")
    parser.add_argument('--gpu', type=int, default=0, help="GPU to train on if multiple available")
    args = parser.parse_args()

    params = Params(args.params)
    print('Running a training run with params from: {}'.format(args.params))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)  # default: use a single GPU
    g = ModelGraph(params, args.mode)
    logger = g.logger
    sv = tf.train.Supervisor(logdir=params.log_dir, save_model_secs=0, global_step=g.global_step)
    with sv.managed_session() as sess:
        while True:
            for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
                _, global_step, loss_out, L1_out, CE_out = sess.run([g.train_op, g.global_step,
                                                                            g.loss, g.L1_loss, g.CE_loss])
    
                if global_step % 50 == 0:
                    logger.info('Training loss at step {}: {:.4f}, L1: {:.4f}, CE: {:.4f}'.format(
                        global_step, loss_out, L1_out, CE_out))
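
The training loop above runs until the process is killed; a minimal variant (a sketch, not from the original) that also honors the Supervisor's stop signal:

with sv.managed_session() as sess:
    while not sv.should_stop():  # exits cleanly once a stop is requested
        for _ in range(g.num_batch):
            sess.run(g.train_op)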
Example #4
def synthesize(m1_dir,
               m2_dir,
               sample_dir,
               n_iter=150,
               test_data=None,
               lines=None,
               ref_db=30):
    # NOTE: currently passes all input sentences as one batch

    # Initialize params
    params1 = Params(os.path.join(m1_dir, 'params.json'))
    params2 = Params(os.path.join(m2_dir, 'params.json'))
    params = params1
    if test_data is not None:
        params.dict['test_data'] = test_data  # override with the value passed in
    params.dict['n_iter'] = n_iter
    params.dict['ref_db'] = ref_db  # output volume
    # Load text as int arrays
    if lines is None:  # toggle: read inputs from a file, or use text passed via the function call
        input_arr = load_data(params, 'synthesize')
    else:
        input_arr = load_data(params, 'demo', lines)
    n_batch = input_arr.shape[0]
    params.dict['batch_size'] = n_batch
    # Create empty arrays
    output_mel = np.zeros((n_batch, params.max_T, params.F))
    output_mag = np.zeros((n_batch, params.max_T, params.Fo))
    # create flags indicating if and where each input in batch has stopped
    stop_flags = np.array([False] * n_batch)
    stop_idxs = np.zeros((n_batch, ), dtype=int)
    #last_attended = np.zeros((n_batch,))

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use a single GPU
    g = SynthesizeGraph(params)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        restore_checkpoints(sess, m1_dir, m2_dir)

        ## Step1: Pre-compute text encoding (K, V) = TextEncBlock(character sequence)
        K, V = sess.run([g.K_pre, g.V_pre], {g.transcripts: input_arr})

        ## Step2: Iterate over t, qt = AudioEncBlock(S:t), rt = Attention(qt,KV), St = AudioDecBlock(qt,rt)
        # TODO: Fix constrained monotonic attention
        for t in range(1, params.max_T):
            if all(stop_flags): break  # end of audio for all inputs in batch

            print('Mel frame {}/{}'.format(t + 1, params.max_T), end='\r')
            # optimization: fixed-width window to encode previously generated frames
            slice_window = max(0, t - 100)  # TODO: fix hardcoded value
            prev_slice = output_mel[:, slice_window:t, :]

            model_preds, stop_preds = sess.run([g.Y, g.YStoplogit], {
                g.K_inp: K,
                g.V_inp: V,
                g.S: prev_slice
            })
            output_mel[:, t, :] = model_preds[:, -1, :]
            track_stop_preds(stop_preds, stop_idxs, stop_flags, t)

            # monotonic constrained attention softmax
            # model_preds, attn_out = sess.run([g.Yhat,g.A],
            #     {g.S:prev_slice,g.transcripts:input_arr,g.last_attended:last_attended})
            # last_attended += np.argmax(attn_out[:,-1,:],axis=1) # slicing out the last time frame, and moving attention window forward
            # last_attended = np.clip(last_attended,a_min=0,a_max=text_lengths-params.attn_window_size)
        # output_mag, attn_out = sess.run([g.Zhat,g.A],
        #         {g.S:output_mel,g.transcripts:input_arr,g.last_attended:last_attended})

        ## Step3: Process complete utterance and invert Z = SSRN(Y:T)
        # print("Truncating. Stop idxs: {}".format(stop_idxs)) # truncate mels evenly
        output_mel = output_mel[:, :max(stop_idxs), :]
        # Convert to magnitude spectrograms
        output_mag, attn_out = sess.run([g.Zhat, g.A], {
            g.K_inp: K,
            g.V_inp: V,
            g.S: output_mel
        })
        output_mag_list = [
            output_mag[i, :stop_idxs[i] * params.reduction_factor, :]
            for i in range(n_batch)
        ]  # truncate mags individually
        # Griffin-Lim inversion is relatively slow, so it is parallelized
        print("Magnitude spectrograms generated, inverting ..")
        pool_args = {'sample_dir': sample_dir, 'params': params}
        pool_input_list = [(output_mag_list[i], i, pool_args)
                           for i in range(n_batch)]
        with Pool(cpu_count()) as p:
            p.map(invert_mag, pool_input_list)

        for i in range(n_batch):
            fname = os.path.join(sample_dir, 'sample_{}'.format(i))
            print('Saving plots for sample: {}/{}'.format(i + 1, n_batch))
            plt.imsave(fname + '_mel.png', output_mel[i].T, cmap='gray')
            plt.imsave(fname + '_mag.png', output_mag_list[i].T, cmap='gray')
            plt.imsave(fname + '_attn.png', attn_out[i].T, cmap='gray')

    tf.reset_default_graph()
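
A hypothetical call (the checkpoint directories, sample directory, iteration count, and input line are illustrative):

synthesize('runs/text2mel', 'runs/ssrn', 'samples',
           n_iter=60, lines=['A short test sentence.'])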
Example #5
                        default=0,
                        help="GPU to train on if multiple available")
    parser.add_argument('--chkp',
                        help="(For direct transfer learning) path to checkpoint dir to be restored")
    parser.add_argument('--restore-vars',
                        help="tf.GraphKey used to restore variables from CHKP",
                        default='TextEnc|AudioEnc|AudioDec')
    parser.add_argument('--train-vars',
                        help="tf.GraphKey used to update variables in training",
                        default='InputEmbed|TextEnc|AudioEnc|AudioDec')
    args = parser.parse_args()

    params = Params(args.params)
    print('Running a training run with params from: {}'.format(args.params))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)  # default: use a single GPU

    # Add trainable variables to params
    params.dict['trainable_vars'] = args.train_vars

    # Parse mode and setup graph
    gs = tf.train.get_or_create_global_step()
    if args.mode == 'train_text2mel':
        g = Text2MelTrainGraph(params)
    elif args.mode == 'train_ssrn':
        g = SSRNTrainGraph(params)
    elif args.mode == 'train_unsupervised':
        g = UnsupervisedTrainGraph(params)
    else:
        raise ValueError('Unknown mode: {}'.format(args.mode))
Example #6
parser = argparse.ArgumentParser()
parser.add_argument('checkpoint_dir',
                    help="Path to directory with checkpoints to modify")
parser.add_argument('--restore_scope',
                    help="Variable scope of variables to restore")
parser.add_argument(
    '--exclude_scope',
    help="Variable scope of new variables to exclude in the first restore; "
         "these are then randomly initialized and saved")
parser.add_argument('--restore_dir', help="Path to directory to restore from")
args = parser.parse_args()

checkpoint_path = tf.train.latest_checkpoint(args.checkpoint_dir)
restore_path = tf.train.latest_checkpoint(
    args.restore_dir) if args.restore_dir else checkpoint_path
params = Params(os.path.join(args.checkpoint_dir, 'params.json'))
gs = tf.train.get_or_create_global_step()
g = ModelGraph(params, 'train_text2mel')

save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

# pdb.set_trace()
restore_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 args.restore_scope)
exclude_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 args.exclude_scope)
restore_vars = [var for var in restore_vars if var not in exclude_vars]

restorer = tf.train.Saver(restore_vars)
saver = tf.train.Saver(save_vars)
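
The snippet ends before the savers are used; a minimal sketch of the intended continuation (assumed, not part of the original):

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initializes the excluded (new) variables
    restorer.restore(sess, restore_path)         # overwrite the overlapping variables from the checkpoint
    saver.save(sess, checkpoint_path)            # write all variables back out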
Example #7
def main(argv):
    """
    Main driver/runner of 3D U-Net model.
    """
    
    # -------------------------------------------------------------------------
    # setup
    # -------------------------------------------------------------------------

    # set the random seed for the whole graph for reproducible experiments
    tf.set_random_seed(42)

    # load the parameters from model's json file as a dict
    args = arg_parser(argv)
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), \
        "No json configuration file found at {}".format(json_path)
    params = Params(json_path).dict
    
    # check mode
    modes = ['train', 'train_eval', 'eval', 'predict']
    assert args.mode in modes, "mode has to be one of %s" % ','.join(modes) 
    
    # create logger writing to train.log in the model dir
    logger = set_logger(os.path.join(args.model_dir, 'train.log'))
    
    # -------------------------------------------------------------------------
    # create model
    # -------------------------------------------------------------------------
    
    model = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=params,
        config=tf.estimator.RunConfig(
            log_step_count_steps=params['display_steps']
        )
    )
    
    # -------------------------------------------------------------------------
    # train
    # -------------------------------------------------------------------------
    
    if args.mode in ['train_eval', 'train']:
        model.train(
            input_fn=lambda: input_fn(True, params),
            max_steps=params['max_train_steps']
        )
    
    # -------------------------------------------------------------------------
    # evaluate
    # -------------------------------------------------------------------------
    
    if args.mode in ['train_eval', 'eval']:
        model.evaluate(input_fn=lambda: input_fn(False, params))
    
    # -------------------------------------------------------------------------
    # predict
    # -------------------------------------------------------------------------
    
    if args.mode == 'predict':
        predictions = model.predict(input_fn=lambda: input_fn(False, params))

        # extract predictions, only save predicted classes not probs
        to_save = dict()
        for i, y_pred in enumerate(predictions):
            if i in args.pred_ix:
                logger.info('Predicting patient: %d.' % i)
                to_save[i] = y_pred
        
        # save them with pickle to model dir
        pred_file = os.path.join(args.model_dir, 'preds.pkl')  # pickled dict, not a numpy .npy
        pickle.dump(to_save, open(pred_file, "wb"))
        logger.info('Predictions saved to: %s.' % pred_file)
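
Reading the predictions back mirrors the dump above (a minimal sketch; pred_file as set in main):

import pickle

with open(pred_file, 'rb') as f:
    preds = pickle.load(f)  # dict: patient index -> predicted classes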
Example #8
def synthesize(m1_dir,
               m2_dir,
               sample_dir,
               n_iter=150,
               test_data=None,
               lines=None,
               ref_db=30):
    # NOTE: currently passes all input sentences as one batch

    # Initialize graph, path to model checkpoints
    params1 = Params(os.path.join(m1_dir, 'params.json'))
    params2 = Params(os.path.join(m2_dir, 'params.json'))
    params = params1
    if test_data is not None:
        params.dict['test_data'] = test_data  # override with the value passed in
    params.dict['n_iter'] = n_iter
    params.dict['ref_db'] = ref_db  # output volume

    if lines is None:  # toggle: read inputs from a file, or use text passed via the function call
        input_arr = load_data(params, 'synthesize')
    else:
        input_arr = load_data(params, 'demo', lines)

    # input_arr has shape (n_batch, max_text_len)
    n_batch, max_text_len = input_arr.shape[0], input_arr.shape[1]
    text_lengths = np.zeros((n_batch, ))
    # scan backwards for the index of the last non-zero (non-padding) character
    for i in range(n_batch):
        for j in range(max_text_len - 1, -1, -1):
            if input_arr[i, j] != 0:
                text_lengths[i] = j
                break
    params.dict['batch_size'] = n_batch
    output_mel = np.zeros((n_batch, params.max_T, params.F))
    output_mag = np.zeros((n_batch, params.max_T, params.Fo))
    last_attended = np.zeros((n_batch, ))

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # use a single GPU
    g = ModelGraph(params, 'synthesize')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Load saved models
        text2mel_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            'TextEnc|AudioEnc|AudioDec|InputEmbed')
        ssrn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN')

        saver1 = tf.train.Saver(var_list=text2mel_vars)
        saver1.restore(sess, tf.train.latest_checkpoint(m1_dir))
        print("Text2Mel Restored!")
        saver2 = tf.train.Saver(var_list=ssrn_vars)
        saver2.restore(sess, tf.train.latest_checkpoint(m2_dir))
        print("SSRN Restored!")

        n_samples = output_mag.shape[0]
        # Generate all the mel frames
        # TODO: Fix constrained monotonic attention
        for i in range(1, params.max_T):
            # print(last_attended)
            print('Mel frame {}/{}'.format(i + 1, params.max_T), end='\r')
            prev_slice = output_mel[:, :i, :]
            model_out, attn_out = sess.run(
                [g.Yhat, g.A], {
                    g.S: prev_slice,
                    g.transcripts: input_arr,
                    g.last_attended: last_attended
                })
            # take the argmax over the last time frame and move the attention window forward
            last_attended += np.argmax(attn_out[:, -1, :], axis=1)
            last_attended = np.clip(last_attended, a_min=0,
                                    a_max=text_lengths - params.attn_window_size)
            output_mel[:, i, :] = model_out[:, -1, :]

        # Convert to magnitude spectrograms
        output_mag, attn_out = sess.run(
            [g.Zhat, g.A], {
                g.S: output_mel,
                g.transcripts: input_arr,
                g.last_attended: last_attended
            })
        print("Magnitude spectrograms generated, inverting ..")
        pool_args = {'sample_dir': sample_dir, 'params': params}

        mags_list = [(output_mag[i], i, pool_args) for i in range(n_samples)]

        # Griffin-Lim inversion is relatively slow, so it is parallelized
        with Pool(cpu_count()) as p:
            p.map(invert_mag, mags_list)
        for i in range(n_samples):
            fname = os.path.join(sample_dir, 'sample_{}'.format(i))
            print('Saving plots for sample: {}/{}'.format(i + 1, n_samples))
            plt.imsave(fname + '_mel.png', output_mel[i].T, cmap='gray')
            plt.imsave(fname + '_mag.png', output_mag[i].T, cmap='gray')
            plt.imsave(fname + '_attn.png', attn_out[i].T, cmap='gray')

    tf.reset_default_graph()
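
The backward scan in this example finds the index of the last non-padding character per row; a vectorized NumPy equivalent (a sketch, assuming every row has at least one non-zero entry):

import numpy as np

input_arr = np.array([[5, 3, 9, 0, 0],
                      [7, 0, 2, 4, 0]])  # zero-padded character ids
max_text_len = input_arr.shape[1]
# index of the last non-zero entry per row: [2, 3] here
text_lengths = max_text_len - 1 - np.argmax((input_arr != 0)[:, ::-1], axis=1)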
Example #9
    sys.stdout.flush()


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'params_path',
        help="Path to params.json file containing DSP hyperparameters")
    parser.add_argument('input_path',
                        help="Path to folder containing .wav files")
    parser.add_argument('csv_path',
                        help="Path to file with metadata: text, wav filename")
    parser.add_argument(
        'output_path',
        help="Path to output folder that will contain mels, mags folders")
    parser.add_argument(
        '--mode',
        default='tfrecord',
        help="Format for processed data files: 'npy' or 'tfrecord' (default)")
    args = parser.parse_args()

    params, output_path = Params(args.params_path), args.output_path

    if args.mode == 'npy':
        process_to_npy(params, args.input_path, args.csv_path,
                       args.output_path)
    elif args.mode == 'tfrecord':
        process_to_tfrecord(params, args.input_path, args.csv_path,
                            args.output_path)
    else:
        raise ValueError('Unknown mode: {}'.format(args.mode))