Example #1
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # in_dir = ~/wav
    parser.add_argument("--in_dir",
                        type=str,
                        required=True,
                        help="input data(pickle) dir")
    parser.add_argument(
        "--ckpt_dir",
        type=str,
        required=True,
        help="checkpoint to save/ start with for train/inference")
    parser.add_argument("--mode",
                        default="train",
                        choices=["train", "test", "infer"],
                        help="setting mode for execution")

    # Saving Checkpoints, Data... etc
    parser.add_argument("--max_step",
                        type=int,
                        default=500000,
                        help="maximum steps in training")
    parser.add_argument("--checkpoint_freq",
                        type=int,
                        default=100,
                        help="how often save checkpoint")

    # Data
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")

    # Initialization
    parser.add_argument("--init_type",
                        type=str,
                        default="uniform",
                        help="type of initializer")
    parser.add_argument("--init_weight_range",
                        type=float,
                        default=0.1,
                        help="initial weight ranges from -0.1 to 0.1")

    # Optimization
    parser.add_argument("--loss_type",
                        default="softmax",
                        choices=["softmax", "contrast"],
                        help="loss type for optimization")
    parser.add_argument("--optimizer",
                        type=str,
                        default="sgd",
                        help="type of optimizer")
    parser.add_argument("--learning_rate",
                        type=float,
                        default=0.01,
                        help="learning rate")
    parser.add_argument("--l2_norm_clip",
                        type=float,
                        default=3.0,
                        help="L2-norm of gradient is clipped at")

    # Train
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=64,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--lstm_proj_clip",
                        type=float,
                        default=0.5,
                        help="Gradient scale for projection node in LSTM")
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")

    # Scaled Cosine similarity
    parser.add_argument(
        "--scale_clip",
        type=float,
        default=0.01,
        help="Gradient scale for scale values in scaled cosine similarity")

    # Collect hparams
    args = parser.parse_args()

    # Set up Queue
    global_queue = queue.Queue()
    # Set up Feeder
    libri_feeder = Feeder(args, "train", "libri")
    libri_feeder.set_up_feeder(global_queue)

    vox1_feeder = Feeder(args, "train", "vox1")
    vox1_feeder.set_up_feeder(global_queue)

    vox2_feeder = Feeder(args, "train", "vox2")
    vox2_feeder.set_up_feeder(global_queue)

    # Set up Model

    model = GE2E(args)
    graph = model.set_up_model("train")

    # Training
    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:

        train_writer = tf.summary.FileWriter(args.ckpt_dir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(args.ckpt_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Restoring Variables from {}'.format(
                ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            start_step = sess.run(model.global_step)

        else:
            print('start from 0')
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            start_step = 1

        for num_step in range(start_step, args.max_step + 1):

            print("current step: " + str(num_step) + "th step")

            batch = global_queue.get()

            summary, training_loss, _ = sess.run(
                [model.sim_mat_summary, model.total_loss, model.optimize],
                feed_dict={
                    model.input_batch: batch[0],
                    model.target_batch: batch[1]
                })
            train_writer.add_summary(summary, num_step)
            print("batch loss:" + str(training_loss))

            if num_step % args.checkpoint_freq == 0:
                save_path = saver.save(sess,
                                       args.ckpt_dir + "/model.ckpt",
                                       global_step=model.global_step)
                print("model saved in file: %s / %d th step" %
                      (save_path, sess.run(model.global_step)))
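
For reference, the loop above consumes (input_batch, target_batch) tuples that the three feeders push into global_queue. A minimal sketch of the batch layout implied by the flags, assuming the Feeder stacks N = num_spk_per_batch speakers by M = num_utt_per_batch utterances with a 10 ms frame shift (the exact shapes are an assumption, not taken from the Feeder source):

import numpy as np

# Hypothetical GE2E batch layout: N speakers x M utterances per batch.
N, M = 64, 10            # num_spk_per_batch, num_utt_per_batch
num_frames = 160         # 1.6 s segment at an assumed 10 ms frame shift
feat_dim = 40            # spectrogram_scale (log-mel filterbank channels)

input_batch = np.random.randn(N * M, num_frames, feat_dim).astype(np.float32)
target_batch = np.repeat(np.arange(N), M)   # speaker index for each utterance

assert input_batch.shape == (N * M, num_frames, feat_dim)
assert target_batch.shape == (N * M,)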
Example #2
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # Path

    # wav name formatting: id_clip_uttnum.wav
    parser.add_argument("--in_dir", type=str, required=True, help="input dir")
    parser.add_argument("--out_dir", type=str, required=True, help="out dir")
    parser.add_argument("--batch_inference",
                        action="store_true",
                        help="set whether to use the batch inference")
    parser.add_argument("--dataset", type=str, default="libri", help="out dir")
    parser.add_argument("--in_wav1", type=str, help="input wav1 dir")
    parser.add_argument("--in_wav2",
                        default="temp.wav",
                        type=str,
                        help="input wav2 dir")
    #/home/hdd2tb/ninas96211/dev_wav_set
    parser.add_argument("--mode",
                        default="infer",
                        choices=["train", "test", "infer"],
                        help="setting mode for execution")

    parser.add_argument("--ckpt_file",
                        type=str,
                        default='./xckpt/model.ckpt-58100',
                        help="checkpoint to start with for inference")

    # Data
    #parser.add_argument("--window_length", type=int, default=160, help="sliding window length(frames)")
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--overlap_ratio",
                        type=float,
                        default=0.5,
                        help="overlaping percentage")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")
    # Enrol
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=5,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")
    parser.add_argument('--gpu', default='0', help='GPU id(s) for CUDA_VISIBLE_DEVICES')
    parser.add_argument('--gpu_num',
                        default=4,
                        help='number of GPUs to use')

    # Collect hparams
    args = parser.parse_args()

    import os
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    feeder = Feeder(args)
    feeder.set_up_feeder()

    model = GE2E(args)
    graph = model.set_up_model()

    # Inference
    with graph.as_default():
        saver = tf.train.Saver()

    #num_gpu=4
    #sess_arr=[]
    #for i in range(num_gpu):
    #    gpu_options = tf.GPUOptions(visible_device_list=str(i))
    #    sess_arr.append(tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)))
    #    saver.restore(sess_arr[i], args.ckpt_file)

    #    t = Thread(target=worker)
    #    t.daemon = True
    #    t.start()

    #    save_dvector_of_dir_parallel(sess_arr, feeder, model, args)

    with tf.Session(graph=graph) as sess:
        # restore from checkpoints

        saver.restore(sess, args.ckpt_file)

        #get_dvector_of_dir(sess, feeder, model, args)

        t = Thread(target=worker)
        t.daemon = True
        t.start()
        save_dvector_of_dir_parallel(sess, feeder, model, args)
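
Note that worker is not defined in this snippet; it is presumably a queue-draining function defined elsewhere in the module. A minimal sketch of the daemon-thread pattern used above, with a hypothetical worker body and queue:

import queue
from threading import Thread

work_queue = queue.Queue()  # hypothetical queue that save_dvector_of_dir_parallel fills

def worker():
    # Daemon thread: drain items until the main process exits.
    while True:
        item = work_queue.get()
        # ... e.g., write the computed d-vector for this item to disk ...
        work_queue.task_done()

t = Thread(target=worker)
t.daemon = True
t.start()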
Example #3
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # Path

    # wav name formatting: id_clip_uttnum.wav
    parser.add_argument("--in_wav1",
                        type=str,
                        required=True,
                        help="input wav1 dir")
    parser.add_argument("--in_wav2",
                        type=str,
                        required=True,
                        help="input wav2 dir")
    #/home/hdd2tb/ninas96211/dev_wav_set
    parser.add_argument("--mode",
                        default="infer",
                        choices=["train", "test", "infer"],
                        help="setting mode for execution")

    parser.add_argument("--ckpt_file",
                        type=str,
                        required=True,
                        help="checkpoint to start with for inference")

    # Data
    #parser.add_argument("--window_length", type=int, default=160, help="sliding window length(frames)")
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--overlap_ratio",
                        type=float,
                        default=0.5,
                        help="overlaping percentage")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")
    # Enrol
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=5,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")

    # Collect hparams
    args = parser.parse_args()

    feeder = Feeder(args, "infer")
    feeder.set_up_feeder()

    model = GE2E(args)
    graph = model.set_up_model("infer")

    # Inference

    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:
        # restore from checkpoints

        saver.restore(sess, args.ckpt_file)

        wav1_data, wav2_data, match = feeder.create_infer_batch()

        # score

        wav1_out = sess.run(model.norm_out,
                            feed_dict={model.input_batch: wav1_data})
        wav2_out = sess.run(model.norm_out,
                            feed_dict={model.input_batch: wav2_data})

        wav1_dvector = np.mean(wav1_out, axis=0)
        wav2_dvector = np.mean(wav2_out, axis=0)

        final_score = np.dot(wav1_dvector, wav2_dvector) / (
            np.linalg.norm(wav1_dvector) * np.linalg.norm(wav2_dvector))

        print("final score:" + str(final_score))
        print("same? :" + str(match))
Example #4
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # Path

    # wav name formatting: id_clip_uttnum.wav
    parser.add_argument("--test_dir",
                        type=str,
                        required=True,
                        help="input test dir")
    #/home/hdd2tb/ninas96211/dev_wav_set

    parser.add_argument("--ckpt_file",
                        type=str,
                        required=True,
                        help="checkpoint to start with for inference")

    # Data
    #parser.add_argument("--window_length", type=int, default=160, help="sliding window length(frames)")
    parser.add_argument("--segment_length",
                        type=float,
                        default=1.6,
                        help="segment length in seconds")
    parser.add_argument("--overlap_ratio",
                        type=float,
                        default=0.5,
                        help="overlaping percentage")
    parser.add_argument("--spectrogram_scale",
                        type=int,
                        default=40,
                        help="scale of the input spectrogram")
    # Enrol
    parser.add_argument("--num_spk_per_batch",
                        type=int,
                        default=5,
                        help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch",
                        type=int,
                        default=10,
                        help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--num_lstm_stacks",
                        type=int,
                        default=3,
                        help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells",
                        type=int,
                        default=768,
                        help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection",
                        type=int,
                        default=256,
                        help="dimension of LSTM projection")

    # Total Score
    # (Sum of True-pair Scores) - (Sum of False-pair Scores)
    # If the model is perfect, the score will be num_of_true_pairs;
    # if the model is completely wrong, it will be -num_of_false_pairs.

    # Collect hparams
    args = parser.parse_args()
    global_queue = queue.Queue()
    feeder = Feeder(args, "test")
    feeder.set_up_feeder(global_queue)

    model = GE2E(args)
    graph = model.set_up_model("test")

    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph) as sess:
        # restore from checkpoints

        saver.restore(sess, args.ckpt_file)
        total_score = 0
        num_true_pairs = 0
        num_false_pairs = 0

        while len(feeder.wav_pairs) > 0:
            wav1_data, wav2_data, match = global_queue.get()
            wav1_out = sess.run(model.norm_out,
                                feed_dict={model.input_batch: wav1_data})
            wav2_out = sess.run(model.norm_out,
                                feed_dict={model.input_batch: wav2_data})
            wav1_dvector = np.mean(wav1_out, axis=0)
            wav2_dvector = np.mean(wav2_out, axis=0)
            final_score = np.dot(wav1_dvector, wav2_dvector) / (
                np.linalg.norm(wav1_dvector) * np.linalg.norm(wav2_dvector))
            print("final score:" + str(final_score))
            print("same? :" + str(match))
            if match:
                total_score += final_score
                num_true_pairs += 1
            else:
                total_score -= final_score
                num_false_pairs += 1

    print("in total: " + str(total_score))
    print("num true pairs: " + str(num_true_pairs))
    print("num false pairs: " + str(num_false_pairs))
Example #5
def main():

    # Hyperparameters

    parser = argparse.ArgumentParser()

    # in_dir = ~/wav
    parser.add_argument("--in_dir", type=str, required=True, help="input data(pickle) dir")
    parser.add_argument("--ckpt_dir", type=str, required=True, help="checkpoint to save/ start with for train/inference")
    parser.add_argument("--mode", default="train", choices=["train", "test", "infer"], help="setting mode for execution")
    parser.add_argument('--data_types', nargs='+', default=['libri', 'vox1', 'vox2'])

    # Saving Checkpoints, Data... etc
    parser.add_argument("--max_step", type=int, default=50000, help="maximum steps in training")
    parser.add_argument("--checkpoint_freq", type=int, default=1000, help="how often save checkpoint")
    parser.add_argument("--eval_freq", type=int, default=1, help="how often do the evaluation")

    # Data
    parser.add_argument("--segment_length", type=float, default=1.6, help="segment length in seconds")
    parser.add_argument("--spectrogram_scale", type=int, default=40,
                                           help="scale of the input spectrogram")

    # Initialization
    parser.add_argument("--init_type", type=str, default="uniform", help="type of initializer")
    parser.add_argument("--init_weight_range", type=float, default=0.1, help="initial weight ranges from -0.1 to 0.1")

    # Optimization
    parser.add_argument("--loss_type", default="softmax", choices=["softmax", "contrast"], help="loss type for optimization")
    parser.add_argument("--optimizer", type=str, default="sgd", help="type of optimizer")
    parser.add_argument("--learning_rate", type=float, default=0.01, help="learning rate")
    parser.add_argument("--l2_norm_clip", type=float, default=3.0, help="L2-norm of gradient is clipped at")

    # Train
    parser.add_argument("--num_spk_per_batch", type=int, default=55,
                                           help="N speakers of batch size N*M")
    parser.add_argument("--num_utt_per_batch", type=int, default= 10,
                                           help="M utterances of batch size N*M")

    # LSTM
    parser.add_argument("--lstm_proj_clip", type=float, default=0.5, help="Gradient scale for projection node in LSTM")
    parser.add_argument("--num_lstm_stacks", type=int, default=3, help="number of LSTM stacks")
    parser.add_argument("--num_lstm_cells", type=int, default=768, help="number of LSTM cells")
    parser.add_argument("--dim_lstm_projection", type=int, default=256, help="dimension of LSTM projection")

    # Scaled Cosine similarity
    parser.add_argument("--scale_clip", type=float, default=0.01, help="Gradient scale for scale values in scaled cosine similarity")

    # Collect hparams
    args = parser.parse_args()

    # Set up Queue
    global_queue = queue.Queue()
    eval_queue = queue.Queue()
    # Set up Feeder
    # libri_feeder = Feeder(args, "libri")
    # libri_feeder = Feeder(args, "librispeech/libritest")
    # libri_feeder.set_up_feeder(global_queue)

    #vox1_feeder = Feeder(args, "vox1")
    #vox1_feeder.set_up_feeder(global_queue)

    #vox2_feeder = Feeder(args, "vox2")
    #vox2_feeder.set_up_feeder(global_queue)

    # eval_feeder = Feeder(args, "librispeech/libritest")
    # eval_feeder.set_up_feeder(eval_queue)

    test_feeder = Feeder_without_Queue(args, "librispeech/libritest")
    test_feeder.set_up_feeder()

    # Debugging aid: inspect batch shapes from the feeder
    # while True:
    #     batch = test_feeder.create_train_batch()
    #     print(batch[0].shape, batch[1].shape)
    # exit()
    # Set up Model

    model = GE2E(args)
    graph = model.set_up_model()

    # Training
    with graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        train_writer = tf.summary.FileWriter(args.ckpt_dir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(args.ckpt_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Restoring Variables from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
            start_step = sess.run(model.global_step)

        else:
            print('start from 0')
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            start_step = 1
        # exit()
        start = 0
        from tensorflow.python.client import timeline
        for num_step in range(start_step, args.max_step + 1):
        #for num_step in range(start_step, start_step + 1):
            print("current step: %dth step, time consumed: %f sec" % (num_step, time.time()-start))
            start=time.time()

            start1 = time.time()
            # batch = global_queue.get()
            batch = test_feeder.create_train_batch()
            print("get queue time: %f" % (time.time() - start1))
            # exit()
            #from tensorflow.python.profiler import model_analyzer, option_builder
            #my_profiler = model_analyzer.Profiler(graph=sess.graph)
            #run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            #run_metadata = tf.RunMetadata()

            start2 = time.time()
            #sim_mat_summary, training_loss_summary, training_loss, _ = sess.run([model.sim_mat_summary, model.total_loss_summary, model.total_loss, model.optimize], feed_dict={model.input_batch: batch[0], model.target_batch : batch[1]},  options=run_options, run_metadata=run_metadata)
            # import pdb;pdb.set_trace()
            sim_mat_summary, training_loss_summary, training_loss, _ = \
                sess.run([model.sim_mat_summary, model.total_loss_summary, model.total_loss, model.optimize],
                         feed_dict={model.input_batch: batch[0], model.target_batch: batch[1]})
            #training_loss, _ = sess.run([model.total_loss, model.optimize], feed_dict={model.input_batch: batch[0], model.target_batch : batch[1]})
            #import pdb;pdb.set_trace()
            print("sess run time: %f" % (time.time() - start2))

            ## Create the Timeline object, and write it to a json
            #tl = timeline.Timeline(run_metadata.step_stats)
            #ctf = tl.generate_chrome_trace_format()
            #with open('timeline.json', 'w') as f:
            #    f.write(ctf)

            #my_profiler.add_step(step=num_step, run_meta=run_metadata)
            #profile_op_builder = option_builder.ProfileOptionBuilder( )
            ## sort by time taken
            #profile_op_builder.select(['micros', 'occurrence'])
            #profile_op_builder.order_by('micros')
            #profile_op_builder.with_max_depth(20) # can be any large number
            #profile_op_builder.with_file_output('profile.log') # can be any large number
            #my_profiler.profile_name_scope(profile_op_builder.build())

            train_writer.add_summary(sim_mat_summary, num_step)
            train_writer.add_summary(training_loss_summary, num_step)
            print("batch loss:" + str(training_loss))

            # if num_step % args.checkpoint_freq == 0:
            #     save_path = saver.save(sess, args.ckpt_dir+"/model.ckpt", global_step=model.global_step)
            #     print("model saved in file: %s / %d th step" % (save_path, sess.run(model.global_step)))

            #if num_step % args.eval_freq == 0:
            #    batch = eval_queue.get()
            #    eval_summary, eval_loss_summary, eval_loss = sess.run([model.eval_sim_mat_summary, model.eval_total_loss_summary, model.eval_total_loss], feed_dict={model.input_batch: batch[0], model.target_batch : batch[1]})
            #    train_writer.add_summary(eval_sim_mat_summary, num_step)
            #    train_writer.add_summary(eval_loss_summary, num_step)
        print("last step elapsed time: %f" % (time.time()-start))