Example #1
def main():
    args = arg_parse().parse_args()

    audio_reader = AudioReader(
        input_audio_dir="deep-speaker-data/VCTK-Corpus/",
        output_cache_dir="deep-speaker-data/cache",
        sample_rate=c.AUDIO.SAMPLE_RATE,
        multi_threading=args.multi_threading)

    if args.regenerate_full_cache:
        start_regenerate = time.time()
        regenerate_full_cache(audio_reader, args)
        end_regenerate = time.time()
        print("The time of regeneration is {}".format(end_regenerate -
                                                      start_regenerate))
        exit(1)

    if args.generate_training_inputs:
        start_inputs = time.time()
        generate_cache_from_training_inputs(audio_reader, args)
        end_inputs = time.time()
        print("The time of generating the training inputs is {}".format(end_inputs -
                                                                         start_inputs))
        exit(1)

    if args.update_cache:
        start_update = time.time()
        audio_reader.build_new_cache()
        end_update = time.time()
        print("The time of updating the cache is {}".format(end_update -
                                                             start_update))
        exit(1)
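These examples all call an arg_parse() helper that is not reproduced on this page. As a rough sketch only, with flag names inferred from how args is used in the surrounding examples (the real parser lives in the deep-speaker repository and may differ), it could look like this:

import argparse


def arg_parse():
    # Sketch of the command-line interface assumed by the examples; every
    # option below is inferred from an args.<name> access on this page.
    parser = argparse.ArgumentParser(
        description='AudioReader cache and inference utilities (sketch)')
    parser.add_argument('--audio_dir', type=str, default='deep-speaker-data/VCTK-Corpus/')
    parser.add_argument('--cache_output_dir', type=str, default='deep-speaker-data/cache')
    parser.add_argument('--multi_threading', action='store_true')
    parser.add_argument('--regenerate_full_cache', action='store_true')
    parser.add_argument('--generate_training_inputs', action='store_true')
    parser.add_argument('--update_cache', action='store_true')
    parser.add_argument('--extra_speakers', action='store_true')
    parser.add_argument('--unseen_speakers', type=str, default=None)  # e.g. 'p225,p226'
    parser.add_argument('--get_embeddings', type=str, default=None)   # a single speaker id
    return parser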
Example #2
def main():
    args = arg_parse().parse_args()

    audio_reader = AudioReader(input_audio_dir=args.audio_dir,
                               output_cache_dir=args.cache_output_dir,
                               sample_rate=c.AUDIO.SAMPLE_RATE,
                               multi_threading=args.multi_threading)

    if args.regenerate_full_cache:
        regenerate_full_cache(audio_reader, args)
        exit(1)

    if args.update_cache:
        audio_reader.build_cache()
        exit(1)

    if args.generate_training_inputs:
        generate_cache_from_training_inputs(audio_reader, args)
        exit(1)

    if args.unseen_speakers is not None:
        unseen_speakers = [x.strip() for x in args.unseen_speakers.split(',')]
        from unseen_speakers import inference_unseen_speakers
        inference_unseen_speakers(audio_reader, unseen_speakers[0],
                                  unseen_speakers[1])
        exit(1)

    if args.get_embeddings is not None:
        speaker_id = args.get_embeddings.strip()
        from unseen_speakers import inference_embeddings
        inference_embeddings(audio_reader, speaker_id)
        exit(1)
Example #3
def main():
    args = get_arguments()

    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    # Create coordinator.
    coord = tf.train.Coordinator()

    # Load raw waveform from VCTK corpus.
    with tf.name_scope('create_inputs'):
        # Allow silence trimming to be skipped by specifying a threshold near
        # zero.
        silence_threshold = args.silence_threshold if args.silence_threshold > EPSILON else None

        gc_enabled = args.gc_channels is not None
        reader = AudioReader(
            args.data_dir,
            coord,
            sample_rate=wavenet_params['sample_rate'],
            gc_enabled=gc_enabled,
            receptive_field=WaveNetModel.calculate_receptive_field(
                wavenet_params['filter_width'], wavenet_params['dilations'],
                wavenet_params['scalar_input'],
                wavenet_params['initial_filter_width']),
            sample_size=args.sample_size,
            silence_threshold=silence_threshold)
        audio_batch = reader.dequeue(args.batch_size)
        if gc_enabled:
            gc_id_batch = reader.dequeue_gc(args.batch_size)
        else:
            gc_id_batch = None
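WaveNetModel.calculate_receptive_field, used above, is not shown on this page. A sketch consistent with the upstream tensorflow-wavenet implementation (an approximation for reference, not the exact code of this fork):

def calculate_receptive_field(filter_width, dilations, scalar_input,
                              initial_filter_width):
    # Each dilated convolution widens the receptive field by
    # (filter_width - 1) * dilation samples; the initial causal
    # convolution adds its own filter width on top.
    receptive_field = (filter_width - 1) * sum(dilations) + 1
    if scalar_input:
        receptive_field += initial_filter_width - 1
    else:
        receptive_field += filter_width - 1
    return receptive_field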
Example #4
def generate():
    args = get_script_arguments()
    if args.speakers_sub_list is None or len(args.speakers_sub_list) == 0:
        speakers_sub_list = None
    else:
        speakers_sub_list = args.speakers_sub_list.split(',')
    AudioReader(audio_dir=args.audio_dir,
                sample_rate=args.sample_rate,
                cache_dir=args.output_dir,
                speakers_sub_list=speakers_sub_list)
Example #5
def load_wav(self, coord):
    if self.data_dir:
        data_dir = self.data_dir
    else:
        data_dir = os.path.join("./data", self.dataset_name)
    EPSILON = 0.001
    silence_threshold = (self.audio_params['silence_threshold']
                         if self.audio_params['silence_threshold'] > EPSILON
                         else None)
    reader = AudioReader(
        data_dir,
        coord,
        sample_rate=self.audio_params['sample_rate'],
        sample_length=self.sample_length,
        silence_threshold=silence_threshold)
    return reader
Example #6
def main():

    # Parse the arguments: the multi-threading flag, the audio directory and the output directory for the cache
    args = arg_parse().parse_args()

    # Create an audio reader that returns a cache (dictionary) and metadata (dictionary)
    audio_reader = AudioReader(input_audio_dir=args.audio_dir,
                               output_cache_dir=args.cache_output_dir,
                               sample_rate=c.AUDIO.SAMPLE_RATE,
                               multi_threading=args.multi_threading)

    # Generate the cache for the audio files. Caching usually involves resampling the WAV files at 8 kHz and trimming the silences.
    regenerate_full_cache(audio_reader, args)

    # Generate the inputs used for softmax training: MFCC windows randomly sampled from the cached audio files and stored in a single pickle file.
    generate_cache_from_training_inputs(audio_reader, args)
Example #7
def main():
    args = arg_parse().parse_args()

    audio_reader = AudioReader(input_audio_dir="deep-speaker-data/VCTK-Corpus",
                               output_cache_dir="deep-speaker-data/cache/",
                               sample_rate=c.AUDIO.SAMPLE_RATE)

    if args.get_embeddings is not None:
        start_get = time.time()
        speaker_id = args.get_embeddings.strip()
        from unseen_speakers import inference_embeddings
        inference_embeddings(audio_reader, speaker_id)
        end_get = time.time()
        print("The time of computing the embeddings is {}".format(end_get -
                                                                   start_get))
        exit(1)
Example #8
def main():
    args = arg_parse().parse_args()

    if args.extra_speakers:
        input_audio_dir = "samples/"
    else:
        input_audio_dir = "deep-speaker-data/VCTK-Corpus"

    audio_reader = AudioReader(input_audio_dir=input_audio_dir,
                               output_cache_dir="deep-speaker-data/cache",
                               sample_rate=c.AUDIO.SAMPLE_RATE)

    if args.unseen_speakers is not None:
        start_unseen = time.time()
        unseen_speakers = [x.strip() for x in args.unseen_speakers.split(',')]
        from unseen_speakers import inference_unseen_speakers
        inference_unseen_speakers(audio_reader, unseen_speakers[0],
                                  unseen_speakers[1])
        end_unseen = time.time()
        print("The time of the unseen-speaker inference is {}".format(end_unseen -
                                                                       start_unseen))
        exit(1)
Example #9
# Some configs
num_features = 13
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 10000
num_hidden = 100
num_layers = 1
batch_size = 1

num_examples = 1
num_batches_per_epoch = int(num_examples / batch_size)

audio = AudioReader(audio_dir=c.AUDIO.VCTK_CORPUS_PATH,
                    sample_rate=c.AUDIO.SAMPLE_RATE)

file_logger = FileLogger('out.tsv', [
    'curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler',
    'random_shift'
])


def run_ctc():
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features])
Example #10
# Some configs
num_features = 13  # log filter bank or MFCC features
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 100
num_hidden = 256
batch_size = 16

num_examples = 1
num_batches_per_epoch = 10

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir=None,
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
Example #11
# Hyper-parameters
num_epochs = 10000
num_hidden = 256
batch_size = 16

num_examples = 1
num_batches_per_epoch = 10

# model path
modelpath = 'models'
modelname = 'ctc'

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir='train',
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger(
    'out.tsv',
    ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target']))
                               for (k, v) in audio.cache.items()])
Example #12
def generate():
    speakers_sub_list = ['p225']
    AudioReader(audio_dir=c.AUDIO.VCTK_CORPUS_PATH,
                sample_rate=c.AUDIO.SAMPLE_RATE,
                speakers_sub_list=speakers_sub_list)
Example #13
# Some configs
num_features = 13  # log filter bank or MFCC features
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 100
num_hidden = 256
batch_size = 16

num_examples = 1
num_batches_per_epoch = 10

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir=None,
                    cache_dir='/content/drive/My Drive/train_cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
Example #14
sample_rate = 16000
# Some configs
num_features = 78  # log filter bank or MFCC features
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 1
num_hidden = 1024
batch_size = 346
num_examples = 1
num_batches_per_epoch = 1

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir='test',
                    cache_dir='cache_test',
                    sample_rate=sample_rate)


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    i = 0
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target']))
                               for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 346
        if train:
Example #15
with tf.name_scope('create_inputs'):
    # Allow silence trimming to be skipped by specifying a threshold near
    # zero.
    silence_threshold = None

    #AUDIO_FILE_PATH = '/home/sriramso/data/VCTK-Corpus'
    AUDIO_FILE_PATH = '/home/andrewszot/VCTK-Corpus'
    #AUDIO_FILE_PATH = '/Users/andrewszot/Downloads/VCTK-Corpus'

    gc_enabled = False
    reader = AudioReader(
        AUDIO_FILE_PATH,
        coord,
        sample_rate=wavenet_params['sample_rate'],
        gc_enabled=gc_enabled,
        receptive_field=calculate_receptive_field(
            wavenet_params["filter_width"],
            wavenet_params["dilations"],
            wavenet_params["scalar_input"],
            wavenet_params["initial_filter_width"]),
        sample_size=39939,
        silence_threshold=silence_threshold)

    audio_batch = reader.dequeue(1)
    if gc_enabled:
        gc_id_batch = reader.dequeue_gc(1)
    else:
        gc_id_batch = None

global_step = tf.Variable(0, trainable=False)

sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
Example #16
def main():
    args = get_arguments()

    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    restore_params = os.path.join(restore_from, 'config.yaml')
    try:
        with open(restore_params) as f:
            scrygan_params = yaml.load(f)
    except IOError:
        print("no restore")
        with open('default_params.yaml', 'r') as f:
            scrygan_params = yaml.load(f)
        try:
            if args.params:
                with open(args.params, 'r') as f:
                    scrygan_params.update(yaml.load(f))
        except IOError:
            print("No params file found, using defaults.")
    print("Loaded params: {}".format(yaml.dump(scrygan_params)))

    batch_size = scrygan_params["batch_size"]
    sample_rate = 16000
    sample_size = scrygan_params["sample_size"]
    overlap_size = scrygan_params["overlap_size"]
    save_interval = scrygan_params["save_interval"]
    fast_z = scrygan_params["fast_z"]
    num_t = scrygan_params["num_t"]
    print("sample_size: {}".format(sample_size))
    num_steps = scrygan_params["num_steps"]
    with tf.name_scope('create_inputs'):
        reader = AudioReader(args.data_dir,
                             batch_size=batch_size,
                             sample_size=sample_size,
                             overlap_size=overlap_size,
                             num_t=num_t)
    model = ScryGanModel(batch_size=batch_size,
                         sample_size=sample_size,
                         **scrygan_params["model"])

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    print('discriminator shape: {}'.format(model.D.shape))
    print('d_loss shape: {}'.format(model.d_loss.shape))
    d_optim = tf.train.AdamOptimizer(scrygan_params["d_learning_rate"],
                                     beta1=0.5).minimize(model.d_loss,
                                                         var_list=model.d_vars)
    print('generator shape: {}'.format(model.G.shape))
    print('g_loss shape: {}'.format(model.g_loss.shape))
    g_optim = tf.train.AdamOptimizer(scrygan_params["g_learning_rate"],
                                     beta1=0.5).minimize(model.g_loss,
                                                         var_list=model.g_vars)
    init = tf.global_variables_initializer()
    sess.run(init)
    model.g_sum = tf.summary.merge(
        [model.z_sum, model.d__sum, model.d_loss_fake_sum, model.g_loss_sum])
    model.d_sum = tf.summary.merge(
        [model.z_sum, model.d_sum, model.d_loss_real_sum, model.d_loss_sum])
    writer = tf.summary.FileWriter(logdir, sess.graph)
    saver = tf.train.Saver(var_list=tf.trainable_variables())
    text_file = open(os.path.join(logdir, "config.yaml"), "w")
    text_file.write(yaml.dump(scrygan_params))
    text_file.close()

    saved_global_step = -1
    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1
    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    step = None
    last_saved_step = saved_global_step
    #profiler = tf.profiler.Profiler(sess.graph)
    print("Seconds scanned per audio file: {:.1f}".format(sample_size /
                                                          sample_rate))
    try:
        for step in range(saved_global_step + 1, num_steps):
            batch = reader.get_batch()
            start_time = time.time()
            spectrograms = []
            for idx, full_audio in enumerate(batch):
                audio_sequence = []
                for t in range(0, int(num_t / 2)):
                    start = t * sample_size - overlap_size * t
                    audio = full_audio[start:start + sample_size]
                    _, _, Sxx = signal.spectrogram(audio,
                                                   16000,
                                                   nperseg=256,
                                                   nfft=256)
                    Sxx = misc.imresize(Sxx, (128, 128))
                    audio_sequence.append(Sxx[0:64, 0:64])
                    audio_sequence.append(Sxx[0:64, 64:])
                spectrograms.append(audio_sequence)
            #spectrograms = np.array(spectrograms)
            spectrograms = np.array(spectrograms) / 256.0
            g_state = model.g_zero_state()
            d_state = model.d_zero_state()
            d_state_ = model.d_zero_state()
            slow_z = model.z_dim - fast_z
            slow_z_batch = np.random.uniform(
                -1, 1, [model.batch_size, slow_z]).astype(np.float32)
            do_sampling = np.mod(step, save_interval) == 0
            samples = []
            for t in range(num_t):
                if fast_z == 0:
                    batch_z = slow_z_batch
                elif slow_z == 0:
                    fast_z_batch = np.random.uniform(
                        -1, 1, [model.batch_size, fast_z]).astype(np.float32)
                    batch_z = fast_z_batch
                else:
                    fast_z_batch = np.random.uniform(
                        -1, 1, [model.batch_size, fast_z]).astype(np.float32)
                    batch_z = np.concatenate([slow_z_batch, fast_z_batch],
                                             axis=1)
                #print("spectograms.shape: {}".format(spectrograms.shape))
                t_batch = spectrograms[:, t]
                #print("t_batch.shape: {}".format(t_batch.shape))
                raw_audio_batch = np.array(t_batch)
                raw_audio_batch = np.expand_dims(raw_audio_batch, axis=-1)

                # Update network
                feed_dict = {model.inputs: raw_audio_batch, model.z: batch_z}
                model.d_load_placeholders(model.D, feed_dict, d_state)
                model.d_load_placeholders(model.D_, feed_dict, d_state_)
                model.g_load_placeholders(model.G, feed_dict, g_state)
                _, _, errD_fake, errD_real, errG, d_summary_str, g_summary_str, d_state, d_state_, g_state, t_samples = sess.run(
                    [
                        d_optim, g_optim, model.d_loss_fake, model.d_loss_real,
                        model.g_loss, model.d_sum, model.g_sum,
                        model.state_out[model.D], model.state_out[model.D_],
                        model.state_out[model.G],
                        model.G if do_sampling else model.g_sum
                    ],
                    feed_dict=feed_dict)
                writer.add_summary(d_summary_str, step)
                writer.add_summary(g_summary_str, step)
                samples.append(t_samples)

            if do_sampling:
                save(saver, sess, logdir, step)
                last_saved_step = step
                real_images = []
                for idx in range(24):
                    for t in range(6):
                        real_images.append(spectrograms[idx, t, :, :])
                save_images(
                    np.array(real_images).reshape([144, 64, 64, 1]), (12, 12),
                    os.path.join(logdir, 'real_{:04d}.png'.format(step)))
                print("real sample saved")
                generator_images = []
                for idx in range(24):
                    for t in range(6):
                        generator_images.append(samples[t][idx])
                generator_images = np.array(generator_images).reshape(
                    [144, 64, 64, 1])
                save_images(
                    generator_images, (12, 12),
                    os.path.join(logdir, 'generator_{:04d}.png'.format(step)))
            print("Epoch: [%03d] time: %4.4f, d_loss: %.8f, g_loss: %.8f" \
                % (step, time.time() - start_time, errD_fake+errD_real, errG))

    except KeyboardInterrupt:
        print()
    finally:
        pass
Example #17
def main():
    args = get_arguments()

    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    logdir_root = directories['logdir_root']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    # Create coordinator.
    coord = tf.train.Coordinator()

    # Load raw waveform from VCTK corpus.
    with tf.name_scope('create_inputs'):
        reader = AudioReader(args.data_dir,
                             coord,
                             sample_rate=wavenet_params['sample_rate'],
                             sample_size=args.sample_size)
        audio_batch = reader.dequeue(args.batch_size)

    # Create network.
    net = WaveNet(
        batch_size=args.batch_size,
        dilations=wavenet_params["dilations"],
        filter_width=wavenet_params["filter_width"],
        residual_channels=wavenet_params["residual_channels"],
        dilation_channels=wavenet_params["dilation_channels"],
        skip_channels=wavenet_params["skip_channels"],
        quantization_channels=wavenet_params["quantization_channels"],
        use_biases=wavenet_params["use_biases"])
    loss = net.loss(audio_batch)
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up logging for TensorBoard.
    writer = tf.train.SummaryWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    run_metadata = tf.RunMetadata()
    summaries = tf.merge_all_summaries()

    # Set up session
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    init = tf.initialize_all_variables()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver()

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1

    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    try:
        last_saved_step = saved_global_step
        for step in range(saved_global_step + 1, args.num_steps):
            start_time = time.time()
            if args.store_metadata and step % 50 == 0:
                # Slow run that stores extra information for debugging.
                print('Storing metadata')
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                summary, loss_value, _ = sess.run([summaries, loss, optim],
                                                  options=run_options,
                                                  run_metadata=run_metadata)
                writer.add_summary(summary, step)
                writer.add_run_metadata(run_metadata,
                                        'step_{:04d}'.format(step))
                tl = timeline.Timeline(run_metadata.step_stats)
                timeline_path = os.path.join(logdir, 'timeline.trace')
                with open(timeline_path, 'w') as f:
                    f.write(tl.generate_chrome_trace_format(show_memory=True))
            else:
                summary, loss_value, _ = sess.run([summaries, loss, optim])
                writer.add_summary(summary, step)

            duration = time.time() - start_time
            print('step {:d} - loss = {:.3f}, ({:.3f} sec/step)'.format(
                step, loss_value, duration))

            if step % 50 == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        if step > last_saved_step:
            save(saver, sess, logdir, step)
        coord.request_stop()
        coord.join(threads)
Example #18
keys = list(d.keys())
for i in range(len(keys) - 1):
    key1 = keys[i]
    for key2 in keys[i + 1:]:
        print(f'{key1} vs {key2} = {cosine(d[key1], d[key2])}')

# print(f'Philip vs Philip2 = {cosine(philip_embed, philip_embed2)}')
# print(f'Philip vs p225 = {cosine(philip_embed, p225_embed)}')
# print(f'Philip vs p225 last = {cosine(philip_embed, p225_embed_last)}')
# print(f'p225 first four vs p225 last four = {cosine(p225_embed, p225_embed_last)}')

import pdb
pdb.set_trace()

audio_reader = AudioReader(input_audio_dir=input_audio_dir,
                           output_cache_dir=cache_dir,
                           sample_rate=c.AUDIO.SAMPLE_RATE,
                           multi_threading=True)
#audio_reader.build_cache()
# print(audio_reader.all_speaker_ids)
# import pdb
# pdb.set_trace()
#regenerate_full_cache(audio_reader, cache_dir)

#unseen_speakers = ['p225', 'PhilippeRemy']

#inference_unseen_speakers(audio_reader, 'p225', 'PhilippeRemy')

#speaker_id = 'p225'
#from unseen_speakers import inference_embeddings
#inference_embeddings(audio_reader, 'PhilippeRemy')
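The pairwise loop at the top of this last example assumes a cosine helper for comparing speaker embeddings. The original most likely imports scipy.spatial.distance.cosine; a minimal NumPy equivalent, shown here only as a reference sketch:

import numpy as np


def cosine(u, v):
    # Cosine distance (1 - cosine similarity) between two embedding vectors.
    u = np.asarray(u, dtype=np.float64).ravel()
    v = np.asarray(v, dtype=np.float64).ravel()
    return 1.0 - np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))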