def main():
    args = arg_parse().parse_args()
    audio_reader = AudioReader(input_audio_dir="deep-speaker-data/VCTK-Corpus/",
                               output_cache_dir="deep-speaker-data/cache",
                               sample_rate=c.AUDIO.SAMPLE_RATE,
                               multi_threading=args.multi_threading)
    if args.regenerate_full_cache:
        start_regenerate = time.time()
        regenerate_full_cache(audio_reader, args)
        end_regenerate = time.time()
        print("Regenerating the full cache took {:.2f}s".format(end_regenerate - start_regenerate))
        exit(1)
    if args.generate_training_inputs:
        start_inputs = time.time()
        generate_cache_from_training_inputs(audio_reader, args)
        end_inputs = time.time()
        print("Generating the training inputs took {:.2f}s".format(end_inputs - start_inputs))
        exit(1)
    if args.update_cache:
        start_update = time.time()
        audio_reader.build_new_cache()
        end_update = time.time()
        print("Updating the cache took {:.2f}s".format(end_update - start_update))
        exit(1)
def main():
    args = arg_parse().parse_args()
    audio_reader = AudioReader(input_audio_dir=args.audio_dir,
                               output_cache_dir=args.cache_output_dir,
                               sample_rate=c.AUDIO.SAMPLE_RATE,
                               multi_threading=args.multi_threading)
    if args.regenerate_full_cache:
        regenerate_full_cache(audio_reader, args)
        exit(1)
    if args.update_cache:
        audio_reader.build_cache()
        exit(1)
    if args.generate_training_inputs:
        generate_cache_from_training_inputs(audio_reader, args)
        exit(1)
    if args.unseen_speakers is not None:
        unseen_speakers = [x.strip() for x in args.unseen_speakers.split(',')]
        from unseen_speakers import inference_unseen_speakers
        inference_unseen_speakers(audio_reader, unseen_speakers[0], unseen_speakers[1])
        exit(1)
    if args.get_embeddings is not None:
        speaker_id = args.get_embeddings.strip()
        from unseen_speakers import inference_embeddings
        inference_embeddings(audio_reader, speaker_id)
        exit(1)
def main():
    args = get_arguments()
    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    # Create coordinator.
    coord = tf.train.Coordinator()

    # Load raw waveform from VCTK corpus.
    with tf.name_scope('create_inputs'):
        # Allow silence trimming to be skipped by specifying a threshold near zero.
        silence_threshold = args.silence_threshold if args.silence_threshold > EPSILON else None
        gc_enabled = args.gc_channels is not None
        reader = AudioReader(
            args.data_dir,
            coord,
            sample_rate=wavenet_params['sample_rate'],
            gc_enabled=gc_enabled,
            receptive_field=WaveNetModel.calculate_receptive_field(
                wavenet_params['filter_width'],
                wavenet_params['dilations'],
                wavenet_params['scalar_input'],
                wavenet_params['initial_filter_width']),
            sample_size=args.sample_size,
            silence_threshold=silence_threshold)
        audio_batch = reader.dequeue(args.batch_size)
        if gc_enabled:
            gc_id_batch = reader.dequeue_gc(args.batch_size)
        else:
            gc_id_batch = None
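# For context, a minimal sketch of how a WaveNet-style receptive field is
# typically computed from filter_width and dilations. This is an assumed,
# standalone illustration; WaveNetModel.calculate_receptive_field in the
# actual repository may differ in details.
def calculate_receptive_field_sketch(filter_width, dilations, scalar_input,
                                     initial_filter_width):
    # Each dilated layer widens the receptive field by (filter_width - 1) * dilation.
    receptive_field = (filter_width - 1) * sum(dilations) + 1
    # The initial causal layer adds its own contribution, which depends on
    # whether the input is scalar or one-hot encoded.
    if scalar_input:
        receptive_field += initial_filter_width - 1
    else:
        receptive_field += filter_width - 1
    return receptive_field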
def generate():
    args = get_script_arguments()
    if args.speakers_sub_list is None or len(args.speakers_sub_list) == 0:
        speakers_sub_list = None
    else:
        speakers_sub_list = args.speakers_sub_list.split(',')
    AudioReader(audio_dir=args.audio_dir,
                sample_rate=args.sample_rate,
                cache_dir=args.output_dir,
                speakers_sub_list=speakers_sub_list)
def load_wav(self, coord):
    if self.data_dir:
        data_dir = self.data_dir
    else:
        data_dir = os.path.join("./data", self.dataset_name)
    EPSILON = 0.001
    silence_threshold = self.audio_params['silence_threshold'] \
        if self.audio_params['silence_threshold'] > EPSILON else None
    reader = AudioReader(
        data_dir,
        coord,
        sample_rate=self.audio_params['sample_rate'],
        sample_length=self.sample_length,
        silence_threshold=silence_threshold)
    return reader
def main():
    # Parse arguments: multi-threading flag, cache output directory and audio directory.
    args = arg_parse().parse_args()

    # Create an audio reader that exposes a cache (dict) and metadata (dict).
    audio_reader = AudioReader(input_audio_dir=args.audio_dir,
                               output_cache_dir=args.cache_output_dir,
                               sample_rate=c.AUDIO.SAMPLE_RATE,
                               multi_threading=args.multi_threading)

    # Generate the cache for the audio files. Caching usually involves resampling
    # the WAV files (e.g. at 8 kHz) and trimming the silences.
    regenerate_full_cache(audio_reader, args)

    # Generate the inputs used for softmax training: MFCC windows randomly sampled
    # from the cached audio files and stored in a single pickle file.
    generate_cache_from_training_inputs(audio_reader, args)
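# To illustrate what the caching step above typically does, here is a minimal,
# hypothetical sketch using librosa: resample, trim silence, compute MFCCs and
# randomly sample a fixed-length window. The real AudioReader may use different
# parameters and feature settings.
import librosa
import numpy as np


def extract_mfcc_window(wav_path, sample_rate=8000, num_mfcc=13, window_frames=100):
    # Load and resample the audio file to the target rate.
    audio, sr = librosa.load(wav_path, sr=sample_rate)
    # Trim leading and trailing silence.
    audio, _ = librosa.effects.trim(audio, top_db=30)
    # MFCC matrix has shape (num_mfcc, num_frames).
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc)
    if mfcc.shape[1] <= window_frames:
        return mfcc
    # Randomly sample a fixed-length window of frames for training.
    start = np.random.randint(0, mfcc.shape[1] - window_frames)
    return mfcc[:, start:start + window_frames]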
def main():
    args = arg_parse().parse_args()
    audio_reader = AudioReader(input_audio_dir="deep-speaker-data/VCTK-Corpus",
                               output_cache_dir="deep-speaker-data/cache/",
                               sample_rate=c.AUDIO.SAMPLE_RATE)
    if args.get_embeddings is not None:
        start_get = time.time()
        speaker_id = args.get_embeddings.strip()
        from unseen_speakers import inference_embeddings
        inference_embeddings(audio_reader, speaker_id)
        end_get = time.time()
        print("Computing the embeddings took {:.2f}s".format(end_get - start_get))
        exit(1)
def main():
    args = arg_parse().parse_args()
    if args.extra_speakers:
        input_audio_dir = "samples/"
    else:
        input_audio_dir = "deep-speaker-data/VCTK-Corpus"
    audio_reader = AudioReader(input_audio_dir=input_audio_dir,
                               output_cache_dir="deep-speaker-data/cache",
                               sample_rate=c.AUDIO.SAMPLE_RATE)
    if args.unseen_speakers is not None:
        start_unseen = time.time()
        unseen_speakers = [x.strip() for x in args.unseen_speakers.split(',')]
        from unseen_speakers import inference_unseen_speakers
        inference_unseen_speakers(audio_reader, unseen_speakers[0], unseen_speakers[1])
        end_unseen = time.time()
        print("Unseen-speaker inference took {:.2f}s".format(end_unseen - start_unseen))
        exit(1)
# Some configs
num_features = 13
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 10000
num_hidden = 100
num_layers = 1
batch_size = 1
num_examples = 1
num_batches_per_epoch = int(num_examples / batch_size)

audio = AudioReader(audio_dir=c.AUDIO.VCTK_CORPUS_PATH,
                    sample_rate=c.AUDIO.SAMPLE_RATE)

file_logger = FileLogger('out.tsv', [
    'curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler', 'random_shift'
])


def run_ctc():
    graph = tf.Graph()
    with graph.as_default():
        # e.g. log filter bank or MFCC features.
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step.
        inputs = tf.placeholder(tf.float32, [None, None, num_features])
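# The num_classes arithmetic above works out to 26 letters + space + CTC blank = 28.
# A minimal, hypothetical sketch of a matching label encoding (the actual targets
# stored in the audio cache may be built differently): index 0 is reserved for the
# space character, 'a'..'z' map to 1..26, and the CTC blank is implicitly class 27.
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # so that 'a' -> 1, ..., 'z' -> 26


def encode_transcript(text):
    return [SPACE_INDEX if ch == ' ' else ord(ch) - FIRST_INDEX
            for ch in text.lower() if ch == ' ' or 'a' <= ch <= 'z']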
# Some configs
num_features = 13  # log filter bank or MFCC features
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 100
num_hidden = 256
batch_size = 16
num_examples = 1
num_batches_per_epoch = 10

# Make sure the values match the ones in generate_audio_cache.py.
audio = AudioReader(audio_dir=None,
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler',
                                     'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
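# The dense target sequences collected in y_batch are usually converted to the
# sparse (indices, values, shape) triple expected by tf.nn.ctc_loss. A commonly
# used helper, shown here as an assumed sketch rather than code from this repo:
import numpy as np


def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), indices.max(0)[1] + 1], dtype=np.int64)
    return indices, values, shape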
# Hyper-parameters
num_epochs = 10000
num_hidden = 256
batch_size = 16
num_examples = 1
num_batches_per_epoch = 10

# Model path
modelpath = 'models'
modelname = 'ctc'

# Make sure the values match the ones in generate_audio_cache.py.
audio = AudioReader(audio_dir='train',
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger(
    'out.tsv', ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
def generate():
    speakers_sub_list = ['p225']
    AudioReader(audio_dir=c.AUDIO.VCTK_CORPUS_PATH,
                sample_rate=c.AUDIO.SAMPLE_RATE,
                speakers_sub_list=speakers_sub_list)
# Some configs
num_features = 13  # log filter bank or MFCC features
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 100
num_hidden = 256
batch_size = 16
num_examples = 1
num_batches_per_epoch = 10

# Make sure the values match the ones in generate_audio_cache.py.
audio = AudioReader(audio_dir=None,
                    cache_dir='/content/drive/My Drive/train_cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler',
                                     'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
sample_rate = 16000

# Some configs
num_features = 78  # log filter bank or MFCC features
# Accounting for the 0th index + space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 1
num_hidden = 1024
batch_size = 346
num_examples = 1
num_batches_per_epoch = 1

# Make sure the values match the ones in generate_audio_cache.py.
audio = AudioReader(audio_dir='test',
                    cache_dir='cache_test',
                    sample_rate=sample_rate)


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    i = 0
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 346
        if train:
with tf.name_scope('create_inputs'):
    # Allow silence trimming to be skipped by specifying a threshold near zero.
    silence_threshold = None
    # AUDIO_FILE_PATH = '/home/sriramso/data/VCTK-Corpus'
    AUDIO_FILE_PATH = '/home/andrewszot/VCTK-Corpus'
    # AUDIO_FILE_PATH = '/Users/andrewszot/Downloads/VCTK-Corpus'
    gc_enabled = False
    reader = AudioReader(
        AUDIO_FILE_PATH,
        coord,
        sample_rate=wavenet_params['sample_rate'],
        gc_enabled=gc_enabled,
        receptive_field=calculate_receptive_field(wavenet_params["filter_width"],
                                                  wavenet_params["dilations"],
                                                  wavenet_params["scalar_input"],
                                                  wavenet_params["initial_filter_width"]),
        sample_size=39939,
        silence_threshold=silence_threshold)
    audio_batch = reader.dequeue(1)
    if gc_enabled:
        gc_id_batch = reader.dequeue_gc(1)
    else:
        gc_id_batch = None

global_step = tf.Variable(0, trainable=False)

sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
def main():
    args = get_arguments()
    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    restore_params = os.path.join(restore_from, 'config.yaml')
    try:
        with open(restore_params) as f:
            scrygan_params = yaml.load(f)
    except IOError:
        print("no restore")
        with open('default_params.yaml', 'r') as f:
            scrygan_params = yaml.load(f)
    try:
        if args.params:
            with open(args.params, 'r') as f:
                scrygan_params.update(yaml.load(f))
    except IOError:
        print("No params file found, using defaults.")
    print("Loaded params: {}".format(yaml.dump(scrygan_params)))

    batch_size = scrygan_params["batch_size"]
    sample_rate = 16000
    sample_size = scrygan_params["sample_size"]
    overlap_size = scrygan_params["overlap_size"]
    save_interval = scrygan_params["save_interval"]
    fast_z = scrygan_params["fast_z"]
    num_t = scrygan_params["num_t"]
    print("sample_size: {}".format(sample_size))
    num_steps = scrygan_params["num_steps"]

    with tf.name_scope('create_inputs'):
        reader = AudioReader(
            args.data_dir,
            batch_size=batch_size,
            sample_size=sample_size,
            overlap_size=overlap_size,
            num_t=num_t)

    model = ScryGanModel(
        batch_size=batch_size,
        sample_size=sample_size,
        **scrygan_params["model"])

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    print('discriminator shape: {}'.format(model.D.shape))
    print('d_loss shape: {}'.format(model.d_loss.shape))
    d_optim = tf.train.AdamOptimizer(scrygan_params["d_learning_rate"], beta1=0.5).minimize(
        model.d_loss, var_list=model.d_vars)
    print('generator shape: {}'.format(model.G.shape))
    print('g_loss shape: {}'.format(model.g_loss.shape))
    g_optim = tf.train.AdamOptimizer(scrygan_params["g_learning_rate"], beta1=0.5).minimize(
        model.g_loss, var_list=model.g_vars)
    init = tf.global_variables_initializer()
    sess.run(init)
    model.g_sum = tf.summary.merge(
        [model.z_sum, model.d__sum, model.d_loss_fake_sum, model.g_loss_sum])
    model.d_sum = tf.summary.merge(
        [model.z_sum, model.d_sum, model.d_loss_real_sum, model.d_loss_sum])
    writer = tf.summary.FileWriter(logdir, sess.graph)
    saver = tf.train.Saver(var_list=tf.trainable_variables())

    text_file = open(os.path.join(logdir, "config.yaml"), "w")
    text_file.write(yaml.dump(scrygan_params))
    text_file.close()

    saved_global_step = -1
    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1
    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    step = None
    last_saved_step = saved_global_step
    # profiler = tf.profiler.Profiler(sess.graph)
    print("Seconds scanned per audio file: {:.1f}".format(sample_size / sample_rate))
    try:
        for step in range(saved_global_step + 1, num_steps):
            batch = reader.get_batch()
            start_time = time.time()
            spectrograms = []
            for idx, full_audio in enumerate(batch):
                audio_sequence = []
                for t in range(0, int(num_t / 2)):
                    start = t * sample_size - overlap_size * t
                    audio = full_audio[start:start + sample_size]
                    _, _, Sxx = signal.spectrogram(audio, 16000, nperseg=256, nfft=256)
                    Sxx = misc.imresize(Sxx, (128, 128))
                    audio_sequence.append(Sxx[0:64, 0:64])
                    audio_sequence.append(Sxx[0:64, 64:])
                spectrograms.append(audio_sequence)
            # spectrograms = np.array(spectrograms)
            spectrograms = np.array(spectrograms) / 256.0
            g_state = model.g_zero_state()
            d_state = model.d_zero_state()
            d_state_ = model.d_zero_state()
            slow_z = model.z_dim - fast_z
            slow_z_batch = np.random.uniform(
                -1, 1, [model.batch_size, slow_z]).astype(np.float32)
            do_sampling = np.mod(step, save_interval) == 0
            samples = []
            for t in range(num_t):
                if fast_z == 0:
                    batch_z = slow_z_batch
                elif slow_z == 0:
                    fast_z_batch = np.random.uniform(
                        -1, 1, [model.batch_size, fast_z]).astype(np.float32)
                    batch_z = fast_z_batch
                else:
                    fast_z_batch = np.random.uniform(
                        -1, 1, [model.batch_size, fast_z]).astype(np.float32)
                    batch_z = np.concatenate([slow_z_batch, fast_z_batch], axis=1)
                # print("spectograms.shape: {}".format(spectrograms.shape))
                t_batch = spectrograms[:, t]
                # print("t_batch.shape: {}".format(t_batch.shape))
                raw_audio_batch = np.array(t_batch)
                raw_audio_batch = np.expand_dims(raw_audio_batch, axis=-1)

                # Update network.
                feed_dict = {model.inputs: raw_audio_batch, model.z: batch_z}
                model.d_load_placeholders(model.D, feed_dict, d_state)
                model.d_load_placeholders(model.D_, feed_dict, d_state_)
                model.g_load_placeholders(model.G, feed_dict, g_state)
                (_, _, errD_fake, errD_real, errG, d_summary_str, g_summary_str,
                 d_state, d_state_, g_state, t_samples) = sess.run(
                    [d_optim, g_optim, model.d_loss_fake, model.d_loss_real,
                     model.g_loss, model.d_sum, model.g_sum,
                     model.state_out[model.D], model.state_out[model.D_],
                     model.state_out[model.G],
                     model.G if do_sampling else model.g_sum],
                    feed_dict=feed_dict)
                writer.add_summary(d_summary_str, step)
                writer.add_summary(g_summary_str, step)
                samples.append(t_samples)
            if do_sampling:
                save(saver, sess, logdir, step)
                last_saved_step = step
                real_images = []
                for idx in range(24):
                    for t in range(6):
                        real_images.append(spectrograms[idx, t, :, :])
                save_images(
                    np.array(real_images).reshape([144, 64, 64, 1]), (12, 12),
                    os.path.join(logdir, 'real_{:04d}.png'.format(step)))
                print("real sample saved")
                generator_images = []
                for idx in range(24):
                    for t in range(6):
                        generator_images.append(samples[t][idx])
                generator_images = np.array(generator_images).reshape([144, 64, 64, 1])
                save_images(
                    generator_images, (12, 12),
                    os.path.join(logdir, 'generator_{:04d}.png'.format(step)))
            print("Epoch: [%03d] time: %4.4f, d_loss: %.8f, g_loss: %.8f"
                  % (step, time.time() - start_time, errD_fake + errD_real, errG))
    except KeyboardInterrupt:
        print()
    finally:
        pass
def main():
    args = get_arguments()
    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    logdir_root = directories['logdir_root']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    # Create coordinator.
    coord = tf.train.Coordinator()

    # Load raw waveform from VCTK corpus.
    with tf.name_scope('create_inputs'):
        reader = AudioReader(args.data_dir,
                             coord,
                             sample_rate=wavenet_params['sample_rate'],
                             sample_size=args.sample_size)
        audio_batch = reader.dequeue(args.batch_size)

    # Create network.
    net = WaveNet(
        batch_size=args.batch_size,
        dilations=wavenet_params["dilations"],
        filter_width=wavenet_params["filter_width"],
        residual_channels=wavenet_params["residual_channels"],
        dilation_channels=wavenet_params["dilation_channels"],
        skip_channels=wavenet_params["skip_channels"],
        quantization_channels=wavenet_params["quantization_channels"],
        use_biases=wavenet_params["use_biases"])
    loss = net.loss(audio_batch)
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up logging for TensorBoard.
    writer = tf.train.SummaryWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    run_metadata = tf.RunMetadata()
    summaries = tf.merge_all_summaries()

    # Set up session.
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    init = tf.initialize_all_variables()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver()

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1
    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    try:
        last_saved_step = saved_global_step
        for step in range(saved_global_step + 1, args.num_steps):
            start_time = time.time()
            if args.store_metadata and step % 50 == 0:
                # Slow run that stores extra information for debugging.
                print('Storing metadata')
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                summary, loss_value, _ = sess.run(
                    [summaries, loss, optim],
                    options=run_options,
                    run_metadata=run_metadata)
                writer.add_summary(summary, step)
                writer.add_run_metadata(run_metadata, 'step_{:04d}'.format(step))
                tl = timeline.Timeline(run_metadata.step_stats)
                timeline_path = os.path.join(logdir, 'timeline.trace')
                with open(timeline_path, 'w') as f:
                    f.write(tl.generate_chrome_trace_format(show_memory=True))
            else:
                summary, loss_value, _ = sess.run([summaries, loss, optim])
                writer.add_summary(summary, step)

            duration = time.time() - start_time
            print('step {:d} - loss = {:.3f}, ({:.3f} sec/step)'.format(
                step, loss_value, duration))

            if step % 50 == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step
    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        if step > last_saved_step:
            save(saver, sess, logdir, step)
        coord.request_stop()
        coord.join(threads)
keys = list(d.keys())
for i in range(len(keys) - 1):
    key1 = keys[i]
    for key2 in keys[i + 1:]:
        print(f'{key1} vs {key2} = {cosine(d[key1], d[key2])}')

# print(f'Philip vs Philip2 = {cosine(philip_embed, philip_embed2)}')
# print(f'Philip vs p225 = {cosine(philip_embed, p225_embed)}')
# print(f'Philip vs p225 last = {cosine(philip_embed, p225_embed_last)}')
# print(f'p225 first four vs p225 last four = {cosine(p225_embed, p225_embed_last)}')

import pdb
pdb.set_trace()

audio_reader = AudioReader(input_audio_dir=input_audio_dir,
                           output_cache_dir=cache_dir,
                           sample_rate=c.AUDIO.SAMPLE_RATE,
                           multi_threading=True)
# audio_reader.build_cache()
# print(audio_reader.all_speaker_ids)
# import pdb
# pdb.set_trace()
# regenerate_full_cache(audio_reader, cache_dir)
# unseen_speakers = ['p225', 'PhilippeRemy']
# inference_unseen_speakers(audio_reader, 'p225', 'PhilippeRemy')
# speaker_id = 'p225'
# from unseen_speakers import inference_embeddings
# inference_embeddings(audio_reader, 'PhilippeRemy')
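# For reference, cosine() above is assumed to be the usual cosine distance on
# embedding vectors (scipy.spatial.distance.cosine behaves the same way); a
# minimal sketch:
import numpy as np


def cosine(u, v):
    # 0.0 for identical directions, up to 2.0 for opposite ones.
    u = np.asarray(u, dtype=np.float64)
    v = np.asarray(v, dtype=np.float64)
    return 1.0 - np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))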