def __init__(self, coordinator, metadata_filename, hparams): super(DataFeeder, self).__init__() self._coord = coordinator self._hparams = hparams self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] self._offset = 0 # Load metadata: self._datadir = os.path.dirname(metadata_filename) with open(metadata_filename, encoding='utf-8') as f: self._metadata = [line.strip().split('|') for line in f] hours = sum( (int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) # Create placeholders for inputs and targets. Don't specify batch size because we want to # be able to feed different sized batches at eval time. self._placeholders = [ tf.placeholder(tf.int32, [None, None], 'inputs'), tf.placeholder(tf.int32, [None], 'input_lengths'), tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets') ] # Create queue for buffering data: queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue') self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue() self.inputs.set_shape(self._placeholders[0].shape) self.input_lengths.set_shape(self._placeholders[1].shape) self.mel_targets.set_shape(self._placeholders[2].shape) self.linear_targets.set_shape(self._placeholders[3].shape) # Load CMUDict: If enabled, this will randomly substitute some words in the training data with # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for # synthesis (useful for proper nouns, etc.) if hparams.use_cmudict: cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') if not os.path.isfile(cmudict_path): raise Exception( 'If use_cmudict=True, you must download ' + 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) else: self._cmudict = None
def __init__(self, coordinator, metadata_filename, hparams): super(DataFeeder, self).__init__() self._coord = coordinator self._hparams = hparams self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] self._offset = 0 # Load metadata: self._datadir = os.path.dirname(metadata_filename) with open(metadata_filename, encoding='utf-8') as f: self._metadata = [line.strip().split('|') for line in f] hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) # Create placeholders for inputs and targets. Don't specify batch size because we want to # be able to feed different sized batches at eval time. self._placeholders = [ tf.placeholder(tf.int32, [None, None], 'inputs'), tf.placeholder(tf.int32, [None], 'input_lengths'), tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets') ] # Create queue for buffering data: queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue') self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue() self.inputs.set_shape(self._placeholders[0].shape) self.input_lengths.set_shape(self._placeholders[1].shape) self.mel_targets.set_shape(self._placeholders[2].shape) self.linear_targets.set_shape(self._placeholders[3].shape)
def __init__(self, coordinator, metadata_filename, hparams): super(DataFeeder, self).__init__() self._coord = coordinator self._hparams = hparams self._offset = 0 self._datadir = os.path.dirname(metadata_filename) # train.txt with open(metadata_filename, encoding='utf-8') as f: self._metadata = [line.strip().split('|') for line in f] hours = sum( (int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) self._placeholders = [ tf.placeholder(tf.int32, [None, None], 'inputs'), tf.placeholder(tf.int32, [None], 'input_lengths'), tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets') ] queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue') self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue() # FIFO (first-in, first-out) self.inputs.set_shape(self._placeholders[0].shape) self.input_lengths.set_shape(self._placeholders[1].shape) self.mel_targets.set_shape(self._placeholders[2].shape) self.linear_targets.set_shape(self._placeholders[3].shape)
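# --- Usage sketch (added for clarity): how a DataFeeder like the constructors above is
# typically driven from a trainer. This is a minimal, hypothetical example; it assumes the
# feeder is a threading.Thread subclass exposing a start_in_session() helper (as in the
# upstream keithito/tacotron trainer) and that hparams and the training step come from the
# surrounding project. Not part of the original code.
import tensorflow as tf

def _example_training_setup(metadata_filename, hparams):
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = DataFeeder(coord, metadata_filename, hparams)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feeder.start_in_session(sess)  # assumed helper: starts the background enqueue thread
        while not coord.should_stop():
            # a real trainer would run its train op here, consuming feeder.inputs,
            # feeder.input_lengths, feeder.mel_targets and feeder.linear_targets
            break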
def _enqueue_next_group(self): start = time.time() # Read a group of examples: n = self._hparams.batch_size r = self._hparams.outputs_per_step examples = [ self._get_next_example() for i in range(n * _batches_per_group) ] if self.cache_targets and self._num_cached != len( self._cached_mel_targets): self._num_cached = len(self._cached_mel_targets) log('Cached %d targets' % self._num_cached) # Bucket examples based on similar output sequence length for efficiency: examples.sort(key=lambda x: x[-1]) batches = [examples[i:i + n] for i in range(0, len(examples), n)] random.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
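# --- Hedged sketch (added): the group readers above rely on a _prepare_batch helper that is
# not shown in this section. In this family of data feeders it pads each field to the longest
# example in the batch, rounds target lengths up to a multiple of the reduction factor r, and
# returns values in the same order as the placeholders. The assumed example layout is
# (input_ids, mel_target, linear_target, n_frames); names below are illustrative.
import numpy as np

def _prepare_batch_sketch(batch, outputs_per_step):
    inputs = _pad_inputs_sketch([x[0] for x in batch])
    input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
    mel_targets = _pad_targets_sketch([x[1] for x in batch], outputs_per_step)
    linear_targets = _pad_targets_sketch([x[2] for x in batch], outputs_per_step)
    return inputs, input_lengths, mel_targets, linear_targets

def _pad_inputs_sketch(seqs):
    # Zero-pad 1-D token-id sequences to the longest sequence in the batch.
    max_len = max(len(s) for s in seqs)
    return np.stack([np.pad(s, (0, max_len - len(s)), mode='constant') for s in seqs])

def _pad_targets_sketch(targets, r):
    # Zero-pad 2-D [time, feat] targets so every target length is a multiple of r.
    max_len = max(len(t) for t in targets)
    padded_len = max_len + (-max_len) % r
    return np.stack([np.pad(t, ((0, padded_len - len(t)), (0, 0)), mode='constant') for t in targets])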
def wavenet_synthesize(args, hparams, checkpoint): output_dir = 'wavenet_' + args.output_dir try: checkpoint_path = tf.train.get_checkpoint_state( checkpoint).model_checkpoint_path log('loaded model at {}'.format(checkpoint_path)) except AttributeError: #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa) if 'Both' in checkpoint: checkpoint = checkpoint.replace('Both', 'Tacotron-2') elif 'Tacotron-2' in checkpoint: checkpoint = checkpoint.replace('Tacotron-2', 'Both') else: #Synthesizing separately raise AssertionError( 'Cannot restore checkpoint: {}, did you train a model?'.format( checkpoint)) try: #Try loading again checkpoint_path = tf.train.get_checkpoint_state( checkpoint).model_checkpoint_path log('loaded model at {}'.format(checkpoint_path)) except Exception: raise RuntimeError( 'Failed to load checkpoint at {}'.format(checkpoint)) run_synthesis(args, checkpoint_path, output_dir, hparams)
def initialize(self, input_lengths, linear_targets, ppgs=None, mel_targets=None, speakers=None): with tf.variable_scope('inference') as scope: is_training = ppgs is not None hp = self._hparams # Pre-net: [batch, time, feat] # encoder_steps = tf.gather(input_lengths, tf.argmax(input_lengths)) prenet_outputs = prenet(linear_targets, is_training, hp.prenet_depths) post_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training, hp.encoder_depth, 'cbhg_ppgs') # [80->128] logits = tf.layers.dense(post_outputs, hp.num_ppgs, name='pred_ppgs') # [128->80] pred_ppgs = tf.nn.softmax(logits, name='ppgs') self.speakers = speakers self.mel_targets = mel_targets self.linear_targets = linear_targets self.input_lengths = input_lengths self.ppgs = ppgs self.logits = logits self.pred_ppgs = pred_ppgs log('Initialized NNet1 model. Dimensions: ') log(' pred_ppgs: {}'.format(pred_ppgs.shape))
def __init__(self, metadata_filename, hparams): self._hparams = hparams # Load metadata: self._datadir = os.path.dirname(metadata_filename) with open(metadata_filename, encoding="utf-8-sig") as f: self._metadata = [line.strip().split('|') for line in f] hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) # Load CMUDict: If enabled, this will randomly substitute some words in the training data with # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for # synthesis (useful for proper nouns, etc.) if hparams.use_cmudict: cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') if not os.path.isfile(cmudict_path): raise Exception( 'If use_cmudict=True, you must download ' + 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) else: self._cmudict = None
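# --- Hedged sketch (added): the "randomly substitute some words with their ARPAbet
# equivalents" behaviour described in the comment above is usually implemented by a small
# helper like this one. It assumes cmudict.CMUDict.lookup(word) returns a list of ARPAbet
# pronunciations or None, and that curly braces mark ARPAbet input for the text cleaners;
# the probability p and the helper name are illustrative.
import random

def _maybe_get_arpabet_sketch(cmudict_instance, word, p=0.5):
    pronunciations = cmudict_instance.lookup(word)
    if pronunciations is not None and random.random() < p:
        return '{%s}' % pronunciations[0]
    return word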
def run_eval(args, checkpoint_path, output_dir, sentences, reference_mel): eval_dir = os.path.join(output_dir, 'eval') log_dir = os.path.join(output_dir, 'logs-eval') assert os.path.normpath(eval_dir) == os.path.normpath( args.mels_dir) #mels_dir = wavenet_input_dir #Create output path if it doesn't exist os.makedirs(eval_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True) os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) print(sentences) log(hparams_debug_string()) synth = Synthesizer() synth.load(checkpoint_path, reference_mel=reference_mel) with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: for i, text in enumerate(tqdm(sentences)): start = time.time() mel_filename = synth.synthesize(text, i + 1, eval_dir, log_dir, None, reference_mel=reference_mel) file.write('{}|{}\n'.format(text, mel_filename)) log('synthesized mel spectrograms at {}'.format(eval_dir)) return eval_dir
def update(self, hparams): with tf.variable_scope('inference') as scope: self._hparams = hparams hp = self._hparams (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(self.output_cell, self.helper, self.decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [self.batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training=False, is_updating=True) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq, reuse=True) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments log('Updated Tacotron model.')
def load(self, checkpoint_path, hparams, model_name='WaveNet'): log('Constructing model: {}'.format(model_name)) self._hparams = hparams local_cond, global_cond = self._check_conditions() self.local_conditions = tf.placeholder( tf.float32, shape=[1, None, hparams.num_mels], name='local_condition_features') if local_cond else None self.global_conditions = tf.placeholder( tf.int32, shape=(), name='global_condition_features') if global_cond else None self.synthesis_length = tf.placeholder( tf.int32, shape=(), name='synthesis_length') if not local_cond else None with tf.variable_scope('model') as scope: self.model = create_model(model_name, hparams) self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions, input_lengths=None, synthesis_length=self.synthesis_length) self._hparams = hparams sh_saver = create_shadow_saver(self.model) log('Loading checkpoint: {}'.format(checkpoint_path)) self.session = tf.Session() self.session.run(tf.global_variables_initializer()) load_averaged_model(self.session, sh_saver, checkpoint_path)
def _enqueue_next_group(self): start = time.time() # Read a group of examples: n = self._hparams.batch_size r = self._hparams.reduction_factor if self.static_batches is not None: batches = self.static_batches else: examples = [] for data_dir in self._datadir: if self._hparams.initial_data_greedy: if self._step < self._hparams.initial_phase_step and any("krbook" in data_dir for data_dir in self._datadir): data_dir = [ data_dir for data_dir in self._datadir if "krbook" in data_dir ][0] if self._step < self._hparams.initial_phase_step: example = [self._get_next_example(data_dir) for _ in range(int(n * self._batches_per_group // len(self._datadir)))] else: example = [self._get_next_example(data_dir) for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))] examples.extend(example) examples.sort(key=lambda x: x[-1]) batches = [examples[i:i + n] for i in range(0, len(examples), n)] self.rng.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict( zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type))) self._session.run(self._enqueue_op, feed_dict=feed_dict) self._step += 1
def make_test_batches(self): start = time.time() #Read one example for evaluation n = 1 #Test on entire test set (one sample at an evaluation step) examples = [self._get_test_groups() for i in range(len(self._test_meta))] batches = [examples[i: i+n] for i in range(0, len(examples), n)] np.random.shuffle(batches) log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) return batches
def synthesize(args, hparams, gst_checkpoint, wave_checkpoint, sentences, reference_mel): log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name)) log('Synthesizing mel-spectrograms from text..') wavenet_in_dir = gst_synthesize(args, gst_checkpoint, sentences, reference_mel) log('Synthesizing audio from mel-spectrograms.. (This may take a while)') wavenet_synthesize(args, hparams, wave_checkpoint) log('Tacotron-2 TTS synthesis complete!')
def run_synthesis(args, checkpoint_path, output_dir): _p_cmudict = 0.5 _punctuation_re = re.compile(r'([\.,"\-_:]+)') GTA = (args.GTA == 'True') if GTA: synth_dir = os.path.join(output_dir, 'gta') #Create output path if it doesn't exist os.makedirs(synth_dir, exist_ok=True) else: synth_dir = os.path.join(output_dir, 'natural') #Create output path if it doesn't exist os.makedirs(synth_dir, exist_ok=True) metadata_filename = os.path.join(args.input_dir, 'train.txt') if hparams.use_cmudict: cmudict_path = os.path.join(os.path.dirname(metadata_filename), 'cmudict-0.7b') if not os.path.isfile(cmudict_path): raise Exception( 'If use_cmudict=True, you must download the CMU dictionary first. ' + 'Run shell as:\n wget -P %s http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b' % os.path.dirname(metadata_filename)) _cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) log('Loaded CMUDict with %d unambiguous entries' % len(_cmudict)) else: _cmudict = None log(hparams_debug_string()) synth = Synthesizer() synth.load(checkpoint_path, gta=GTA) with open(metadata_filename, encoding='utf-8') as f: metadata = [line.strip().split('|') for line in f] log('starting synthesis') mel_dir = os.path.join(args.input_dir, 'mels') wav_dir = os.path.join(args.input_dir, 'linear') with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: for i, meta in enumerate(tqdm(metadata)): text = re.sub(_punctuation_re, r' \1 ', meta[3]) if _cmudict and random.random() < _p_cmudict: text = ' '.join([ maybe_get_arpabet(_cmudict, word) for word in text.split(' ') ]) mel_filename = os.path.join(mel_dir, meta[1]) wav_filename = os.path.join(wav_dir, meta[0]) mel_output_filename = synth.synthesize(text, i + 1, synth_dir, None, mel_filename) file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text)) log('synthesized mel spectrograms at {}'.format(synth_dir)) return os.path.join(synth_dir, 'map.txt')
def _enqueue_next_group(self): start = time.time() n = self._hparams.batch_size r = self._hparams.outputs_per_step examples = [ self._get_next_example() for i in range(n * _batches_per_group) ] examples.sort(key=lambda x: x[-1]) batches = [examples[i:i + n] for i in range(0, len(examples), n)] random.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
def _enqueue_next_group(self): start = time.time() # Read a group of examples: n = self._hparams.batch_size r = self._hparams.outputs_per_step examples = [self._get_next_example() for i in range(n * _batches_per_group)] # Bucket examples based on similar output sequence length for efficiency: examples.sort(key=lambda x: x[-1]) batches = [examples[i:i+n] for i in range(0, len(examples), n)] random.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
def gst_synthesize(args, checkpoint, sentences=None, reference_mel=None): output_dir = "gst_" + args.output_dir checkpoint_path = tf.train.get_checkpoint_state( checkpoint).model_checkpoint_path log('loaded model at {}'.format(checkpoint_path)) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' hparams.parse(args.hparams) if args.mode == 'eval': return run_eval(args, checkpoint_path, output_dir, sentences, reference_mel) elif args.mode == 'synthesis': return run_synthesis(args, checkpoint_path, output_dir) else: run_live(args, checkpoint_path)
def _enqueue_next_train_group(self): while not self._coord.should_stop(): start = time.time() # Read a group of examples n = self._hparams.wavenet_batch_size examples = [self._get_next_example() for i in range(n * _batches_per_group)] # Bucket examples based on similar output length for efficiency examples.sort(key=lambda x: x[-1]) batches = [examples[i: i+n] for i in range(0, len(examples), n)] np.random.shuffle(batches) log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
def __init__(self, coordinator, metadata_filename, hparams): super(DataFeeder, self).__init__() self._coord = coordinator self._hparams = hparams self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] self._offset = 0 # Load metadata: self._datadir = os.path.dirname(metadata_filename) with open(metadata_filename, encoding='utf-8') as f: self._metadata = [line.strip().split('|') for line in f] hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) # Create placeholders for inputs and targets. Don't specify batch size because we want to # be able to feed different sized batches at eval time. self._placeholders = [ tf.placeholder(tf.int32, [None, None], 'inputs'), tf.placeholder(tf.int32, [None], 'input_lengths'), tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets') ] # Create queue for buffering data: queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue') self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue() self.inputs.set_shape(self._placeholders[0].shape) self.input_lengths.set_shape(self._placeholders[1].shape) self.mel_targets.set_shape(self._placeholders[2].shape) self.linear_targets.set_shape(self._placeholders[3].shape) # Load CMUDict: If enabled, this will randomly substitute some words in the training data with # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for # synthesis (useful for proper nouns, etc.) if hparams.use_cmudict: cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') if not os.path.isfile(cmudict_path): raise Exception('If use_cmudict=True, you must download ' + 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) else: self._cmudict = None
def _enqueue_next_group(self): start = time.time() # Read a group of examples: n = self._hparams.batch_size r = self._hparams.outputs_per_step examples = [self._get_next_example() for i in range(n * _batches_per_group)] # Bucket examples based on similar output sequence length for efficiency: examples.sort(key=lambda x: x[-1]) batches = [examples[i:i+n] for i in range(0, len(examples), n)] # shape = (batch_size, _batches_per_group, [input, mel, linear, len]) # batches as one batch group random.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
def make_test_batches(self): start = time.time() # Read a group of examples n = self._hparams.batch_size r = self._hparams.outputs_per_step #Test on entire test set examples = [ self._get_test_groups() for i in range(len(self._test_meta)) ] # Bucket examples based on similar output sequence length for efficiency examples.sort(key=lambda x: x[-1]) batches = [examples[i:i + n] for i in range(0, len(examples), n)] np.random.shuffle(batches) log('\nGenerated {} test batches of size {} in {:.3f} sec'.format( len(batches), n, time.time() - start)) return batches, r
def _enqueue_next_group(self): start = time.time() # Read a group of examples: n = self._hparams.batch_size r = self._hparams.outputs_per_step examples = [ self._get_next_example() for i in range(n * _batches_per_group) ] # Bucket examples based on similar output sequence length for efficiency: examples.sort(key=lambda x: x[-1]) batches = [examples[i:i + n] for i in range(0, len(examples), n)] random.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: # sample in batch> x[0]: input_fea, x[1]: lpc_target, x[2]: stop_token_target, x[3]: lpc_target_len. feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
def _enqueue_next_group(self): start = time.time() # Read a group of examples: n = self._hparams.batch_size r = self._hparams.outputs_per_step examples = [] for data_path in self.data_paths: example = [ self._get_next_example(data_path) for i in range( int(n * _batches_per_group // len(self.data_paths))) ] examples.extend(example) # Bucket examples based on similar output sequence length for efficiency: examples.sort(key=lambda x: x[-1]) batches = [examples[i:i + n] for i in range(0, len(examples), n)] random.shuffle(batches) log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) for batch in batches: feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) self._session.run(self._enqueue_op, feed_dict=feed_dict)
def get_path_dict(data_dirs, hparams, config, data_type, n_test=None, rng=np.random.RandomState(123)): # Load metadata: path_dict = {} for data_dir in data_dirs: paths = glob("{}/*.npz".format(data_dir)) if data_type == 'train': rng.shuffle(paths) if not config.skip_path_filter: items = parallel_run(get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True) min_n_frame = hparams.reduction_factor * hparams.min_iters max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor new_items = [(path, n) for path, n, n_tokens in items if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens] if any(check in data_dir for check in ["son", "yuinna"]): blacklists = [".0000.", ".0001.", "NB11479580.0001"] new_items = [item for item in new_items if all(check not in item[0] for check in blacklists)] new_paths = [path for path, n in new_items] new_n_frames = [n for path, n in new_items] hours = frames_to_hours(new_n_frames) log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(data_dir, len(new_n_frames), hours)) log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames))) log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames))) else: new_paths = paths if data_type == 'train': new_paths = new_paths[:-n_test] elif data_type == 'test': new_paths = new_paths[-n_test:] else: raise Exception(" [!] Unknown data_type: {}".format(data_type)) path_dict[data_dir] = new_paths return path_dict
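# --- Hedged sketch (added): frames_to_hours() is referenced above but not defined in this
# section. Given that the metadata stores spectrogram frame counts and frame_shift_ms is in
# milliseconds (as in the DataFeeder constructors earlier), it reduces to a one-liner; the
# default frame shift below is illustrative only.
def frames_to_hours_sketch(n_frames, frame_shift_ms=12.5):
    return sum(n_frames) * frame_shift_ms / (3600 * 1000)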
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): with tf.variable_scope('embedding') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=512] with tf.variable_scope('encoder') as scope: x = embedded_inputs for i in range(hp.encoder_stack_size): x = tf.layers.conv1d(x, filters=hp.encoder_conv_filter, kernel_size=hp.encoder_conv_kernel, padding='same', activation=tf.nn.relu) x = tf.layers.batch_normalization(x, training=is_training) lstm_fw = LSTMCell(hp.encoder_lstm_hidden_dim) lstm_bw = LSTMCell(hp.encoder_lstm_hidden_dim) encoder_conv_output = x outputs, states = tf.nn.bidirectional_dynamic_rnn(lstm_fw, lstm_bw, encoder_conv_output, sequence_length=input_lengths, dtype=tf.float32) # [N, T_in, 512] encoder_output = tf.concat(outputs, axis=2) # with tf.variable_scope('decoder') as scope: self.inputs = inputs self.input_lengths = input_lengths # self.mel_outputs = mel_outputs # self.linear_outputs = linear_outputs # self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' embedding: %d' % embedded_inputs.shape[-1]) log(' encoder out: %d' % encoder_output.shape[-1])
def run_synthesis(args, checkpoint_path, output_dir, hparams): log_dir = os.path.join(output_dir, 'plots') wav_dir = os.path.join(output_dir, 'wavs') #We assume the user provides the correct folder for the chosen training method log(hparams_debug_string()) synth = Synthesizer() synth.load(checkpoint_path, hparams) metadata_filename = os.path.join(args.mels_dir, 'map.txt') with open(metadata_filename, encoding='utf-8') as f: metadata = [line.strip().split('|') for line in f] frame_shift = hparams.hop_size / hparams.sample_rate # seconds per frame hours = sum([int(x[-1]) for x in metadata]) * frame_shift / 3600 log('Loaded metadata for {} examples ({:.2f} hours)'.format( len(metadata), hours)) metadata = np.array(metadata) mel_files = metadata[:, 1] texts = metadata[:, 0] log('Starting synthesis! (this will take a while..)') os.makedirs(log_dir, exist_ok=True) os.makedirs(wav_dir, exist_ok=True) with open(os.path.join(wav_dir, 'map.txt'), 'w') as file: for i, mel_file in enumerate(tqdm(mel_files)): mel_spectro = np.load(mel_file) audio_file = synth.synthesize(mel_spectro, None, i + 1, wav_dir, log_dir) if texts is None: file.write('{}|{}\n'.format(mel_file, audio_file)) else: file.write('{}|{}|{}\n'.format(texts[i], mel_file, audio_file)) log('synthesized audio waveforms at {}'.format(wav_dir))
def run_live(args, checkpoint_path): #Log to Terminal without keeping any records in files log(hparams_debug_string()) synth = Synthesizer() synth.load(checkpoint_path, hparams) #Generate fast greeting message greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!' log(greetings) generate_fast(synth, greetings) #Interaction loop while True: try: text = input() generate_fast(synth, text) except KeyboardInterrupt: leave = 'Thank you for testing our features. see you soon.' log(leave) generate_fast(synth, leave) sleep(2) break
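# --- Hedged sketch (added): generate_fast() is not shown in this section. In the Tacotron-2
# reference implementation it is a thin wrapper that synthesizes audio for immediate playback
# without writing plots or map files; the all-None directory arguments below mirror the
# synth.synthesize(text, index, out_dir, log_dir, mel_filename) call pattern used in run_eval
# above, but this exact signature is an assumption, not the original code.
def generate_fast_sketch(model, text):
    model.synthesize(text, None, None, None, None)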
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings # embedding_table = tf.get_variable( # 'embedding', [len(symbols), 256], dtype=tf.float32, # initializer=tf.truncated_normal_initializer(stddev=0.5)) # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] # embedded_inputs = inputs # Encoder # n_fft = (self._hparams.num_src_freq - 1) * 2 # in_layer_size = n_fft in_layer_size = self._hparams.num_src_freq prenet_outputs = prenet(inputs, is_training, layer_sizes=[in_layer_size, 128]) # [N, T_in, 128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] # Attention attention_cell = AttentionWrapper( DecoderPrenetWrapper(GRUCell(256), is_training), BahdanauAttention(256, encoder_outputs), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output into a 512D vector. concat_cell = ConcatOutputAndAttentionWrapper( attention_cell) # [N, T_in, 512] # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, 256), ResidualWrapper(GRUCell(256)), ResidualWrapper(GRUCell(256)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper( decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) else: helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape( decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets log('Initialized Tacotron model. Dimensions: ') log(' input: %d' % inputs.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
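# --- Added illustration: the [N, T_out/r, M*r] -> [N, T_out, M] reshape used by the decoder
# above is the reduction-factor trick (each decoder step emits r frames stacked on the
# feature axis). A small numpy check of that equivalence, with made-up sizes:
import numpy as np

N, T, M, r = 4, 10, 80, 2
frames = np.random.rand(N, T, M).astype(np.float32)
stacked = frames.reshape(N, T // r, M * r)   # what the decoder emits: r frames per step
unstacked = stacked.reshape(N, -1, M)        # what tf.reshape recovers: one frame per step
assert np.array_equal(unstacked, frames)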
def initialize(self, inputs, input_lengths, inputs_jp=None, mel_targets=None, linear_targets=None ): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None is_teacher_force_generating = mel_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings # embedding_table = tf.get_variable( # 'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, # initializer=tf.truncated_normal_initializer(stddev=0.5)) # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] if hp.use_gst: #Global style tokens (GST) gst_tokens = tf.get_variable( 'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) self.gst_tokens = gst_tokens # Encoder # prenet_outputs = prenet(embedded_inputs, is_training) prenet_outputs = prenet(inputs, is_training) # [N, T_in, 128] encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training) # [N, T_in, 256] if inputs_jp is not None: # Reference encoder refnet_outputs = reference_encoder( inputs_jp, filters=hp.reference_filters, kernel_size=(3,3), strides=(2,2), encoder_cell=GRUCell(hp.reference_depth), is_training=is_training) # [N, 128] self.refnet_outputs = refnet_outputs if hp.use_gst: # Style attention style_attention = MultiheadAttention( tf.expand_dims(refnet_outputs, axis=1), # [N, 1, 128] tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])), # [N, hp.num_gst, 256/hp.num_heads] num_heads=hp.num_heads, num_units=hp.style_att_dim, attention_type=hp.style_att_type) style_embeddings = style_attention.multi_head_attention() # [N, 1, 256] else: style_embeddings = tf.expand_dims(refnet_outputs, axis=1) # [N, 1, 128] else: print("Use random weight for GST.") random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32) random_weights = tf.nn.softmax(random_weights, name="random_weights") style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens)) style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]]) # Add style embedding to every text encoder state style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128] encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1) # Attention attention_cell = AttentionWrapper( GRUCell(hp.attention_depth), BahdanauAttention(hp.attention_depth, encoder_outputs, memory_sequence_length=input_lengths), alignment_history=True, output_attention=False) # [N, T_in, 256] # Concatenate attention context vector and RNN cell output. 
concat_cell = ConcatOutputAndAttentionWrapper(attention_cell) # Decoder (layers specified bottom to top): decoder_cell = MultiRNNCell([ OutputProjectionWrapper(concat_cell, hp.rnn_depth), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)), ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)) ], state_is_tuple=True) # [N, T_in, 256] # Project onto r mel spectrograms (predict r outputs at each RNN step): output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step) decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32) if is_training or is_teacher_force_generating: helper = TacoTrainingHelper(inputs, mel_targets, hp) else: helper = TacoTestHelper(batch_size, hp) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(output_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] # Add post-processing CBHG: post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) # [N, T_out, 256] linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.mel_outputs = mel_outputs self.encoder_outputs = encoder_outputs self.style_embeddings = style_embeddings self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.inputs_jp = inputs_jp log('Initialized Tacotron model. Dimensions: ') log(' style embedding: %d' % style_embeddings.shape[-1]) log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % decoder_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
def __init__(self, coordinator, metadata_filename, hparams, config, batches_per_group, data_type, batch_size): super(DataFeeder, self).__init__() self._coord = coordinator self._hparams = hparams self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] self._step = 0 self._offset = defaultdict(lambda: 2) self._batches_per_group = batches_per_group self.rng = np.random.RandomState(config.random_seed) self.data_type = data_type self.batch_size = batch_size self.min_tokens = hparams.min_tokens self.min_n_frame = hparams.reduction_factor * hparams.min_iters self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor self.skip_path_filter = config.skip_path_filter # Load metadata: self._datadir = os.path.dirname(metadata_filename) with open(metadata_filename, encoding='utf-8') as f: self._metadata = [line.strip().split('|') for line in f] hours = sum( (int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) # Create placeholders for inputs and targets. Don't specify batch size because we want to # be able to feed different sized batches at eval time. self._placeholders = [ tf.placeholder(tf.int32, [None, None], 'inputs'), tf.placeholder(tf.int32, [None], 'input_lengths'), tf.placeholder(tf.float32, [None], 'loss_coeff'), tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets') ] # Create queue for buffering data: dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32] self._placeholders.append(tf.placeholder(tf.int32, [None], 'inputs'), ) dtypes.append(tf.int32) num_worker = 8 if self.data_type == 'train' else 1 queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue') self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.loss_coeff, self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue( ) self.inputs.set_shape(self._placeholders[0].shape) self.input_lengths.set_shape(self._placeholders[1].shape) self.loss_coeff.set_shape(self._placeholders[2].shape) self.mel_targets.set_shape(self._placeholders[3].shape) self.linear_targets.set_shape(self._placeholders[4].shape) self.speaker_id.set_shape(self._placeholders[5].shape) self._cmudict = None # # Load CMUDict: If enabled, this will randomly substitute some words in the training data with # # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for # # synthesis (useful for proper nouns, etc.) # if hparams.use_cmudict: # cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') # if not os.path.isfile(cmudict_path): # raise Exception('If use_cmudict=True, you must download ' + # 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) # self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) # log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) # else: # self._cmudict = None if self.data_type == 'test': examples = [] while True: for data_dir in self._datadir: examples.append(self._get_next_example(data_dir)) #print(data_dir, text.sequence_to_text(examples[-1][0], False, True)) if len(examples) >= self.batch_size: break if len(examples) >= self.batch_size: break self.static_batches = [ examples for _ in range(self._batches_per_group) ] else: self.static_batches = None
def build(self, rgb): """ load variable from npy to build the VGG :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1] """ start_time = time.time() log('Building VGG19. Started at: %ds' % start_time) rgb_scaled = rgb * 255.0 # Convert RGB to BGR red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb_scaled) assert red.get_shape().as_list()[1:] == [224, 224, 1] assert green.get_shape().as_list()[1:] == [224, 224, 1] assert blue.get_shape().as_list()[1:] == [224, 224, 1] bgr = tf.concat(axis=3, values=[ blue - VGG_MEAN[0], green - VGG_MEAN[1], red - VGG_MEAN[2], ]) assert bgr.get_shape().as_list()[1:] == [224, 224, 3] self.conv1_1 = self.conv_layer(bgr, "conv1_1") self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2") self.pool1 = self.max_pool(self.conv1_2, 'pool1') self.conv2_1 = self.conv_layer(self.pool1, "conv2_1") self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2") self.pool2 = self.max_pool(self.conv2_2, 'pool2') self.conv3_1 = self.conv_layer(self.pool2, "conv3_1") self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2") self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3") self.conv3_4 = self.conv_layer(self.conv3_3, "conv3_4") self.pool3 = self.max_pool(self.conv3_4, 'pool3') self.conv4_1 = self.conv_layer(self.pool3, "conv4_1") self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2") self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3") self.conv4_4 = self.conv_layer(self.conv4_3, "conv4_4") self.pool4 = self.max_pool(self.conv4_4, 'pool4') self.conv5_1 = self.conv_layer(self.pool4, "conv5_1") self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2") self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3") self.conv5_4 = self.conv_layer(self.conv5_3, "conv5_4") self.pool5 = self.max_pool(self.conv5_4, 'pool5') self.fc6 = self.fc_layer(self.pool5, "fc6") assert self.fc6.get_shape().as_list()[1:] == [4096] self.relu6 = tf.nn.relu(self.fc6) self.fc7 = self.fc_layer(self.relu6, "fc7") self.relu7 = tf.nn.relu(self.fc7) self.fc8 = self.fc_layer(self.relu7, "fc8") log("finished building VGG19 in %ds" % (time.time() - start_time)) return self.fc8
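# --- Hedged usage sketch (added): driving the build() method above. It assumes the enclosing
# class is the usual Vgg19 from the tensorflow-vgg style of code, whose constructor loads
# pretrained weights from an .npy file; 'vgg19.npy' and the 1000-class output size are
# assumptions, not taken from the code above.
import numpy as np
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 224, 224, 3])  # RGB scaled to [0, 1]
vgg = Vgg19('vgg19.npy')
logits = vgg.build(images)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.rand(2, 224, 224, 3).astype(np.float32)
    print(sess.run(logits, feed_dict={images: batch}).shape)  # e.g. (2, 1000)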
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, reference_mel=None): '''Initializes the model for inference. Sets "mel_outputs", "linear_outputs", and "alignments" fields. Args: inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of steps in the input time series, and values are character IDs input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths of each sequence in inputs. mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number of steps in the output time series, M is num_mels, and values are entries in the mel spectrogram. Only needed for training. linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number of steps in the output time series, F is num_freq, and values are entries in the linear spectrogram. Only needed for training. ''' with tf.variable_scope('inference') as scope: is_training = linear_targets is not None batch_size = tf.shape(inputs)[0] hp = self._hparams # Embeddings embedding_table = tf.get_variable( 'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, 256] #Global style tokens (GST) gst_tokens = tf.get_variable( 'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) self.gst_tokens = gst_tokens # Encoder encoder_outputs = encoder(embedded_inputs, input_lengths, is_training, 512, 5, 256) # [N, T_in, 256] if is_training: reference_mel = mel_targets if reference_mel is not None: # Reference encoder refnet_outputs = reference_encoder( reference_mel, filters=hp.ref_filters, kernel_size=(3, 3), strides=(2, 2), encoder_cell=GRUCell(hp.ref_depth), is_training=is_training) # [N, 128] self.refnet_outputs = refnet_outputs # Style attention style_attention = MultiheadAttention( tf.expand_dims(refnet_outputs, axis=1), # [N, 1, 128] tf.tanh( tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size, 1, 1 ])), # [N, hp.num_gst, 256/hp.num_heads] num_heads=hp.num_heads, num_units=hp.style_att_dim, attention_type=hp.style_att_type) embedded_tokens = style_attention.multi_head_attention( ) # [N, 1, 256] else: random_weights = tf.constant( hp.num_heads * [[0] * (hp.gst_index - 1) + [1] + [0] * (hp.num_gst - hp.gst_index)], dtype=tf.float32) random_weights = tf.nn.softmax(random_weights, name="random_weights") # gst_tokens = tf.tile(gst_tokens, [1, hp.num_heads]) embedded_tokens = tf.matmul(random_weights, tf.nn.tanh(gst_tokens)) embedded_tokens = hp.gst_scale * embedded_tokens embedded_tokens = tf.reshape( embedded_tokens, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]]) # Add style embedding to every text encoder state style_embeddings = tf.tile( embedded_tokens, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128] encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1) # Attention attention_mechanism = LocationSensitiveAttention( 128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length=input_lengths, smoothing=False, cumulate_weights=True) decoder_lstm = [ ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='decoder_LSTM_{}'.format(i + 1)) for i in range(2) ] decoder_lstm = MultiRNNCell(decoder_lstm, state_is_tuple=True) decoder_init_state = decoder_lstm.zero_state( 
batch_size=batch_size, dtype=tf.float32) # not present in TensorFlow 1 attention_cell = AttentionWrapper( decoder_lstm, attention_mechanism, initial_cell_state=decoder_init_state, alignment_history=True, output_attention=False) # attention_state_size = 256 # Decoder input -> prenet -> decoder_lstm -> concat[output, attention] # dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) dec_outputs_cell = OutputProjectionWrapper( attention_cell, (hp.num_mels) * hp.outputs_per_step) if is_training: helper = TacoTrainingHelper(inputs, mel_targets, hp) else: helper = TacoTestHelper(batch_size, hp) decoder_init_state = dec_outputs_cell.zero_state( batch_size=batch_size, dtype=tf.float32) (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( BasicDecoder(dec_outputs_cell, helper, decoder_init_state), maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] # Reshape outputs to be one output per entry decoder_mel_outputs = tf.reshape( decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step], [batch_size, -1, hp.num_mels]) # [N, T_out, M] x = decoder_mel_outputs for i in range(5): activation = tf.nn.tanh if i != (4) else None x = tf.layers.conv1d(x, filters=512, kernel_size=5, padding='same', activation=activation, name='Postnet_{}'.format(i)) x = tf.layers.batch_normalization(x, training=is_training) x = tf.layers.dropout(x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i)) residual = tf.layers.dense(x, hp.num_mels, name='residual_projection') mel_outputs = decoder_mel_outputs + residual # Add post-processing CBHG: # mel_outputs: (N,T,num_mels) post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) linear_outputs = tf.layers.dense( post_outputs, hp.num_freq) # [N, T_out, F(1025)] # [N, T_out, F] # Grab alignments from the final decoder state: alignments = tf.transpose( final_decoder_state.alignment_history.stack(), [1, 2, 0]) self.inputs = inputs self.input_lengths = input_lengths self.decoder_mel_outputs = decoder_mel_outputs self.mel_outputs = mel_outputs self.encoder_outputs = encoder_outputs self.style_embeddings = style_embeddings self.linear_outputs = linear_outputs self.alignments = alignments self.mel_targets = mel_targets self.linear_targets = linear_targets self.reference_mel = reference_mel self.all_vars = tf.trainable_variables() log('Initialized Tacotron model. Dimensions: ') log(' text embedding: %d' % embedded_inputs.shape[-1]) log(' style embedding: %d' % style_embeddings.shape[-1]) # log(' prenet out: %d' % prenet_outputs.shape[-1]) log(' encoder out: %d' % encoder_outputs.shape[-1]) log(' attention out: %d' % attention_cell.output_size) # log(' concat attn & out: %d' % concat_cell.output_size) log(' decoder cell out: %d' % dec_outputs_cell.output_size) log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) log(' postnet out: %d' % post_outputs.shape[-1]) log(' linear out: %d' % linear_outputs.shape[-1])
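# --- Added illustration: at inference time (no reference mel) the branch above builds a fixed
# one-hot weight row per attention head that selects style token hp.gst_index, softmaxes it,
# and matmuls it with the tanh'd token table. A small numpy version of that selection; the
# sizes (num_heads, num_gst, token_dim, gst_index) are made up for the example.
import numpy as np

num_heads, num_gst, token_dim, gst_index = 4, 10, 64, 3
weights = np.array(num_heads * [[0] * (gst_index - 1) + [1] + [0] * (num_gst - gst_index)], dtype=np.float32)
weights = np.exp(weights) / np.exp(weights).sum(axis=1, keepdims=True)  # softmax softens the one-hot rows
gst_tokens = np.random.randn(num_gst, token_dim).astype(np.float32)
embedded_tokens = weights @ np.tanh(gst_tokens)  # [num_heads, token_dim], then scaled/reshaped in the model
print(embedded_tokens.shape)  # (4, 64)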