def _enqueue_next_group(self):
    start = time.time()

    # Read a group of examples:
    n = self.batch_size              # 32
    r = self._hp.reduction_factor    # 4 or 5; also used to compute min_n_frame/max_n_frame

    if self.static_batches is not None:
        # 'test' mode uses static_batches, which were already built in __init__.
        batches = self.static_batches
    else:  # 'train'
        examples = []
        for data_dir in self.data_dirs:
            if self._hp.initial_data_greedy:
                if self._step < self._hp.initial_phase_step and \
                        any("krbook" in data_dir for data_dir in self.data_dirs):
                    data_dir = [data_dir for data_dir in self.data_dirs if "krbook" in data_dir][0]

            if self._step < self._hp.initial_phase_step:  # 'initial_phase_step': 8000
                # Build batch data for _batches_per_group (8 or 32) batches, drawn evenly from each data_dir.
                # Each batch has size 2 or 32.
                example = [self._get_next_example(data_dir)
                           for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))]
            else:
                example = [self._get_next_example(data_dir)
                           for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))]
            examples.extend(example)

        # The last element of each example is len(linear_target), so this sorts by output length.
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        self.rng.shuffle(batches)

    log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
    for batch in batches:
        # batches is a group of batches; feed each batch (built for test or train mode) to the placeholders.
        # _prepare_batch pads every example in the batch to a common length;
        # the return order matches the placeholder order.
        feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type)))
        self._session.run(self._enqueue_op, feed_dict=feed_dict)
        self._step += 1
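# _prepare_batch is referenced above but not shown in this section. Below is a minimal
# sketch of what it plausibly does, assuming each example is a tuple of
# (input, loss_coeff, mel_target, linear_target) and ignoring the data_type argument;
# the helper names and tuple layout are assumptions, not the original implementation.
import numpy as np

def _prepare_batch(batch, reduction_factor, rng, data_type=None):
    # Shuffle within the batch so examples of similar length are not always adjacent.
    rng.shuffle(batch)
    inputs = _prepare_inputs([x[0] for x in batch])
    input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
    loss_coeff = np.asarray([x[1] for x in batch], dtype=np.float32)
    mel_targets = _prepare_targets([x[2] for x in batch], reduction_factor)
    linear_targets = _prepare_targets([x[3] for x in batch], reduction_factor)
    return inputs, input_lengths, loss_coeff, mel_targets, linear_targets

def _prepare_inputs(inputs):
    # Zero-pad every 1-D token sequence to the length of the longest one.
    max_len = max(len(x) for x in inputs)
    return np.stack([np.pad(x, (0, max_len - len(x)), mode='constant') for x in inputs])

def _prepare_targets(targets, alignment):
    # Zero-pad 2-D targets, rounding the length up to a multiple of the reduction factor.
    max_len = max(len(t) for t in targets)
    max_len = max_len + (alignment - max_len % alignment) % alignment
    return np.stack([np.pad(t, [(0, max_len - len(t)), (0, 0)], mode='constant') for t in targets])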
def __init__(self, coordinator, metadata_filename, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata
    self._datadir = os.path.dirname(metadata_filename)
    print(metadata_filename)
    with open(metadata_filename, encoding='utf-8') as f:
        self._metadata = [line.strip().split('|') for line in f]
        hours = sum([int(x[1]) for x in self._metadata]) * hparams.frame_shift_ms / (3600 * 1000)
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want
    # to be able to feed different batch sizes at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
        tf.placeholder(tf.int32, shape=(None,), name='input_lengths'),
        tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
        # tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets')
    ]

    # Create queue for buffering data
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
def make_test_batches(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.batch_size
    r = self._hparams.reduction_factor

    # Test on entire test set
    examples = []
    examples_list = []
    examples_size = []
    data_ratio = 1.0 / len(self._test_meta_list)
    for idx, _test_meta in enumerate(self._test_meta_list):
        example = [self._get_next_example(_test_meta, idx)
                   for _ in range(int(n * self._batches_per_group * data_ratio))]
        example.sort(key=lambda x: x[-1])
        examples_size.append(len(example))
        examples_list.append(example)

    # Interleave the per-speaker lists round-robin so every batch mixes speakers
    # (see the worked illustration below).
    examples_size.sort(reverse=True)
    max_step = examples_size[0] if len(examples_size) > 0 else 0
    num_vec = len(examples_size)
    for index in range(max_step):
        for num in range(num_vec):
            if examples_size[num] > index:
                example = examples_list[num][index]
                examples.append(example)

    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)
    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches, r
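# The two nested loops above implement a round-robin interleave across the per-speaker
# lists, so consecutive examples cycle through speakers. A tiny worked illustration of
# the same idea on hypothetical data:
examples_list = [['a1', 'a2', 'a3'], ['b1', 'b2']]
examples_size = sorted((len(e) for e in examples_list), reverse=True)

interleaved = []
for index in range(examples_size[0] if examples_size else 0):
    for num in range(len(examples_list)):
        if len(examples_list[num]) > index:
            interleaved.append(examples_list[num][index])

print(interleaved)  # ['a1', 'b1', 'a2', 'b2', 'a3']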
def wavenet_synthesize(hparams, checkpoint):
    output_dir = hparams.synth_output_dir
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    run_synthesis(checkpoint_path, output_dir, hparams)
def synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'centaur_' + args.output_dir
    try:
        checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

    run_eval(checkpoint_path, output_dir, hparams, sentences)
def _enqueue_next_train_group(self):
    while not self._coord.should_stop():
        start = time.time()

        # Read a group of examples
        n = self._hparams.batch_size

        # Bucket examples based on similar output sequence length for efficiency
        examples = []       # holds the gathered training examples
        examples_list = []  # one entry per speaker
        examples_size = []  # how many examples each speaker contributed
        data_ratio = 1.0 / len(self._train_meta_list)
        for idx, _train_meta in enumerate(self._train_meta_list):
            if self._start_step < self._hparams.initial_phase_step:  # 'initial_phase_step': 8000
                example = [self._get_next_example(_train_meta, idx)
                           for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))]
            else:
                example = [self._get_next_example(_train_meta, idx)
                           for _ in range(int(n * self._batches_per_group * data_ratio))]
            example.sort(key=lambda x: x[-1])
            examples_size.append(len(example))
            examples_list.append(example)

        # Interleave the per-speaker lists round-robin (same pattern as make_test_batches).
        examples_size.sort(reverse=True)
        max_step = examples_size[0] if len(examples_size) > 0 else 0
        num_vec = len(examples_size)
        for index in range(max_step):
            for num in range(num_vec):
                if examples_size[num] > index:
                    example = examples_list[num][index]
                    examples.append(example)

        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)
        log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))

        for batch in batches:
            feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._start_step += 1
def _enqueue_next_group(self):
    start = time.time()

    # Read a group of examples
    n = self._hparams.batch_size
    r = self._hparams.outputs_per_step
    examples = [self._get_next_example() for i in range(n * _batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)
    log('\nGenerated {} batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))

    for batch in batches:
        feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
        self._session.run(self._enqueue_op, feed_dict=feed_dict)
def make_test_batches(self):
    start = time.time()

    # Read one example for evaluation
    n = 1

    # Test on entire test set (one sample at an evaluation step)
    examples = [self._get_test_groups() for i in range(len(self._test_meta))]
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)
    log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
    return batches
def _enqueue_next_group(self):
    start = time.time()

    # Read a group of samples
    n = self._hparams.batch_size
    examples = [self._get_next_example() for i in range(n * _batches_per_group)]
    batches = [examples[i:i + n] for i in range(0, len(examples), n)]
    np.random.shuffle(batches)
    log('\nGenerated {} batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))

    for batch in batches:
        feed_dict = dict(zip(self._placeholders, _prepare_batch(batch)))
        self._session.run(self._enqueue_op, feed_dict=feed_dict)
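# These feeders are threading.Thread subclasses whose run loop keeps calling the enqueue
# method until the coordinator stops. A minimal sketch of that driver, following the common
# Tacotron feeder pattern; the method name start_in_session is an assumption here.
import threading
import traceback

class FeederThread(threading.Thread):
    def start_in_session(self, session):
        self._session = session
        self.start()  # runs self.run() on a background thread

    def run(self):
        try:
            while not self._coord.should_stop():
                self._enqueue_next_group()
        except Exception as e:
            traceback.print_exc()
            self._coord.request_stop(e)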
def load(self, checkpoint_path, hparams, model_name='WaveNet'):
    log('Constructing model: {}'.format(model_name))
    self._hparams = hparams

    local_cond, global_cond = self._check_conditions()

    self.local_conditions = tf.placeholder(
        tf.float32, shape=(None, None, hparams.num_mfccs),
        name='local_condition_features') if local_cond else None
    self.global_conditions = tf.placeholder(
        tf.int32, shape=(None, 1),
        name='global_condition_features') if global_cond else None
    self.synthesis_length = tf.placeholder(
        tf.int32, shape=(),
        name='synthesis_length') if not local_cond else None
    self.input_lengths = tf.placeholder(
        tf.int32, shape=(1,),
        name='input_lengths') if hparams.wavenet_synth_debug else None
    self.synth_debug = hparams.wavenet_synth_debug

    with tf.variable_scope('WaveNet_model') as scope:
        self.model = create_model(model_name, hparams)
        self.model.initialize(y=None,
                              c=self.local_conditions,
                              g=self.global_conditions,
                              input_lengths=self.input_lengths,
                              synthesis_length=self.synthesis_length,
                              test_inputs=None)

    sh_saver = create_shadow_saver(self.model)

    log('Loading checkpoint: {}'.format(checkpoint_path))
    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())
    load_averaged_model(self.session, sh_saver, checkpoint_path)
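# create_shadow_saver and load_averaged_model are not shown here. In the Tacotron-2/WaveNet
# codebases this pattern restores the exponential-moving-average (shadow) weights at
# synthesis time; a plausible sketch under that assumption (model.ema and model.variables
# are assumed attributes of the WaveNet model):
def create_shadow_saver(model, global_step=None):
    # Map each variable's EMA (shadow) name in the checkpoint to the live variable,
    # so restoring loads the averaged weights instead of the raw ones.
    shadow_dict = {model.ema.average_name(v): v for v in model.variables}
    if global_step is not None:
        shadow_dict['global_step'] = global_step
    return tf.train.Saver(shadow_dict)

def load_averaged_model(sess, sh_saver, checkpoint_path):
    sh_saver.restore(sess, checkpoint_path)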
def run_eval(checkpoint_path, output_dir, hparams, sentences):
    # Create output path if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Set inputs batch wise
    sentences = [sentences[i:i + hparams.synthesis_batch_size]
                 for i in range(0, len(sentences), hparams.synthesis_batch_size)]

    log('Starting Synthesis')
    for i, texts in enumerate(tqdm(sentences)):
        basenames = ['{}_sentence_{}'.format(i, j) for j in range(len(texts))]
        synth.synthesize(texts, basenames, output_dir, None)
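# A hypothetical driver for run_eval, assuming a checkpoint directory, an hparams object,
# and a small sentence list; the paths and names below are placeholders, not values from
# the original project.
if __name__ == '__main__':
    sentences = ['Hello world.', 'The birch canoe slid on the smooth planks.']
    checkpoint = 'logs-centaur/pretrained'  # hypothetical checkpoint directory
    checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
    run_eval(checkpoint_path, 'centaur_output', hparams, sentences)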
def get_path_dict(data_dirs, hparams, config, data_type, n_test=None, rng=np.random.RandomState(123)):
    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:  # e.g. ['datasets/moon\\data']
        paths = glob("{}/*.npz".format(data_dir))
        # e.g. ['datasets/moon\\data\\001.0000.npz', 'datasets/moon\\data\\001.0001.npz', ...]

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            # items = parallel_run(get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True)
            # e.g. [('datasets/moon\\data\\012.0287.npz', 130, 21), ('datasets/moon\\data\\003.0149.npz', 209, 37), ...]
            items = []
            for path in paths:
                item = get_frame(path)
                items.append(item)

            min_n_frame = hparams.min_n_frame      # 5 * 30
            max_n_frame = hparams.max_n_frame - 1  # 5 * 200 - 5

            # The next step drops a lot of data: examples with too few characters are discarded.
            new_items = [(path, n) for path, n, n_tokens in items
                         if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens]
            # e.g. [('datasets/moon\\data\\004.0383.npz', 297), ('datasets/moon\\data\\003.0533.npz', 394), ...]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames, hparams)
            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        # Split into train and test data.
        if data_type == 'train':
            new_paths = new_paths[:-n_test]  # everything except the last n_test (batch_size) paths
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]  # the last n_test paths
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths
        # e.g. ['datasets/moon\\data\\001.0621.npz', 'datasets/moon\\data\\003.0229.npz', ...]

    return path_dict
def initialize(self, inputs, targets=None):
    """Initializes the model for inference. Sets the "output" field.

    Args:
        inputs: int32 tensor with shape [batch_size, time_steps] where time_steps
            is typically the number of words in each input sentence
        targets: int32 tensor with shape [batch_size, num_classes] which represents
            the true labels. Only used at training time.
    """
    with tf.variable_scope('inference') as scope:
        is_training = targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder
        enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training)
        encoder_outputs, encoder_states = bidirectional_LSTM(
            enc_conv_outputs, 'encoder_LSTM', is_training=is_training)

        # Prediction/projection (project the encoder outputs, not the raw character ids)
        projection_shape = [512, 512]
        projected = projection_layers(encoder_outputs,
                                      is_training=is_training,
                                      shape=projection_shape,
                                      activation=tf.nn.relu)

        # Logit layer
        output = logit_layer(projected, logits_dim=hp.num_classes)

        self.inputs = inputs
        self.output = output
        self.targets = targets

        log('Initialized Analyser model. Dimensions: ')
        log('  embedding:    {}'.format(embedded_inputs.shape[-1]))
        log('  enc conv out: {}'.format(enc_conv_outputs.shape[-1]))
        log('  encoder out:  {}'.format(encoder_outputs.shape[-1]))
        log('  output:       {}'.format(output.shape[-1]))
def run_synthesis(checkpoint_path, output_dir, hparams):
    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')
    embed_dir = os.path.join(output_dir, 'embeddings')

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    metadata_filename = os.path.join(hparams.wavenet_synth, 'map.txt')
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = np.array([line.strip().split('|') for line in f])

    # If synth mode is "all" and synth_idx is set, extract just that slice of the metadata.
    if (hparams.synth_mode == "all") and (hparams.synth_idx is not None):
        metadata = metadata[hparams.synth_idx[0]:hparams.synth_idx[1], :]

    # Speaker ids from the trained speakers list
    speaker_ids = metadata[:, 3]
    print("spk_ids " + str(speaker_ids.shape))
    mel_files = metadata[:, 1]
    print("mel_files " + str(mel_files.shape))

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(embed_dir, exist_ok=True)

    synth_dict = load_synthesis_dict()

    for idx, mel_file in enumerate(tqdm(mel_files)):
        print("idx")
        print(idx)
        mel_spectro = [np.load(mel_file)]
        basenames = [os.path.basename(mel_file).replace('.npy', '')]
        speaker_id = [speaker_ids[idx]]
        print("synthesizing {}".format(basenames[0]))

        if hparams.synth_mode == "all":
            if basenames[0].split('-')[1] in synth_dict.keys():
                print("Synthesizing both wav and embedding")
                synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=False)
            else:
                print("Synthesizing embedding only")
                synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=True)
        elif hparams.synth_mode == "embedding":
            print("Synthesizing embedding only")
            synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=True)
        elif hparams.synth_mode == "wav":
            if basenames[0].split('-')[1] in synth_dict.keys():
                synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=False)
        else:
            print("Not supported synth mode.")

    log('synthesized audio waveforms at {}'.format(wav_dir))
def load(self, checkpoint_path, hparams, freezer=False):
    log('Constructing model: Centaur')
    if freezer:
        try:
            checkpoint_path = tf.train.get_checkpoint_state(checkpoint_path).model_checkpoint_path
        except:
            raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint_path))

    # Force the batch size to be known in order to use attention masking in batch synthesis
    self.inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
    self.input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')

    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        self.model = create_model(hparams)
        self.model.initialize(self.inputs, self.input_lengths,
                              is_training=False, is_validation=False, is_prediction=True)
        self.mel_outputs = self.model.decoder_predictions
        self.linear_outputs = self.model.mag_pred
        self.alignments = self.model.alignments
        self.wav_output = self.model.audio
        self.stop_token_prediction = self.model.stop_token_predictions
        self.audio_length = self.model.sequence_lengths

    self._hparams = hparams
    # Pad input sequences with the <pad_token> 0 ( _ )
    self._pad = 0

    log('Loading checkpoint: %s' % checkpoint_path)
    # Memory allocation on the GPUs as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    self.session = tf.Session(config=config)
    self.session.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    saver.restore(self.session, checkpoint_path)
def get_path_dict(data_dirs, hparams, config, data_type, n_test=None, rng=np.random.RandomState(123)):
    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(get_frame, paths,
                                 desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
            new_items = [(path, n) for path, n, n_tokens in items
                         if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                # Drop clips whose filename matches any blacklist entry.
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                new_items = [item for item in new_items
                             if not any(check in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)
            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames, default=0)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames, default=0)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
def get_path_dict(data_dirs, hparams, config, data_type, n_test=None, rng=np.random.RandomState(123)):
    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:  # e.g. ['datasets/moon\\data']
        paths = glob("{}/*.npz".format(data_dir))
        # e.g. ['datasets/moon\\data\\001.0000.npz', 'datasets/moon\\data\\001.0001.npz', ...]

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(get_frame, paths,
                                 desc="filter_by_min_max_frame_batch", parallel=True)
            # e.g. [('datasets/moon\\data\\012.0287.npz', 130, 21), ('datasets/moon\\data\\003.0149.npz', 209, 37), ...]

            min_n_frame = hparams.reduction_factor * hparams.min_iters  # 5 * 30
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor  # 5 * 200 - 5

            # The next step drops a lot of data: examples with too few characters are discarded.
            new_items = [(path, n) for path, n, n_tokens in items
                         if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens]
            # e.g. [('datasets/moon\\data\\004.0383.npz', 297), ('datasets/moon\\data\\003.0533.npz', 394), ...]

            if any(check in data_dir for check in ["son", "yuinna"]):
                # Drop clips whose filename matches any blacklist entry.
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                new_items = [item for item in new_items
                             if not any(check in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames, hparams)
            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        # Split into train and test data.
        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths
        # e.g. ['datasets/moon\\data\\001.0621.npz', 'datasets/moon\\data\\003.0229.npz', ...]

    return path_dict
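# get_frame is referenced by the get_path_dict variants above but not shown. A plausible
# sketch, assuming each .npz stores 'linear' (spectrogram frames) and 'tokens' (character
# ids) arrays as in the carpedm20-style data layout; the key names are assumptions.
import numpy as np

def get_frame(path):
    # Returns (path, number of spectrogram frames, number of text tokens).
    data = np.load(path)
    n_frame = data["linear"].shape[0]
    n_token = data["tokens"].shape[0]
    return (path, n_frame, n_token)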
def __init__(self, coordinator, input_path, hparams):
    super(Feeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._start_step = 0
    self._batches_per_group = 32
    self._train_offset_list = []
    self._test_offset_list = []
    self._pad = 0

    # Load metadata
    input_paths = [input_path]
    if not os.path.exists(os.path.join(input_path, 'train.txt')):
        path_list = os.listdir(input_path)
        input_paths = []
        for name in path_list:
            input_paths.append(os.path.abspath(os.path.join(input_path, name)))
    self.data_dirs = input_paths

    all_hours = 0.0
    metadata_size = 0
    self._metadata_list = []
    for input_path in input_paths:
        with open(os.path.join(input_path, 'train.txt'), encoding='utf-8') as f:
            metadata_vec = []
            for line in f:
                npz_filename, time_steps, mel_frames, text = line.strip().split('|')
                metadata_vec.append([
                    os.path.join(input_path, os.path.basename(npz_filename)),
                    time_steps, mel_frames, text
                ])
            self._metadata_list.append(metadata_vec)
            frame_shift = hparams.hop_size / hparams.sample_rate  # frame shift in seconds
            hours = sum([int(x[2]) for x in metadata_vec]) * frame_shift / 3600
            all_hours += hours
            log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata_vec), hours))
            metadata_size += len(metadata_vec)
            self._train_offset_list.append(0)
            self._test_offset_list.append(0)
    log('Loaded ({:.2f} hours)'.format(all_hours))

    # Train test split
    if hparams.test_size is None:
        assert hparams.test_batches is not None
    test_size = (hparams.test_size if hparams.test_size is not None
                 else hparams.test_batches * hparams.batch_size)

    self._train_meta_list = []
    self._test_meta_list = []
    if self._hparams.symmetric_mels:
        self._pad_value = -self._hparams.max_abs_value
    else:
        self._pad_value = 0.

    # True division here: '//' would floor the ratio to 0.0 for more than one dataset.
    data_ratio = 1.0 / len(self._metadata_list)
    test_size = test_size * data_ratio
    sum_test_meta = 0
    for metadata in self._metadata_list:
        indices = np.arange(len(metadata))
        train_indices, test_indices = train_test_split(
            indices, test_size=test_size, random_state=hparams.data_random_state)

        # Make sure test_indices is a multiple of batch_size, else round down
        len_test_indices = self._round_down(len(test_indices), hparams.batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        _train_meta = list(np.array(metadata)[train_indices])
        _test_meta = list(np.array(metadata)[test_indices])
        sum_test_meta += len(_test_meta)
        self._train_meta_list.append(_train_meta)
        self._test_meta_list.append(_test_meta)

    self.test_steps = sum_test_meta // hparams.batch_size
    if hparams.test_size is None:
        assert hparams.test_batches == self.test_steps

    with tf.device('/cpu:0'):
        # Create placeholders for inputs and targets. Don't specify batch size because we want
        # to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels + hparams.num_freq], 'target_mels'),
            tf.placeholder(tf.int32, (None,), 'target_lengths'),
            tf.placeholder(tf.float32, (None, None), 'stop_tokens'),
        ]
        dtypes = [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32]

        # Create queue for buffering data
        queue = tf.FIFOQueue(8, dtypes, name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.target_feats, self.target_lengths, self.stop_tokens = queue.dequeue()
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.target_feats.set_shape(self._placeholders[2].shape)
        self.target_lengths.set_shape(self._placeholders[3].shape)
        self.stop_tokens.set_shape(self._placeholders[4].shape)

        # Create eval queue for buffering eval data
        eval_queue = tf.FIFOQueue(1, dtypes, name='eval_queue')
        self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
        (self.eval_inputs, self.eval_input_lengths, self.eval_target_feats,
         self.eval_target_lengths, self.eval_stop_tokens) = eval_queue.dequeue()
        self.eval_inputs.set_shape(self._placeholders[0].shape)
        self.eval_input_lengths.set_shape(self._placeholders[1].shape)
        self.eval_target_feats.set_shape(self._placeholders[2].shape)
        self.eval_target_lengths.set_shape(self._placeholders[3].shape)
        self.eval_stop_tokens.set_shape(self._placeholders[4].shape)
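# _round_down is used above but not shown. A minimal sketch of the obvious implementation
# (assumed): the largest multiple of `multiple` that is <= x, e.g. _round_down(37, 32) == 32.
def _round_down(self, x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x - remainder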
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training2 = linear_targets is not None  # This also becomes True at test time; is that intended???
    is_training = not rnn_decoder_test_mode

    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings (256)
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation:
            # <PAD> (id 0) gets a fixed all-zero embedding that is never trained, i.e. the
            # first row of the variable created by get_variable above is not used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")  # 'enc_prenet_sizes': [f(256), f(128)]
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    deep_dense = lambda x, dim: tf.layers.dense(
                        x, dim, activation=tf.nn.softsign)  # softsign: x / (abs(x) + 1)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                # The 'simple' model feeds speaker_embed into DecoderPrenetWrapper and
                # ConcatOutputAndAttentionWrapper, where it is concatenated.
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(" [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:  # self.num_speakers == 1
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None  # init state of the bidirectional GRU
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')  # 'enc_prenet_sizes': [f(256), f(128)], dropout_prob = 0.5

        # ==> (N, T_in, 128); enc_rnn_size = 128
        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None], name="manual_alignments")

        # single: attention_size = 128
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_mon_norm_hccho':
            attention_mechanism = BahdanauMonotonicAttention_hccho(
                hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
        # carpedm20 reimplemented AttentionWrapper from the TensorFlow source, whereas
        # Keith Ito simply used the stock TensorFlow AttentionWrapper.
        # Note output_attention=False and that attention_layer_size is not given,
        # so the attention output is the raw context vector.
        attention_cell = AttentionWrapper(
            GRUCell(hp.attention_state_size),
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        dec_prenet_outputs = DecoderPrenetWrapper(
            attention_cell, speaker_embed, is_training,
            hp.dec_prenet_sizes, hp.dropout_prob)  # dec_prenet_sizes = [f(256), f(128)]

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        # From the AttentionWrapperState members (attention, cell_state, ...) passed on to
        # the next cell, concatenate attention with the output. Since the output equals the
        # cell_state, the concat is [output(=cell_state) | attention].
        concat_cell = ConcatOutputAndAttentionWrapper(
            dec_prenet_outputs,
            embed_to_concat=speaker_embed)  # new output = concat(output, attention, speaker_embed)

        # Decoder (layers specified bottom to top): dec_rnn_size = 256
        # OutputProjectionWrapper does not seem to be mentioned in the paper...
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        # Could be modified to also emit a stop token: (hp.num_mels + 1) * hp.reduction_factor???
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)

        # Calling zero_state here also includes the initial value already supplied
        # to the AttentionWrapper above.
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state
            #   (already applied via the AttentionWrapper's initial_cell_state)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training2:
            # rnn_decoder_test_mode is True in test mode, False in train mode.
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                rnn_decoder_test_mode)  # inputs is only used to compute batch_size
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # max_iters = 200

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        # The MultiRNNCell has 3 layers, so final_decoder_state is a tuple of length 3
        # ==> use final_decoder_state[0].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(),
            [1, 2, 0])  # [batch_size, text length (encoder), target length (decoder)]

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
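# get_embed builds the per-speaker initial-state vectors above but is not defined in this
# section. A sketch consistent with its call sites (speaker id, number of speakers,
# embedding dimension, variable name); treat the body as an assumption rather than the
# original implementation.
def get_embed(inputs, num_inputs, embed_size, name):
    # One learned vector per speaker, selected by speaker id.
    embed_table = tf.get_variable(name, [num_inputs, embed_size], dtype=tf.float32)
    return tf.nn.embedding_lookup(embed_table, inputs)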
def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
):
    is_training = linear_targets is not None
    self.is_randomly_initialized = is_randomly_initialized

    with tf.variable_scope('inference') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings
        char_embed_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            if hp.speaker_embedding_size != 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            if hp.model_type == 'deepvoice':
                if hp.speaker_embedding_size == 1:
                    before_highway = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_prenet_sizes[-1], "before_highway")
                    encoder_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.enc_rnn_size * 2, "encoder_rnn_init_state")
                    attention_rnn_init_state = get_embed(
                        speaker_id, self.num_speakers,
                        hp.attention_state_size, "attention_rnn_init_state")
                    decoder_rnn_init_states = [
                        get_embed(speaker_id, self.num_speakers, hp.dec_rnn_size,
                                  "decoder_rnn_init_states{}".format(idx + 1))
                        for idx in range(hp.dec_layer_num)
                    ]
                else:
                    def deep_dense(x, dim):
                        return tf.layers.dense(x, dim, activation=tf.nn.softsign)

                    before_highway = deep_dense(speaker_embed, hp.enc_prenet_sizes[-1])
                    encoder_rnn_init_state = deep_dense(speaker_embed, hp.enc_rnn_size * 2)
                    attention_rnn_init_state = deep_dense(speaker_embed, hp.attention_state_size)
                    decoder_rnn_init_states = [
                        deep_dense(speaker_embed, hp.dec_rnn_size)
                        for _ in range(hp.dec_layer_num)
                    ]

                speaker_embed = None  # deepvoice does not use speaker_embed directly
            elif hp.model_type == 'simple':
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None
            else:
                raise Exception(" [!] Unknown multi-speaker model type: {}".format(hp.model_type))
        else:
            speaker_embed = None
            before_highway = None
            encoder_rnn_init_state = None
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

        ##############
        # Encoder
        ##############

        # [N, T_in, enc_prenet_sizes[-1]]
        prenet_outputs = prenet(
            char_embedded_inputs, is_training,
            hp.enc_prenet_sizes, hp.dropout_prob,
            scope='prenet')

        encoder_outputs = cbhg(
            prenet_outputs, input_lengths, is_training,
            hp.enc_bank_size, hp.enc_bank_channel_size,
            hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size,
            hp.enc_proj_sizes, hp.enc_proj_width,
            scope="encoder_cbhg",
            before_highway=before_highway,
            encoder_rnn_init_state=encoder_rnn_init_state)

        ##############
        # Attention
        ##############

        # For manual control of attention
        self.is_manual_attention = tf.placeholder(tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(tf.float32, shape=[None, None, None], name="manual_alignments")

        dec_prenet_outputs = DecoderPrenetWrapper(
            GRUCell(hp.attention_state_size), speaker_embed,
            is_training, hp.dec_prenet_sizes, hp.dropout_prob)

        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(hp.attention_size, encoder_outputs)
        elif hp.attention_type.startswith('ntm2'):
            shift_width = int(hp.attention_type.split('-')[-1])
            attention_mechanism = NTMAttention2(hp.attention_size, encoder_outputs, shift_width=shift_width)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        attention_cell = AttentionWrapper(
            dec_prenet_outputs,
            attention_mechanism,
            self.is_manual_attention,
            self.manual_alignments,
            initial_cell_state=attention_rnn_init_state,
            alignment_history=True,
            output_attention=False)

        # Concatenate attention context vector and RNN cell output into a 512D vector.
        # [N, T_in, attention_size + attention_state_size]
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell, embed_to_concat=speaker_embed)

        # Decoder (layers specified bottom to top):
        cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
        for _ in range(hp.dec_layer_num):
            cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

        # [N, T_in, 256]
        decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.reduction_factor)

        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "deepvoice":
            # decoder_init_state[0] : AttentionWrapperState
            #   = cell_state + attention + time + alignments + alignment_history
            # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1 != shape2:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                decoder_init_state[idx + 1] = cell

            decoder_init_state = tuple(decoder_init_state)

        if is_training:
            helper = TacoTrainingHelper(
                inputs, mel_targets, hp.num_mels, hp.reduction_factor, rnn_decoder_test_mode)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)

        # [N, T_out, M]
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])

        # Add post-processing CBHG:
        # [N, T_out, 256]
        # post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        if speaker_embed is not None and hp.model_type == 'simple':
            expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
            tiled_speaker_embedding = tf.tile(expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

            # [N, T_out, 256 + alpha]
            post_outputs = tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % char_embedded_inputs.shape[-1])
        if speaker_embed is not None:
            log('  speaker embedding:       %d' % speaker_embed.shape[-1])
        else:
            log('  speaker embedding:       None')
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
def initialize(self,
               inputs,
               input_lengths,
               num_speakers,
               speaker_id=None,
               mel_targets=None,
               linear_targets=None,
               is_training=False,
               loss_coeff=None,
               stop_token_targets=None):
    with tf.variable_scope('Embedding') as scope:
        hp = self._hparams
        batch_size = tf.shape(inputs)[0]

        # Embeddings (256)
        char_embed_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))

        zero_pad = True
        if zero_pad:
            # Logic borrowed from a transformer implementation:
            # <PAD> (id 0) gets a fixed all-zero embedding that is never trained, i.e. the
            # first row of the variable created by get_variable above is not used.
            char_embed_table = tf.concat(
                (tf.zeros(shape=[1, hp.embedding_size]), char_embed_table[1:, :]), 0)

        # [N, T_in, embedding_size]
        char_embedded_inputs = tf.nn.embedding_lookup(char_embed_table, inputs)

        self.num_speakers = num_speakers
        if self.num_speakers > 1:
            speaker_embed_table = tf.get_variable(
                'speaker_embedding',
                [self.num_speakers, hp.speaker_embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, speaker_embedding_size]
            speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id)

            deep_dense = lambda x, dim, name: tf.layers.dense(
                x, dim, activation=tf.nn.softsign, name=name)  # softsign: x / (abs(x) + 1)

            encoder_rnn_init_state = deep_dense(
                speaker_embed, hp.encoder_lstm_units * 4, 'encoder_init_dense')  # hp.encoder_lstm_units = 256
            decoder_rnn_init_states = [
                deep_dense(speaker_embed, hp.decoder_lstm_units * 2, 'decoder_init_dense_{}'.format(i))
                for i in range(hp.decoder_layers)
            ]  # hp.decoder_lstm_units = 1024

            speaker_embed = None
        else:  # self.num_speakers == 1
            speaker_embed = None
            encoder_rnn_init_state = None  # init state of the bidirectional LSTM
            attention_rnn_init_state = None
            decoder_rnn_init_states = None

    with tf.variable_scope('Encoder') as scope:
        ##############
        # Encoder
        ##############
        x = char_embedded_inputs
        for i in range(hp.enc_conv_num_layers):
            x = tf.layers.conv1d(x,
                                 filters=hp.enc_conv_channels,
                                 kernel_size=hp.enc_conv_kernel_size,
                                 padding='same',
                                 activation=tf.nn.relu,
                                 name='Encoder_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x,
                                  rate=hp.dropout_prob,
                                  training=is_training,
                                  name='dropout_{}'.format(i))

        if encoder_rnn_init_state is not None:
            initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = \
                tf.split(encoder_rnn_init_state, 4, 1)
            initial_state_fw = LSTMStateTuple(initial_state_fw_c, initial_state_fw_h)
            initial_state_bw = LSTMStateTuple(initial_state_bw_c, initial_state_bw_h)
        else:  # single-speaker mode
            initial_state_fw, initial_state_bw = None, None

        cell_fw = ZoneoutLSTMCell(hp.encoder_lstm_units,
                                  is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_fw_LSTM')
        cell_bw = ZoneoutLSTMCell(hp.encoder_lstm_units,
                                  is_training,
                                  zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                  zoneout_factor_output=hp.tacotron_zoneout_rate,
                                  name='encoder_bw_LSTM')
        encoder_conv_output = x
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, encoder_conv_output,
            sequence_length=input_lengths,
            initial_state_fw=initial_state_fw,
            initial_state_bw=initial_state_bw,
            dtype=tf.float32)

        # encoder_outputs = [N, T, 2 * encoder_lstm_units] = [N, T, 512]
        encoder_outputs = tf.concat(outputs, axis=2)  # Concat and return forward + backward outputs

    with tf.variable_scope('Decoder') as scope:
        ##############
        # Attention
        ##############
        if hp.attention_type == 'bah_mon':
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=False)
        elif hp.attention_type == 'bah_mon_norm':  # added by hccho
            attention_mechanism = BahdanauMonotonicAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_size, encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
        elif hp.attention_type == 'gmm':  # GMM Attention
            attention_mechanism = GmmAttention(
                hp.attention_size, memory=encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah_norm':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, normalize=True)
        elif hp.attention_type == 'luong_scaled':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths, scale=True)
        elif hp.attention_type == 'luong':
            attention_mechanism = LuongAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        elif hp.attention_type == 'bah':
            attention_mechanism = BahdanauAttention(
                hp.attention_size, encoder_outputs,
                memory_sequence_length=input_lengths)
        else:
            raise Exception(" [!] Unknown attention type: {}".format(hp.attention_type))

        decoder_lstm = [
            ZoneoutLSTMCell(hp.decoder_lstm_units,
                            is_training,
                            zoneout_factor_cell=hp.tacotron_zoneout_rate,
                            zoneout_factor_output=hp.tacotron_zoneout_rate,
                            name='decoder_LSTM_{}'.format(i + 1))
            for i in range(hp.decoder_layers)
        ]
        decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True)

        # Calling zero_state here also includes the initial values already supplied above.
        decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32)

        if hp.model_type == "multi-speaker":
            decoder_init_state = list(decoder_init_state)

            for idx, cell in enumerate(decoder_rnn_init_states):
                shape1 = decoder_init_state[idx][0].get_shape().as_list()
                shape2 = cell.get_shape().as_list()
                if shape1[1] * 2 != shape2[1]:
                    raise Exception(" [!] Shape {} and {} should be equal".format(shape1, shape2))
                c, h = tf.split(cell, 2, 1)
                decoder_init_state[idx] = LSTMStateTuple(c, h)

            decoder_init_state = tuple(decoder_init_state)

        # Note output_attention=False and that attention_layer_size is not given,
        # so the attention output is the raw context vector.
        attention_cell = AttentionWrapper(
            decoder_lstm,
            attention_mechanism,
            initial_cell_state=decoder_init_state,
            alignment_history=True,
            output_attention=False)

        # attention_state_size = 256
        # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
        dec_prenet_outputs = DecoderWrapper(
            attention_cell, is_training, hp.dec_prenet_sizes,
            hp.dropout_prob, hp.inference_prenet_dropout)

        dec_outputs_cell = OutputProjectionWrapper(
            dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

        if is_training:
            helper = TacoTrainingHelper(mel_targets, hp.num_mels, hp.reduction_factor)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.reduction_factor)

        decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = \
            tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters = 200

        decoder_mel_outputs = tf.reshape(
            decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
            [batch_size, -1, hp.num_mels])  # [N, iters, 400] -> [N, 5*iters, 80]
        stop_token_outputs = tf.reshape(
            decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
            [batch_size, -1])  # [N, iters]

        # Postnet
        x = decoder_mel_outputs
        for i in range(hp.postnet_num_layers):
            activation = tf.nn.tanh if i != (hp.postnet_num_layers - 1) else None
            x = tf.layers.conv1d(x,
                                 filters=hp.postnet_channels,
                                 kernel_size=hp.postnet_kernel_size,
                                 padding='same',
                                 activation=activation,
                                 name='Postnet_{}'.format(i))
            x = tf.layers.batch_normalization(x, training=is_training)
            x = tf.layers.dropout(x,
                                  rate=hp.dropout_prob,
                                  training=is_training,
                                  name='Postnet_dropout_{}'.format(i))

        residual = tf.layers.dense(x, hp.num_mels, name='residual_projection')
        mel_outputs = decoder_mel_outputs + residual

        # Add post-processing CBHG:
        # mel_outputs: (N, T, num_mels)
        post_outputs = cbhg(
            mel_outputs, None, is_training,
            hp.post_bank_size, hp.post_bank_channel_size,
            hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size,
            hp.post_proj_sizes, hp.post_proj_width,
            scope='post_cbhg')

        linear_outputs = tf.layers.dense(
            post_outputs, hp.num_freq,
            name='linear_spectrogram_projection')  # [N, T_out, F(1025)]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(
            final_decoder_state.alignment_history.stack(),
            [1, 2, 0])  # [batch_size, text length (encoder), target length (decoder)]

        self.inputs = inputs
        self.speaker_id = speaker_id
        self.input_lengths = input_lengths
        self.loss_coeff = loss_coeff
        self.decoder_mel_outputs = decoder_mel_outputs
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.final_decoder_state = final_decoder_state
        self.stop_token_targets = stop_token_targets
        self.stop_token_outputs = stop_token_outputs
        self.all_vars = tf.trainable_variables()

        log('=' * 40)
        log(' model_type: %s' % hp.model_type)
        log('=' * 40)

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:                      %d' % char_embedded_inputs.shape[-1])
        log('  encoder conv out:               %d' % encoder_conv_output.shape[-1])
        log('  encoder out:                    %d' % encoder_outputs.shape[-1])
        log('  attention out:                  %d' % attention_cell.output_size)
        log('  decoder prenet lstm concat out: %d' % dec_prenet_outputs.output_size)
        log('  decoder cell out:               %d' % dec_outputs_cell.output_size)
        log('  decoder out (%d frames):        %d' % (hp.reduction_factor, decoder_outputs.shape[-1]))
        log('  decoder mel out:                %d' % decoder_mel_outputs.shape[-1])
        log('  mel out:                        %d' % mel_outputs.shape[-1])
        log('  postnet out:                    %d' % post_outputs.shape[-1])
        log('  linear out:                     %d' % linear_outputs.shape[-1])
        log('  Tacotron Parameters             {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
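# ZoneoutLSTMCell is used above but not defined in this section. A minimal sketch of
# zoneout applied to an LSTMCell, following the Zoneout paper (Krueger et al., 2016); the
# constructor signature matches the call sites above, but the body is an assumption, not
# the original implementation.
class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell):
    """Wraps an LSTMCell so that, at each step, some units keep their previous state."""

    def __init__(self, num_units, is_training,
                 zoneout_factor_cell=0., zoneout_factor_output=0., name=None):
        super(ZoneoutLSTMCell, self).__init__(name=name)
        self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True)
        self._zoneout_cell = zoneout_factor_cell
        self._zoneout_outputs = zoneout_factor_output
        self._is_training = is_training

    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        output, new_state = self._cell(inputs, state, scope)
        (prev_c, prev_h), (new_c, new_h) = state, new_state
        if self._is_training:
            # Randomly keep the previous value for a fraction of the units; dropout's
            # 1/keep_prob scaling is undone by the (1 - factor) multiplier.
            c = (1 - self._zoneout_cell) * tf.nn.dropout(
                new_c - prev_c, keep_prob=1 - self._zoneout_cell) + prev_c
            h = (1 - self._zoneout_outputs) * tf.nn.dropout(
                new_h - prev_h, keep_prob=1 - self._zoneout_outputs) + prev_h
        else:
            # At inference, interpolate deterministically (expected value of zoneout).
            c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
            h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
        return h, tf.nn.rnn_cell.LSTMStateTuple(c, h)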
def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
               mel_targets=None, linear_targets=None, loss_coeff=None,
               rnn_decoder_test_mode=False, is_randomly_initialized=False):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
        inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the
            number of steps in the input time series, and values are character IDs.
        input_lengths: int32 Tensor with shape [N] where N is batch size and values
            are the lengths of each sequence in inputs.
        mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
            T_out is the number of steps in the output time series, M is num_mels,
            and values are entries in the mel spectrogram. Only needed for training.
        linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch size,
            T_out is the number of steps in the output time series, F is num_freq,
            and values are entries in the linear spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        self.batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

        # Encoder
        encoder_outputs = conv_and_lstm(
            embedded_inputs, input_lengths,
            conv_layers=hp.encoder_conv_layers,
            conv_width=hp.encoder_conv_width,
            conv_channels=hp.encoder_conv_channels,
            lstm_units=hp.encoder_lstm_units,
            is_training=is_training,
            scope='encoder')  # [N, T_in, 512]

        # Attention: placeholders for manual control of attention at synthesis time.
        self.is_manual_attention = tf.placeholder(
            tf.bool, shape=(), name='is_manual_attention')
        self.manual_alignments = tf.placeholder(
            tf.float32, shape=[None, None, None], name='manual_alignments')

        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
            LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, 128]

        # Concatenate attention context vector and RNN cell output into a 512-D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell([
            concat_cell,
            LSTMBlockCell(hp.decoder_lstm_units),
            LSTMBlockCell(hp.decoder_lstm_units)
        ], state_is_tuple=True)  # [N, T_in, 1024]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)

        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step)

        decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)

        (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(output_cell, helper, decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape outputs to be one output per entry: [N, T_out, M]
        decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels])

        # Postnet: predicts a residual
        postnet_outputs = postnet(
            decoder_outputs,
            layers=hp.postnet_conv_layers,
            conv_width=hp.postnet_conv_width,
            channels=hp.postnet_conv_channels,
            is_training=is_training)
        mel_outputs = decoder_outputs + postnet_outputs

        # Convert to linear using an architecture similar to the encoder:
        expand_outputs = conv_and_lstm(
            mel_outputs, None,
            conv_layers=hp.expand_conv_layers,
            conv_width=hp.expand_conv_width,
            conv_channels=hp.expand_conv_channels,
            lstm_units=hp.expand_lstm_units,
            is_training=is_training,
            scope='expand')  # [N, T_out, 512]
        linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab alignments from the final decoder state:
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_outputs = decoder_outputs
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  attention out:           %d' % attention_cell.output_size)
        log('  concat attn & out:       %d' % concat_cell.output_size)
        log('  decoder cell out:        %d' % decoder_cell.output_size)
        log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  expand out:              %d' % expand_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
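The alignment transpose at the end is the same in every variant collected here: alignment_history stacks the per-decoder-step attention weights as [T_dec, N, T_enc], and the [1, 2, 0] transpose turns that into the [N, T_enc, T_dec] matrix usually plotted as an attention heatmap. A NumPy sketch with made-up sizes:

import numpy as np

# Hypothetical sizes: T_dec=4 decoder steps, N=2 examples, T_enc=6 encoder steps.
T_dec, N, T_enc = 4, 2, 6
history = np.random.rand(T_dec, N, T_enc)   # what alignment_history.stack() yields

alignments = history.transpose(1, 2, 0)     # -> [N, T_enc, T_dec]
assert alignments.shape == (N, T_enc, T_dec)
# alignments[b] is one example's attention matrix: input positions vs. output steps.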
def __init__(self, data_dirs, hparams, config, batches_per_group, data_type, batch_size):
    super(DataFeederTacotron, self).__init__()
    self._hp = hparams
    self._step = 0
    self._offset = defaultdict(lambda: 2)
    self._batches_per_group = batches_per_group
    self.rng = np.random.RandomState(config.random_seed)  # random number generator
    self.data_type = data_type
    self.batch_size = batch_size

    self.min_tokens = hparams['min_tokens']  # 30
    self.min_n_frame = hparams['reduction_factor'] * hparams['min_iters']  # 5*30
    self.max_n_frame = hparams['reduction_factor'] * hparams['max_iters'] - hparams['reduction_factor']  # 5*200 - 5

    # Load metadata:
    self.path_dict = get_path_dict(data_dirs, self._hp, config, self.data_type,
                                   n_test=self.batch_size, rng=self.rng)  # data_dirs: ['datasets/moon\\data']

    self.data_dirs = list(self.path_dict.keys())  # ['datasets/moon\\data']
    self.data_dir_to_id = {
        data_dir: idx for idx, data_dir in enumerate(self.data_dirs)
    }  # {'datasets/moon\\data': 0}

    data_weight = {data_dir: 1. for data_dir in self.data_dirs}  # {'datasets/moon\\data': 1.0}

    weight_Z = sum(data_weight.values())
    self.data_ratio = {
        data_dir: weight / weight_Z for data_dir, weight in data_weight.items()
    }
    self.is_multi_speaker = len(self.data_dirs) > 1

    log("=" * 40)
    log(pprint.pformat(self.data_ratio, indent=4))
    log("=" * 40)

    if self.data_type == 'test':
        examples = []
        while True:
            for data_dir in self.data_dirs:
                examples.append(self._get_next_example(data_dir))
                # print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                if len(examples) >= self.batch_size:
                    break
            if len(examples) >= self.batch_size:
                break
        # At test time the same examples are reused over and over.
        self.static_batches = [examples for _ in range(self._batches_per_group)]  # [examples, examples, ..., examples]
    else:
        self.static_batches = None

    # Read a group of examples:
    n = self.batch_size  # 32
    r = self._hp['reduction_factor']  # 4 or 5; used above to compute min_n_frame / max_n_frame
    start = time.time()

    if self.static_batches is not None:
        # 'test' uses static_batches, which were already built above.
        batches = self.static_batches
    else:
        # 'train'
        examples = []
        for data_dir in self.data_dirs:
            if self._hp['initial_data_greedy']:
                if self._step < self._hp['initial_phase_step'] and \
                        any("krbook" in data_dir for data_dir in self.data_dirs):
                    data_dir = [data_dir for data_dir in self.data_dirs if "krbook" in data_dir][0]

            if self._step < self._hp['initial_phase_step']:  # 'initial_phase_step': 8000
                # Build _batches_per_group (8 or 32) batches' worth of data,
                # split evenly across the data dirs; each batch has size 2 or 32.
                example = [self._get_next_example(data_dir)
                           for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))]
            else:
                example = [self._get_next_example(data_dir)
                           for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))]
            examples.extend(example)

        # The sort key is the last tuple element, i.e. sort by len(linear_target).
        examples.sort(key=lambda x: x[-1])

    self.len = np.shape(examples)[0]

    examples_len = len(examples)
    self.input_data = [examples[i][0] for i in range(examples_len)]
    self.loss_coeff = [examples[i][1] for i in range(examples_len)]
    self.mel_target = [examples[i][2] for i in range(examples_len)]
    self.linear_target = [examples[i][3] for i in range(examples_len)]
    self.stop_token_target = [examples[i][4] for i in range(examples_len)]

    if self.is_multi_speaker:
        self.id = [examples[i][5] for i in range(examples_len)]
        self.linear_target_len = [examples[i][6] for i in range(examples_len)]
    else:
        self.linear_target_len = [examples[i][5] for i in range(examples_len)]

    log('Generated %d batches of size %d in %.03f sec' % (len(examples) // n, n, time.time() - start))
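The sort-then-chunk-then-shuffle pattern in this feeder (and in _enqueue_next_group) keeps similarly long utterances in the same batch, which minimizes padding waste, while still randomizing batch order across a group. A self-contained sketch of just that strategy, with hypothetical (tokens, length) tuples and the stdlib random module standing in for np.random.RandomState:

import random

def make_batches(examples, batch_size, rng):
    # Sort a group of examples by target length (the last tuple element),
    # cut it into contiguous batches, then shuffle the batches so that
    # similarly sized examples share a batch but batch order stays random.
    examples = sorted(examples, key=lambda x: x[-1])
    batches = [examples[i:i + batch_size] for i in range(0, len(examples), batch_size)]
    rng.shuffle(batches)
    return batches

rng = random.Random(0)
fake_examples = [('tokens%d' % i, rng.randint(50, 400)) for i in range(8)]
for batch in make_batches(fake_examples, batch_size=4, rng=rng):
    print([length for _, length in batch])  # lengths inside a batch are close together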
def initialize(self, inputs, input_lengths, mel_targets=None):
    """Initializes the model for inference.

    Sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the
          number of steps in the input time series, and values are character IDs.
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values
          are the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
          T_out is the number of steps in the output time series, M is num_mels, and
          values are entries in the mel spectrogram. Only needed for training.
    """
    with tf.variable_scope('inference') as scope:
        is_training = mel_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder
        enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training)

        # The paper doesn't specify what to do with the final encoder state, so we
        # send it to the attention mechanism as the source state
        # (a direct link between source and target cells).
        encoder_outputs, encoder_states = bidirectional_LSTM(
            enc_conv_outputs, input_lengths, 'encoder_LSTM', is_training=is_training)

        # DecoderWrapper
        decoder_cell = TacotronDecoderWrapper(
            unidirectional_LSTM(is_training, layers=hp.num_decoder_layers, size=512),
            is_training)

        # AttentionWrapper on top of TacotronDecoderWrapper
        attention_decoder = AttentionWrapper(
            decoder_cell,
            LocationBasedAttention(hp.attention_dim, encoder_outputs),
            alignment_history=True,
            output_attention=False,
            name='attention_decoder_wrapper')

        # Pass the encoder final state to each of the num_decoder_layers decoder layers.
        decoder_init_state = attention_decoder.zero_state(
            batch_size=batch_size, dtype=tf.float32).clone(
                cell_state=tuple(encoder_states for _ in range(hp.num_decoder_layers)))

        # Define the helper for our decoder
        if is_training:
            helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
        max_iterations = None if is_training else hp.max_iters

        # Decode
        (decoder_output, _), final_decoder_state, self.stop_error = dynamic_decode(
            CustomDecoder(attention_decoder, helper, decoder_init_state),
            impute_finished=True,
            maximum_iterations=max_iterations)

        # Compute residual using post-net
        residual = postnet(decoder_output, is_training)

        # Project residual to the same dimension as the mel spectrogram
        proj_dim = hp.num_mels
        projected_residual = projection(residual, shape=proj_dim, scope='residual_projection')

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               {}'.format(embedded_inputs.shape))
        log('  enc conv out:            {}'.format(enc_conv_outputs.shape))
        log('  encoder out:             {}'.format(encoder_outputs.shape))
        log('  decoder out:             {}'.format(decoder_output.shape))
        log('  residual out:            {}'.format(residual.shape))
        log('  projected residual out:  {}'.format(projected_residual.shape))
        log('  mel out:                 {}'.format(mel_outputs.shape))
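The zero_state(...).clone(cell_state=...) line deserves a note: the single encoder final state is replicated once per decoder layer, because the multi-layer decoder expects one state per layer. A tuple-level sketch of that replication, where the strings stand in for LSTMStateTuples and num_decoder_layers=2 is an assumed value:

# The strings below stand in for LSTMStateTuples; num_decoder_layers=2 is assumed.
num_decoder_layers = 2
encoder_states = ('c_final', 'h_final')

cell_state = tuple(encoder_states for _ in range(num_decoder_layers))
assert len(cell_state) == num_decoder_layers
assert all(layer_state is encoder_states for layer_state in cell_state)  # one copy per decoder layer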
def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    :param inputs: [N, T_in], where N is batch_size and T_in is the number of steps
        in the input time series; values are character IDs.
    :param input_lengths: [N], where N is batch_size; values are the lengths of each
        input sequence.
    :param mel_targets: [N, T_out, M], where N is batch_size, T_out is the number of
        steps in the output sequence, and M is num_mels; values are mel-spectrogram
        entries. Only needed for training.
    :param linear_targets: [N, T_out, F], where N is batch_size, T_out is the number
        of steps in the output sequence, and F is num_freq; values are linear-
        spectrogram entries. Only needed for training.
    :return:
    '''
    with tf.variable_scope('inference') as scope:
        is_training = linear_targets is not None
        batch_size = tf.shape(inputs)[0]
        hp = self._hparam

        embedding_table = tf.get_variable(
            'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=0.5))
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=256]

        # Encoder
        prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depth)  # [N, T_in, prenet_depths[-1]=128]
        encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training,
                                       hp.encoder_depth)  # [N, T_in, encoder_depth=256]

        # Attention
        attention_mechanism = LocationSensitiveAttention(hp.attention_depth, encoder_outputs)

        # Decoder
        multi_rnn_cell = MultiRNNCell([
            ResidualWrapper(GRUCell(hp.decoder_depth)),
            ResidualWrapper(GRUCell(hp.decoder_depth))
        ], state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step)
        decoder_cell = TacotronDecoderWrapper(is_training, attention_mechanism, multi_rnn_cell)

        if is_training:
            helper = TacoTrainHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_steps)
        else:
            helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_steps)

        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
            BasicDecoder(
                OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_steps),
                helper,
                decoder_init_state),
            maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

        # Reshape the outputs so each entry corresponds to one output frame
        mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

        post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training,
                                 hp.postnet_depth)  # [N, T_out, postnet_depth=256]
        linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

        # Grab the alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_outputs = mel_outputs
        self.linear_outputs = linear_outputs
        self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  prenet out:              %d' % prenet_outputs.shape[-1])
        log('  encoder out:             %d' % encoder_outputs.shape[-1])
        log('  decoder out (%d frames): %d' % (hp.outputs_per_steps, decoder_outputs.shape[-1]))
        log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
        log('  postnet out:             %d' % post_outputs.shape[-1])
        log('  linear out:              %d' % linear_outputs.shape[-1])
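maximum_iterations bounds decoder iterations, not output frames: each of the at most max_iters iterations emits outputs_per_steps frames, so the mel output has at most max_iters * r frames. A tiny sketch with illustrative numbers matching the comments used elsewhere in this document:

max_iters, r = 200, 5            # stand-ins for hp.max_iters and hp.outputs_per_steps
max_frames = max_iters * r       # upper bound on T_out
print('mel output capped at [N, %d, num_mels]' % max_frames)  # 1000 frames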
def initialize(self, inputs, input_lengths, target_feats=None, targets_length=None,
               targets_stop_token=None, is_training=False, is_validation=False,
               is_prediction=False):
    self.is_validation = is_validation
    self.is_prediction = is_prediction
    self.is_training = is_training

    with tf.variable_scope("centaur_encoder"):
        encoder_outputs, attention_bias = CentaurEncoder(
            is_training=is_training,
            src_vocab_size=self.params.src_vocab_size,
            embedding_size=self.params.embedding_size,
            output_size=self.params.output_size,
            conv_layers_num=self.params.encoder_conv_layers_num,
            cnn_dropout_prob=self.params.cnn_dropout_prob)(inputs)

    with tf.variable_scope("centaur_decoder"):
        (decoder_predictions, post_net_predictions, alignments, stop_token_logits,
         sequence_lengths, mag_pred, stop_token_predictions) = CentaurDecoder(
            num_mels=self.params.num_mels,
            num_freq=self.params.num_freq,
            conv_layers_num=self.params.decoder_conv_layers_num,
            reduction_factor=self.params.reduction_factor,
            decoder_hidden_size=self.params.decoder_hidden_size,
            prenet_hidden_size=self.params.prenet_hidden_size,
            attention_layers=self.params.attention_layers,
            attention_heads=self.params.attention_heads,
            window_size=self.params.window_size,
            attention_cnn_dropout_prob=self.params.attention_cnn_dropout_prob,
            kernel_size=self.params.kernel_size,
            is_training=is_training,
            is_prediction=is_prediction,
            is_validation=is_validation)(
                targets=target_feats,
                targets_length=targets_length,
                encoder_outputs=encoder_outputs,
                attention_bias=attention_bias,
                batch_size_per_gpu=self.params.batch_size,
                duration_max=self.params.max_iters)

    self.encoder_outputs = encoder_outputs
    self.alignments = alignments
    self.decoder_predictions = decoder_predictions
    self.post_net_predictions = post_net_predictions
    self.stop_token_predictions = stop_token_predictions
    self.mag_pred = mag_pred
    self.sequence_lengths = sequence_lengths

    self.inputs = inputs
    self.input_lengths = input_lengths
    self.target_feats = target_feats
    self.targets_stop_token = targets_stop_token
    self.targets_length = targets_length
    self.all_vars = tf.trainable_variables()

    log('Initialized Centaur model. Dimensions (? = dynamic shape): ')
    log('  Train mode:        {}'.format(is_training))
    log('  Input:             {}'.format(inputs.shape))
    log('  encoder out:       {}'.format(encoder_outputs.shape))
    log('  mel out:           {}'.format(decoder_predictions.shape))
    log('  linear out:        {}'.format(mag_pred.shape))
    log('  <stop_token> out:  {}'.format(stop_token_predictions.shape))

    # 1_000_000 is causing syntax problems for some people?! Python please :)
    log('  Centaur Parameters {:.3f} Million.'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
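The parameter count logged at the end is just the sum of element counts over all trainable variables. A sketch with hypothetical variable shapes standing in for tf.trainable_variables():

import numpy as np

# Hypothetical shapes in place of [v.get_shape().as_list() for v in tf.trainable_variables()].
fake_var_shapes = [(512, 512), (80, 512), (1024,)]
n_params = int(np.sum([np.prod(shape) for shape in fake_var_shapes]))
print('Parameters {:.3f} Million.'.format(n_params / 1000000))  # 0.304 Million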
def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False):
    """Initializes the model for inference.

    Sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the
          number of steps in the input time series, and values are character IDs.
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values
          are the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
          T_out is the number of steps in the output time series, M is num_mels, and
          values are entries in the mel spectrogram. Only needed for training.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError('no mel targets were provided but token_targets were given')
    if mel_targets is not None and stop_token_targets is None:
        raise ValueError('Mel targets are provided without corresponding token_targets')

    with tf.variable_scope('inference') as scope:
        is_training = mel_targets is not None and not gta
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings ==> [batch_size, sequence_length, embedding_dim]
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
        encoder_cell = TacotronEncoderCell(
            EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size,
                                channels=hp.enc_conv_channels, scope='encoder_convolutions'),
            EncoderRNN(is_training, size=hp.encoder_lstm_units,
                       zoneout=hp.zoneout_rate, scope='encoder_LSTM'))
        encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

        # For shape visualization purposes
        enc_conv_output_shape = encoder_cell.conv_output_shape

        # Decoder parts
        # Attention decoder prenet
        prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet')
        # Attention mechanism
        attention_mechanism = LocationSensitiveAttention(
            hp.attention_dim, encoder_outputs, mask_encoder=hp.mask_encoder,
            memory_sequence_length=input_lengths, smoothing=hp.smoothing)
        # Decoder LSTM cells
        decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                  size=hp.decoder_lstm_units, zoneout=hp.zoneout_rate,
                                  scope='decoder_lstm')
        # Frame projection layer
        frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
        # <stop_token> projection layer
        stop_projection = StopProjection(is_training, scope='stop_token_projection')

        # Decoder cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
        decoder_cell = TacotronDecoderCell(
            prenet, attention_mechanism, decoder_lstm, frame_projection, stop_projection,
            mask_finished=hp.mask_finished)

        # Define the helper for our decoder
        if is_training or gta:
            self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets,
                                             hp.num_mels, hp.outputs_per_step,
                                             hp.teacher_forcing_ratio)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # Initial decoder state
        decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Only use max iterations at synthesis time
        max_iters = hp.max_iters if not is_training else None

        # Decode
        (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
            CustomDecoder(decoder_cell, self.helper, decoder_init_state),
            impute_finished=hp.impute_finished,
            maximum_iterations=max_iters)

        # Reshape outputs to be one output per entry
        # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
        decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
        stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

        # Postnet
        postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size,
                          channels=hp.postnet_channels, scope='postnet_convolutions')

        # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
        residual = postnet(decoder_output)

        # Project residual to the same dimension as the mel spectrogram
        # ==> [batch_size, decoder_steps * r, num_mels]
        residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
        projected_residual = residual_projection(residual)

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.stop_token_prediction = stop_token_prediction
        self.stop_token_targets = stop_token_targets
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               {}'.format(embedded_inputs.shape))
        log('  enc conv out:            {}'.format(enc_conv_output_shape))
        log('  encoder out:             {}'.format(encoder_outputs.shape))
        log('  decoder out:             {}'.format(decoder_output.shape))
        log('  residual out:            {}'.format(residual.shape))
        log('  projected residual out:  {}'.format(projected_residual.shape))
        log('  mel out:                 {}'.format(mel_outputs.shape))
        log('  <stop_token> out:        {}'.format(stop_token_prediction.shape))
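stop_token_prediction is only reshaped here; this snippet doesn't show how it actually ends a synthesis. A hedged sketch of the usual consumption at inference time, sigmoid then threshold, where the 0.5 threshold is an assumption rather than something taken from this code:

import numpy as np

def should_stop(stop_token_logits, threshold=0.5):
    # Sigmoid the [N, T] logits and stop once any frame crosses the threshold.
    probs = 1.0 / (1.0 + np.exp(-stop_token_logits))
    return (probs > threshold).any(axis=-1)   # per-example stop decision

print(should_stop(np.array([[-4.0, -2.0, 3.0]])))  # [ True]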
def __init__(self, coordinator, data_dirs, hparams, config, batches_per_group, data_type, batch_size):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hp = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._step = 0
    self._offset = defaultdict(lambda: 2)
    self._batches_per_group = batches_per_group
    self.rng = np.random.RandomState(config.random_seed)
    self.data_type = data_type
    self.batch_size = batch_size

    self.min_tokens = hparams.min_tokens
    self.min_n_frame = hparams.reduction_factor * hparams.min_iters
    self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
    self.skip_path_filter = config.skip_path_filter

    # Load metadata:
    self.path_dict = get_path_dict(data_dirs, self._hp, config, self.data_type,
                                   n_test=self.batch_size, rng=self.rng)

    self.data_dirs = list(self.path_dict.keys())
    self.data_dir_to_id = {
        data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}

    data_weight = {data_dir: 1. for data_dir in self.data_dirs}

    if self._hp.main_data_greedy_factor > 0 and \
            any(main_data in data_dir for data_dir in self.data_dirs
                for main_data in self._hp.main_data):
        for main_data in self._hp.main_data:
            for data_dir in self.data_dirs:
                if main_data in data_dir:
                    data_weight[data_dir] += self._hp.main_data_greedy_factor

    weight_Z = sum(data_weight.values())
    self.data_ratio = {
        data_dir: weight / weight_Z for data_dir, weight in data_weight.items()
    }

    log("=" * 40)
    log(pprint.pformat(self.data_ratio, indent=4))
    log("=" * 40)

    #audio_paths = [path.replace("/data/", "/audio/"). \
    #        replace(".npz", ".wav") for path in self.data_paths]
    #duration = get_durations(audio_paths, print_detail=False)

    # Create placeholders for inputs and targets. Don't specify batch size because we
    # want to be able to feed different sized batches at eval time.
    self._placeholders = [
        tf.placeholder(tf.int32, [None, None], 'inputs'),
        tf.placeholder(tf.int32, [None], 'input_lengths'),
        tf.placeholder(tf.float32, [None], 'loss_coeff'),
        tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
        tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
    ]

    # Create a queue for buffering data:
    dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

    self.is_multi_speaker = len(self.data_dirs) > 1

    if self.is_multi_speaker:
        self._placeholders.append(
            tf.placeholder(tf.int32, [None], 'speaker_id'))
        dtypes.append(tf.int32)

    num_worker = 8 if self.data_type == 'train' else 1
    queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)

    if self.is_multi_speaker:
        self.inputs, self.input_lengths, self.loss_coeff, \
                self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue()
    else:
        self.inputs, self.input_lengths, self.loss_coeff, \
                self.mel_targets, self.linear_targets = queue.dequeue()

    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.loss_coeff.set_shape(self._placeholders[2].shape)
    self.mel_targets.set_shape(self._placeholders[3].shape)
    self.linear_targets.set_shape(self._placeholders[4].shape)

    if self.is_multi_speaker:
        self.speaker_id.set_shape(self._placeholders[5].shape)
    else:
        self.speaker_id = None

    if self.data_type == 'test':
        examples = []
        while True:
            for data_dir in self.data_dirs:
                examples.append(self._get_next_example(data_dir))
                #print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                if len(examples) >= self.batch_size:
                    break
            if len(examples) >= self.batch_size:
                break
        self.static_batches = [examples for _ in range(self._batches_per_group)]
    else:
        self.static_batches = None
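This feeder is a thread whose _enqueue_op pushes one group of batches at a time into a bounded tf.FIFOQueue while the training loop dequeues on the other side. The producer/consumer shape of that, sketched with the stdlib queue module instead of TensorFlow (all names below are illustrative, not taken from this code):

import queue
import threading

buf = queue.Queue(maxsize=8)  # plays the role of the bounded tf.FIFOQueue

def enqueue_groups(make_batch, n_groups):
    for step in range(n_groups):
        buf.put(make_batch(step))  # blocks when the buffer is full, like _enqueue_op

producer = threading.Thread(target=enqueue_groups, args=(lambda s: ('batch', s), 4), daemon=True)
producer.start()
for _ in range(4):
    print(buf.get())  # the training loop's dequeue side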
def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
    """Initializes the model for inference.

    Sets "mel_outputs" and "alignments" fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the
          number of steps in the input time series, and values are character IDs.
        - input_lengths: int32 Tensor with shape [N] where N is batch size and values
          are the lengths of each sequence in inputs.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
          T_out is the number of steps in the output time series, M is num_mels, and
          values are entries in the mel spectrogram. Only needed for training.
    """
    with tf.variable_scope('inference') as scope:
        is_training = mel_targets is not None and not gta
        batch_size = tf.shape(inputs)[0]
        hp = self._hparams

        # Embeddings
        embedding_table = tf.get_variable(
            'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

        # Encoder
        enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training,
                                           kernel_size=hp.enc_conv_kernel_size,
                                           channels=hp.enc_conv_channels)

        # The paper doesn't specify what to do with the final encoder state,
        # so we simply drop it.
        encoder_outputs, encoder_states = bidirectional_LSTM(
            enc_conv_outputs, input_lengths, 'encoder_LSTM',
            is_training=is_training, size=hp.encoder_lstm_units, zoneout=hp.zoneout_rate)

        # Attention: a separate LSTM for the attention mechanism,
        # based on the original Tacotron architecture.
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(
                ZoneoutLSTMCell(hp.attention_dim, is_training,
                                zoneout_factor_cell=hp.zoneout_rate,
                                zoneout_factor_output=hp.zoneout_rate),
                is_training),
            LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
            alignment_history=True,
            output_attention=False,
            name='attention_cell')

        # Concat prenet output with context vector
        concat_cell = ConcatPrenetAndAttentionWrapper(attention_cell)

        # Decoder layers (attention prenet + 2 unidirectional LSTM cells)
        decoder_cell = unidirectional_LSTM(concat_cell, is_training,
                                           layers=hp.decoder_layers,
                                           size=hp.decoder_lstm_units,
                                           zoneout=hp.zoneout_rate)

        # Concat LSTM output with context vector
        concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

        # Linear projection to mel-spectrogram dimension (times the number of outputs per step)
        output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step)

        # Define the helper for our decoder
        if is_training or gta:
            self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
        else:
            self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

        # We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
        max_iterations = None if is_training else hp.max_iters

        # Initial decoder state
        decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        # Decode
        (decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
            CustomDecoder(output_cell, self.helper, decoder_init_state),
            impute_finished=True,  # cut out padded parts (enabled)
            maximum_iterations=max_iterations)

        # Reshape outputs to be one output per entry
        decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

        # Compute residual using post-net
        residual = postnet(decoder_output, is_training,
                           kernel_size=hp.postnet_kernel_size,
                           channels=hp.postnet_channels)

        # Project residual to the same dimension as the mel spectrogram
        projected_residual = projection(residual, shape=hp.num_mels, scope='residual_projection')

        # Compute the mel spectrogram
        mel_outputs = decoder_output + projected_residual

        # Grab alignments from the final decoder state
        alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.decoder_output = decoder_output
        self.alignments = alignments
        self.mel_outputs = mel_outputs
        self.mel_targets = mel_targets

        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               {}'.format(embedded_inputs.shape))
        log('  enc conv out:            {}'.format(enc_conv_outputs.shape))
        log('  encoder out:             {}'.format(encoder_outputs.shape))
        log('  decoder out:             {}'.format(decoder_output.shape))
        log('  residual out:            {}'.format(residual.shape))
        log('  projected residual out:  {}'.format(projected_residual.shape))
        log('  mel out:                 {}'.format(mel_outputs.shape))
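The helper choice above encodes a subtlety worth calling out: GTA (ground-truth-aligned) synthesis uses the training helper even though is_training is False, so an already-trained model re-synthesizes its data with teacher forcing, typically to build a training set for a vocoder. A sketch of just that branch logic, with strings standing in for the helper classes:

def pick_helper(is_training, gta):
    # Strings stand in for TacoTrainingHelper / TacoTestHelper.
    if is_training or gta:
        return 'TacoTrainingHelper'   # ground-truth frames fed back (teacher forcing)
    return 'TacoTestHelper'           # model's own previous output fed back

assert pick_helper(is_training=False, gta=True) == 'TacoTrainingHelper'
assert pick_helper(is_training=False, gta=False) == 'TacoTestHelper'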