Example #1
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self.batch_size
        r = self._hp.reduction_factor

        if self.static_batches is not None:
            batches = self.static_batches
        else:
            examples = []
            for data_dir in self.data_dirs:
                if self._hp.initial_data_greedy:
                    if self._step < self._hp.initial_phase_step and \
                            any("krbook" in data_dir for data_dir in self.data_dirs):
                        data_dir = [data_dir for data_dir in self.data_dirs if "krbook" in data_dir][0]

                if self._step < self._hp.initial_phase_step:
                    example = [self._get_next_example(data_dir) \
                            for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))]
                else:
                    example = [self._get_next_example(data_dir) \
                            for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))]
                examples.extend(example)
            examples.sort(key=lambda x: x[-1])

            batches = [examples[i:i+n] for i in range(0, len(examples), n)]
            self.rng.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._step += 1
Example #2
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self.batch_size   # 32
        r = self._hp.reduction_factor  # 4 or 5; was used to compute min_n_frame and max_n_frame

        if self.static_batches is not None:  # 'test' mode uses static_batches, which were already built in __init__.
            batches = self.static_batches
        else: # 'train'
            examples = []
            for data_dir in self.data_dirs:
                if self._hp.initial_data_greedy:
                    if self._step < self._hp.initial_phase_step and any("krbook" in data_dir for data_dir in self.data_dirs):
                        data_dir = [data_dir for data_dir in self.data_dirs if "krbook" in data_dir][0]

                if self._step < self._hp.initial_phase_step:  # 'initial_phase_step': 8000
                    example = [self._get_next_example(data_dir) for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))]  # build enough examples for _batches_per_group (8 or 32) batches; each batch has size 2 or 32
                else:
                    example = [self._get_next_example(data_dir) for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))]
                examples.extend(example)
            examples.sort(key=lambda x: x[-1])  # the key is the last element, i.e. sort by len(linear_target)

            batches = [examples[i:i+n] for i in range(0, len(examples), n)]
            self.rng.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
        for batch in batches:  # batches is a group of individual batches
            # Feed the batch data (built for test or train mode) to the placeholders.
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type)))   # _prepare_batch pads the batch data to equal lengths; the return order matches the placeholder order
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._step += 1
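_prepare_batch itself does not appear in these snippets; according to the comments above it pads every example in a batch to a common length before the feed, with the number of target frames tied to the reduction factor r. A minimal sketch of that kind of padding, using hypothetical helper names rather than the project's actual API:

import numpy as np

def _pad_inputs(sequences, pad_value=0):
    # Pad 1-D token-id sequences to the length of the longest one in the batch.
    max_len = max(len(s) for s in sequences)
    return np.stack([np.pad(s, (0, max_len - len(s)), constant_values=pad_value)
                     for s in sequences])

def _pad_targets(targets, r, pad_value=0.0):
    # Pad [time, dim] targets to a common frame count, rounded up to a multiple
    # of the reduction factor r so the decoder always emits whole output groups.
    max_len = max(t.shape[0] for t in targets)
    max_len += (-max_len) % r
    return np.stack([np.pad(t, ((0, max_len - t.shape[0]), (0, 0)),
                            constant_values=pad_value)
                     for t in targets])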
Example #3
	def __init__(self, coordinator, metadata_filename, hparams):
		super(Feeder, self).__init__()
		self._coord = coordinator
		self._hparams = hparams
		self._clearner_names = [x.strip() for x in hparams.cleaners.split(',')]
		self._offset = 0

		# Load metadata
		self._datadir = os.path.dirname(metadata_filename)
		print(metadata_filename)
		with open(metadata_filename, encoding='utf-8') as f:
			self._metadata = [line.strip().split('|') for line in f]
			hours = sum([int(x[1]) for x in self._metadata]) * hparams.frame_shift_ms / (3600 * 1000)
			log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))

		# Create placeholders for inputs and targets. Don't specify batch size because we want
		# to be able to feed different batch sizes at eval time.
		self._placeholders = [
		tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
		tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
		tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
		#tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets')
		]

		# Create queue for buffering data
		queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32], name='input_queue')
		self._enqueue_op = queue.enqueue(self._placeholders)
		self.inputs, self.input_lengths, self.mel_targets = queue.dequeue()
		self.inputs.set_shape(self._placeholders[0].shape)
		self.input_lengths.set_shape(self._placeholders[1].shape)
		self.mel_targets.set_shape(self._placeholders[2].shape)
Example #4
 def make_test_batches(self):
     start = time.time()
     # Read a group of examples
     n = self._hparams.batch_size
     r = self._hparams.reduction_factor
     # Test on entire test set
     examples = []
     examples_list = []
     examples_size = []
     data_ratio = 1.0 / len(self._test_meta_list)
     for idx, _test_meta in enumerate(self._test_meta_list):
         example = [
             self._get_next_example(_test_meta, idx)
             for _ in range(int(n * self._batches_per_group * data_ratio))
         ]
         example.sort(key=lambda x: x[-1])
         examples_size.append(len(example))
         examples_list.append(example)
     examples_size.sort(reverse=True)
     max_step = examples_size[0] if len(examples_size) > 0 else 0
     num_vec = len(examples_size)
     for index in range(max_step):
         for num in range(num_vec):
             if examples_size[num] > index:
                 example = examples_list[num][index]
                 examples.append(example)
     batches = [examples[i:i + n] for i in range(0, len(examples), n)]
     np.random.shuffle(batches)
     log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
         len(batches), n,
         time.time() - start))
     return batches, r
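The double loop above interleaves the per-speaker example lists (the first example of every list, then the second, and so on) before slicing them into batches of n, so each batch mixes speakers with similar lengths. An equivalent, shorter sketch of that interleaving, assuming the examples_list built above:

from itertools import chain, zip_longest

# Take index 0 from every per-speaker list, then index 1, and so on,
# skipping lists that have already run out (the fill marker is dropped).
_FILL = object()
examples = [ex for ex in chain.from_iterable(zip_longest(*examples_list, fillvalue=_FILL))
            if ex is not _FILL]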
Example #5
def wavenet_synthesize(hparams, checkpoint):
	output_dir = hparams.synth_output_dir

	try:
		checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
		log('loaded model at {}'.format(checkpoint_path))
	except:
		raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))

	run_synthesis(checkpoint_path, output_dir, hparams)
Example #6
def synthesize(args, hparams, checkpoint, sentences=None):
    output_dir = 'centaur_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))

    run_eval(checkpoint_path, output_dir, hparams, sentences)
Example #7
    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.batch_size
            # Bucket examples based on similar output sequence length for efficiency
            examples = []  # holds the prefetched training examples
            examples_list = []  # one example list per speaker
            examples_size = []  # number of examples in each list
            data_ratio = 1.0 / len(self._train_meta_list)
            for idx, _train_meta in enumerate(self._train_meta_list):
                if self._start_step < self._hparams.initial_phase_step:  # 'initial_phase_step': 8000
                    example = [
                        self._get_next_example(_train_meta, idx)
                        for _ in range(
                            int(n * self._batches_per_group //
                                len(self.data_dirs)))
                    ]
                else:
                    example = [
                        self._get_next_example(_train_meta, idx)
                        for _ in range(
                            int(n * self._batches_per_group * data_ratio))
                    ]
                example.sort(key=lambda x: x[-1])
                examples_size.append(len(example))
                examples_list.append(example)
            examples_size.sort(reverse=True)
            max_step = examples_size[0] if len(examples_size) > 0 else 0
            num_vec = len(examples_size)
            for index in range(max_step):
                for num in range(num_vec):
                    if examples_size[num] > index:
                        example = examples_list[num][index]
                        examples.append(example)
            batches = [examples[i:i + n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)

            log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(
                len(batches), n,
                time.time() - start))
            for batch in batches:
                feed_dict = dict(
                    zip(self._placeholders, self._prepare_batch(batch)))
                self._session.run(self._enqueue_op, feed_dict=feed_dict)
                self._start_step += 1
Example #8
	def _enqueue_next_group(self):
		start = time.time()

		# Read a group of examples
		n = self._hparams.batch_size
		r = self._hparams.outputs_per_step
		examples = [self._get_next_example() for i in range(n * _batches_per_group)]

		# Bucket examples based on similar output sequence length for efficiency
		examples.sort(key=lambda x: x[-1])
		batches = [examples[i: i+n] for i in range(0, len(examples), n)]
		np.random.shuffle(batches)

		log('\nGenerated {} batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
		for batch in batches:
			feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
			self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example #9
    def make_test_batches(self):
        start = time.time()

        #Read one example for evaluation
        n = 1

        #Test on entire test set (one sample at an evaluation step)
        examples = [
            self._get_test_groups() for i in range(len(self._test_meta))
        ]
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches
Example #10
    def _enqueue_next_group(self):
        start = time.time()

        #Read a group of samples
        n = self._hparams.batch_size
        examples = [
            self._get_next_example() for i in range(n * _batches_per_group)
        ]

        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example #11
    def load(self, checkpoint_path, hparams, model_name='WaveNet'):
        log('Constructing model: {}'.format(model_name))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=(None, None, hparams.num_mfccs),
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(None, 1),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None
        self.input_lengths = tf.placeholder(
            tf.int32, shape=(1, ),
            name='input_lengths') if hparams.wavenet_synth_debug else None
        self.synth_debug = hparams.wavenet_synth_debug

        with tf.variable_scope('WaveNet_model') as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=self.input_lengths,
                                  synthesis_length=self.synthesis_length,
                                  test_inputs=None)

            self._hparams = hparams
            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            #Memory allocation on the GPU as needed
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            config.allow_soft_placement = True

            self.session = tf.Session(config=config)
            self.session.run(tf.global_variables_initializer())

        load_averaged_model(self.session, sh_saver, checkpoint_path)
Example #12
def run_eval(checkpoint_path, output_dir, hparams, sentences):
    # Create output path if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    # Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.synthesis_batch_size]
        for i in range(0, len(sentences), hparams.synthesis_batch_size)
    ]

    log('Starting Synthesis')
    for i, texts in enumerate(tqdm(sentences)):
        basenames = ['{}_sentence_{}'.format(i, j) for j in range(len(texts))]
        synth.synthesize(texts, basenames, output_dir, None)
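A hypothetical call showing how run_eval is meant to be driven; the checkpoint path, output directory and sentences below are placeholders, not taken from the source:

# hparams is assumed to be the project's hyper-parameter object.
sentences = ['Printing, in the only sense with which we are at present concerned.',
             'In being comparatively modern.']
run_eval('logs-centaur/pretrained/model.ckpt-100000', 'centaur_output', hparams, sentences)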
Example #13
def get_path_dict(data_dirs,
                  hparams,
                  config,
                  data_type,
                  n_test=None,
                  rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:  # ['datasets/moon\\data']
        paths = glob(
            "{}/*.npz".format(data_dir)
        )  # ['datasets/moon\\data\\001.0000.npz', 'datasets/moon\\data\\001.0001.npz', 'datasets/moon\\data\\001.0002.npz', ...]

        if data_type == 'train':
            rng.shuffle(
                paths
            )  # ['datasets/moon\\data\\012.0287.npz', 'datasets/moon\\data\\004.0215.npz', 'datasets/moon\\data\\003.0149.npz', ...]

        if not config.skip_path_filter:
            # items = parallel_run( get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True)  # [('datasets/moon\\data\\012.0287.npz', 130, 21), ('datasets/moon\\data\\003.0149.npz', 209, 37), ...]
            items = []
            for path in paths:
                item = get_frame(path)
                items.append(item)

            min_n_frame = hparams.min_n_frame  # 5*30
            max_n_frame = hparams.max_n_frame - 1  # 5*200 - 5

            # A lot of data is dropped at this step; utterances with too few characters are filtered out.
            new_items = [
                (path, n) for path, n, n_tokens in items
                if min_n_frame <= n <= max_n_frame
                and n_tokens >= hparams.min_tokens
            ]  # [('datasets/moon\\data\\004.0383.npz', 297), ('datasets/moon\\data\\003.0533.npz', 394),...]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames, hparams)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(
                data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        # Split into train data and test data.
        if data_type == 'train':
            new_paths = new_paths[:-n_test]  # everything except the last n_test (= batch_size) paths
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]  # the last n_test paths
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[
            data_dir] = new_paths  # ['datasets/moon\\data\\001.0621.npz', 'datasets/moon\\data\\003.0229.npz', ...]

    return path_dict
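frames_to_hours is not defined in any of these snippets. Judging from Example #3, which computes hours as frames * frame_shift_ms / (3600 * 1000), a plausible sketch is:

def frames_to_hours(n_frames, hparams):
    # Assumes hparams.frame_shift_ms is in milliseconds, as in Example #3; if the
    # project derives the shift from hop_size / sample_rate (as in Example #19),
    # adjust accordingly.
    return sum(n_frames) * hparams.frame_shift_ms / (3600 * 1000)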
Example #14
    def initialize(self, inputs, targets=None):
        """
		Initializes the model for inference

		set "output" field.

		Args:
			- inputs: int32 tensor with shape [batch_size, time_steps] where time steps
			is typically the number of words in each input sentence
			- targets: int32 tensor with shape [batch_size, num_classes] which represents the true labels.
			Only used in training time. 
		"""
        with tf.variable_scope('inference') as scope:
            is_training = targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            #Embeddings
            embedding_table = tf.get_variable(
                'intputs_embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder
            enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training)
            encoder_outputs, encoder_states = bidirectional_LSTM(
                enc_conv_outputs, 'encoder_LSTM', is_training=is_training)

            #Prediction/projection
            projection_shape = [512, 512]
            projected = projection_layers(inputs,
                                          is_training=is_training,
                                          shape=projection_shape,
                                          activation=tf.nn.relu)

            #Logit Layer
            output = logit_layer(projected, logits_dim=hp.num_classes)

            self.inputs = inputs
            self.output = output
            self.targets = targets
            log('Initialized Analyser model. Dimensions: ')
            log('  embedding:               {}'.format(
                embedded_inputs.shape[-1]))
            log('  enc conv out:            {}'.format(
                enc_conv_outputs.shape[-1]))
            log('  encoder out:             {}'.format(
                encoder_outputs.shape[-1]))
            log('  output:                  {}'.format(output.shape[-1]))
Example #15
def run_synthesis(checkpoint_path, output_dir, hparams):
    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')
    embed_dir = os.path.join(output_dir, 'embeddings')


    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    metadata_filename = os.path.join(hparams.wavenet_synth, 'map.txt')
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = np.array([line.strip().split('|') for line in f])
        if (hparams.synth_mode == "all") and (hparams.synth_idx != None):
            # if synth mode is all and synth_idx is not None, extract a part of metadata
            metadata = metadata[hparams.synth_idx[0]:hparams.synth_idx[1], :]


    # speaker ids from trained speakers list
    speaker_ids = metadata[:, 3]
    print("spk_ids" +str(speaker_ids.shape))
    mel_files = metadata[:, 1]
    print("mel_files" +str(mel_files.shape))

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(embed_dir, exist_ok=True)

    synth_dict = load_synthesis_dict()

    for idx, mel_file in enumerate(tqdm(mel_files)):
        print("idx")
        print(idx)
        mel_spectro = [np.load(mel_file)]
        basenames = [os.path.basename(mel_file).replace('.npy', '')]
        speaker_id = [speaker_ids[idx]]
        print("synthesizing {}".format(basenames[0]))

        if hparams.synth_mode == "all":
            if basenames[0].split('-')[1] in synth_dict.keys():
                print("Synthesizing both wav and embedding")
                synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=False)
            else:
                print("Synthesizing embedding only")
                synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=True)
        elif hparams.synth_mode == "embedding":
            print("Synthesizing embedding only")
            synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=True)
        elif hparams.synth_mode == "wav":
            if basenames[0].split('-')[1] in synth_dict.keys():
                synth.synthesize(mel_spectro, speaker_id, basenames, wav_dir, log_dir, embed_dir, embed_only=False)
        else:
            print("Not supported synth mode.")




    log('synthesized audio waveforms at {}'.format(wav_dir))
Example #16
    def load(self, checkpoint_path, hparams, freezer=False):
        log('Constructing model: Centaur')
        if freezer:
            try:
                checkpoint_path = tf.train.get_checkpoint_state(checkpoint_path).model_checkpoint_path
            except:
                raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint_path))
        # Force the batch size to be known in order to use attention masking in batch synthesis
        self.inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
        self.input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')

        with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
            self.model = create_model(hparams)
            self.model.initialize(self.inputs, self.input_lengths, is_training=False,
                                  is_validation=False, is_prediction=True)
            self.mel_outputs = self.model.decoder_predictions
            self.linear_outputs = self.model.mag_pred
            self.alignments = self.model.alignments
            self.wav_output = self.model.audio
            self.stop_token_prediction = self.model.stop_token_predictions
            self.audio_length = self.model.sequence_lengths

        self._hparams = hparams
        # pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0

        log('Loading checkpoint: %s' % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example #17
def get_path_dict(data_dirs,
                  hparams,
                  config,
                  data_type,
                  n_test=None,
                  rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(get_frame,
                                 paths,
                                 desc="filter_by_min_max_frame_batch",
                                 parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

            new_items = [(path, n) for path, n, n_tokens in items \
                    if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                new_items = [item for item in new_items \
                        if any(check not in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'. \
                    format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir,
                                              max(new_n_frames, default=0)))
            log(' [{}] Min length: {}'.format(data_dir,
                                              min(new_n_frames, default=0)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
Example #18
def get_path_dict(data_dirs, hparams, config,data_type, n_test=None,rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:  # ['datasets/moon\\data']
        paths = glob("{}/*.npz".format(data_dir)) # ['datasets/moon\\data\\001.0000.npz', 'datasets/moon\\data\\001.0001.npz', 'datasets/moon\\data\\001.0002.npz', ...]

        if data_type == 'train':
            rng.shuffle(paths)  # ['datasets/moon\\data\\012.0287.npz', 'datasets/moon\\data\\004.0215.npz', 'datasets/moon\\data\\003.0149.npz', ...]

        if not config.skip_path_filter:
            items = parallel_run( get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True)  # [('datasets/moon\\data\\012.0287.npz', 130, 21), ('datasets/moon\\data\\003.0149.npz', 209, 37), ...]

            min_n_frame = hparams.reduction_factor * hparams.min_iters   # 5*30
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor  # 5*200 - 5
            
            # A lot of data is dropped at this step; utterances with too few characters are filtered out.
            new_items = [(path, n) for path, n, n_tokens in items if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens] # [('datasets/moon\\data\\004.0383.npz', 297), ('datasets/moon\\data\\003.0533.npz', 394),...]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                new_items = [item for item in new_items if any(check not in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames,hparams)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'.format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths  # ['datasets/moon\\data\\001.0621.npz', 'datasets/moon\\data\\003.0229.npz', ...]

    return path_dict
Example #19
    def __init__(self, coordinator, input_path, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._start_step = 0
        self._batches_per_group = 32
        self._train_offset_list = []
        self._test_offset_list = []
        self._pad = 0
        # Load metadata
        input_paths = [input_path]
        if not os.path.exists(os.path.join(input_path, 'train.txt')):
            path_list = os.listdir(input_path)
            input_paths = []
            for name in path_list:
                input_paths.append(
                    os.path.abspath(os.path.join(input_path, name)))
        self.data_dirs = input_paths
        all_hours = 0.0
        metadata_size = 0
        self._metadata_list = []
        for input_path in input_paths:
            with open(os.path.join(input_path, 'train.txt'),
                      encoding='utf-8') as f:
                metadata_vec = []
                for line in f:
                    npz_filename, time_steps, mel_frames, text = line.strip(
                    ).split('|')
                    metadata_vec.append([
                        os.path.join(input_path,
                                     os.path.basename(npz_filename)),
                        time_steps, mel_frames, text
                    ])
                self._metadata_list.append(metadata_vec)
                frame_shift_ms = hparams.hop_size / hparams.sample_rate
                hours = sum([int(x[2])
                             for x in metadata_vec]) * frame_shift_ms / 3600
                all_hours += hours
                log('Loaded metadata for {} examples ({:.2f} hours)'.format(
                    len(metadata_vec), hours))
                metadata_size += len(metadata_vec)
                self._train_offset_list.append(0)
                self._test_offset_list.append(0)
        log('Loaded ({:.2f} hours)'.format(all_hours))
        # Train test split
        if hparams.test_size is None:
            assert hparams.test_batches is not None
        test_size = (hparams.test_size if hparams.test_size is not None else
                     hparams.test_batches * hparams.batch_size)
        self._train_meta_list = []
        self._test_meta_list = []
        if self._hparams.symmetric_mels:
            self._pad_value = -self._hparams.max_abs_value
        else:
            self._pad_value = 0.
        data_ratio = 1.0 / len(self._metadata_list)
        test_size = test_size * data_ratio
        sum_test_meta = 0
        for metadata in self._metadata_list:
            indices = np.arange(len(metadata))
            train_indices, test_indices = train_test_split(
                indices,
                test_size=test_size,
                random_state=hparams.data_random_state)
            # Make sure test_indices is a multiple of batch_size else round down
            len_test_indices = self._round_down(len(test_indices),
                                                hparams.batch_size)
            extra_test = test_indices[len_test_indices:]
            test_indices = test_indices[:len_test_indices]
            train_indices = np.concatenate([train_indices, extra_test])
            _train_meta = list(np.array(metadata)[train_indices])
            _test_meta = list(np.array(metadata)[test_indices])
            sum_test_meta += len(_test_meta)
            self._train_meta_list.append(_train_meta)
            self._test_meta_list.append(_test_meta)
        self.test_steps = sum_test_meta // hparams.batch_size
        if hparams.test_size is None:
            assert hparams.test_batches == self.test_steps

        with tf.device('/cpu:0'):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.

            self._placeholders = [
                tf.placeholder(tf.int32, [None, None], 'inputs'),
                tf.placeholder(tf.int32, [None], 'input_lengths'),
                tf.placeholder(
                    tf.float32,
                    [None, None, hparams.num_mels + hparams.num_freq],
                    'target_mels'),
                tf.placeholder(tf.int32, (None, ), 'target_lengths'),
                tf.placeholder(tf.float32, (None, None), 'stop_tokens'),
            ]
            dtypes = [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32]
            # Create queue for buffering data
            queue = tf.FIFOQueue(8, dtypes, name='input_queue')
            self._enqueue_op = queue.enqueue(self._placeholders)

            self.inputs, self.input_lengths, self.target_feats, self.target_lengths, self.stop_tokens = queue.dequeue(
            )
            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.target_feats.set_shape(self._placeholders[2].shape)
            self.target_lengths.set_shape(self._placeholders[3].shape)
            self.stop_tokens.set_shape(self._placeholders[4].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, dtypes, name='eval_queue')
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)

            self.eval_inputs, self.eval_input_lengths, self.eval_target_feats, self.eval_target_lengths, self.eval_stop_tokens, = eval_queue.dequeue(
            )

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_target_feats.set_shape(self._placeholders[2].shape)
            self.eval_target_lengths.set_shape(self._placeholders[3].shape)
            self.eval_stop_tokens.set_shape(self._placeholders[4].shape)
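_round_down is used for the train/test split above but is not shown; a minimal sketch, assuming it simply rounds a count down to the nearest multiple of the batch size:

    def _round_down(self, x, multiple):
        # e.g. _round_down(37, 32) == 32, so the test set fills whole batches.
        return x - (x % multiple)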
Example #20
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this also becomes True at test time; is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from the transformer implementation
                # <PAD> (index 0) keeps an all-zero embedding that is never trained; i.e. the first row (<PAD>) of the variable created by get_variable above is not used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The 'simple' model feeds speaker_embed into DecoderPrenetWrapper and ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unkown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # case: self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # initial state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitivity Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 reimplemented AttentionWrapper from the TensorFlow source, whereas Keith Ito simply used TensorFlow's AttentionWrapper as-is.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size was not set, so the attention output equals the context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState members (attention, cell_state, ...) that dec_prenet_outputs passes to the next cell, concatenate attention and output and emit that as the new output.
            # Since the output equals the cell_state, the concat is [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds a new output as concat(output, attention, speaker_embed).

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could this be modified to also emit a stop token, i.e. (hp.num_mels + 1) * hp.reduction_factor?
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here also includes the values already supplied to the AttentionWrapper above.

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied: it was passed as the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # MultiRNNCell has 3 layers, so final_decoder_state is a 3-tuple  ==> final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
Example #21
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):
        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            # [N, T_in, embedding_size]
            char_embedded_inputs = \
                tf.nn.embedding_lookup(char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(speaker_id,
                                                   self.num_speakers,
                                                   hp.enc_prenet_sizes[-1],
                                                   "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:

                        def deep_dense(x, dim):
                            return tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unkown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(char_embedded_inputs,
                                    is_training,
                                    hp.enc_prenet_sizes,
                                    hp.dropout_prob,
                                    scope='prenet')

            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)]
            for _ in range(hp.dec_layer_num):
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = \
                    tf.concat([tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
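
The is_manual_attention / manual_alignments placeholders above let a caller bypass the learned attention at synthesis time and feed alignments directly. A minimal sketch of preparing such a feed; the axis order [N, T_enc, T_dec] and the model / session names are assumptions for illustration, not taken from the code above:

    import numpy as np

    # A hand-made, roughly diagonal alignment for one utterance
    # (assumed axis order: [batch, encoder_steps, decoder_steps]).
    batch, enc_steps, dec_steps = 1, 75, 200
    manual = np.zeros([batch, enc_steps, dec_steps], dtype=np.float32)
    for t in range(dec_steps):
        manual[0, min(int(t * enc_steps / dec_steps), enc_steps - 1), t] = 1.0

    # Hypothetical feed at synthesis time:
    # feed_dict = {model.is_manual_attention: True, model.manual_alignments: manual}
    # linear = session.run(model.linear_outputs, feed_dict=feed_dict)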
Beispiel #22
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id=None,
                   mel_targets=None,
                   linear_targets=None,
                   is_training=False,
                   loss_coeff=None,
                   stop_token_targets=None):

        with tf.variable_scope('Eembedding') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a Transformer implementation.
                # The embedding of <PAD> (index 0) is fixed to zeros and never trained; i.e. the first row
                # of the variable created by get_variable above (the <PAD> row) is effectively unused.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

                deep_dense = lambda x, dim, name: tf.layers.dense(
                    x, dim, activation=tf.nn.softsign, name=name
                )  # softsign: x / (abs(x) + 1)

                encoder_rnn_init_state = deep_dense(
                    speaker_embed, hp.encoder_lstm_units * 4,
                    'encoder_init_dense')  # hp.encoder_lstm_units = 256

                decoder_rnn_init_states = [
                    deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                               'decoder_init_dense_{}'.format(i))
                    for i in range(hp.decoder_layers)
                ]  # hp.decoder_lstm_units = 1024

                speaker_embed = None
            else:
                # case: self.num_speakers == 1
                speaker_embed = None
                encoder_rnn_init_state = None  # init state of the bidirectional encoder RNN
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

        with tf.variable_scope('Encoder') as scope:
            ##############
            # Encoder
            ##############
            x = char_embedded_inputs
            for i in range(hp.enc_conv_num_layers):
                x = tf.layers.conv1d(x,
                                     filters=hp.enc_conv_channels,
                                     kernel_size=hp.enc_conv_kernel_size,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='dropout_{}'.format(i))

            if encoder_rnn_init_state is not None:
                initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                    encoder_rnn_init_state, 4, 1)
                initial_state_fw = LSTMStateTuple(initial_state_fw_c,
                                                  initial_state_fw_h)
                initial_state_bw = LSTMStateTuple(initial_state_bw_c,
                                                  initial_state_bw_h)
            else:  # single mode
                initial_state_fw, initial_state_bw = None, None

            cell_fw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_bw_LSTM')
            encoder_conv_output = x
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                dtype=tf.float32)

            # encoder_outputs = [N,T,2*encoder_lstm_units] = [N,T,512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            ##############
            # Attention
            ##############
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=input_lengths,
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            decoder_lstm = [
                ZoneoutLSTMCell(hp.decoder_lstm_units,
                                is_training,
                                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                zoneout_factor_output=hp.tacotron_zoneout_rate,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(hp.decoder_layers)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # if zero_state is called here, any values already injected through the AttentionWrapper are included as well.

            if hp.model_type == "multi-speaker":

                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx][0].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1[1] * 2 != shape2[1]:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    c, h = tf.split(cell, 2, 1)
                    decoder_init_state[idx] = LSTMStateTuple(c, h)

                decoder_init_state = tuple(decoder_init_state)

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size was not set, so the attention output equals the context vector.

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                                hp.dec_prenet_sizes,
                                                hp.dropout_prob,
                                                hp.inference_prenet_dropout)

            dec_outputs_cell = OutputProjectionWrapper(
                dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

            if is_training:
                helper = TacoTrainingHelper(
                    mel_targets, hp.num_mels,
                    hp.reduction_factor)  # inputs was only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs, _), final_decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                    maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters=200

            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
                [batch_size, -1, hp.num_mels
                 ])  # [N,iters,400] -> [N,5*iters,80]
            stop_token_outputs = tf.reshape(
                decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
                [batch_size, -1])  # [N, r*iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(hp.postnet_num_layers):
                activation = tf.nn.tanh if i != (hp.postnet_num_layers -
                                                 1) else None
                x = tf.layers.conv1d(x,
                                     filters=hp.postnet_channels,
                                     kernel_size=hp.postnet_kernel_size,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq,
                name='linear_spectogram_projection')  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state
            self.stop_token_targets = stop_token_targets
            self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            log('    encoder conv out:               %d' %
                encoder_conv_output.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    decoder prenet lstm concat out :        %d' %
                dec_prenet_outputs.output_size)
            log('    decoder cell out:         %d' %
                dec_outputs_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder mel out:    %d' % decoder_mel_outputs.shape[-1])
            log('    mel out:    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
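
The decoder above packs r mel frames plus r stop-token logits into every decoder step and then unpacks them with two reshapes. A small numpy sketch of that bookkeeping, using the values mentioned in the comments (num_mels=80, reduction_factor=5) purely as an assumption:

    import numpy as np

    N, iters, num_mels, r = 2, 10, 80, 5
    decoder_outputs = np.random.randn(N, iters, (num_mels + 1) * r).astype(np.float32)

    # the first num_mels*r channels hold r stacked mel frames -> [N, r*iters, num_mels]
    decoder_mel_outputs = decoder_outputs[:, :, :num_mels * r].reshape(N, -1, num_mels)
    # the remaining r channels are the per-frame stop-token logits -> [N, r*iters]
    stop_token_outputs = decoder_outputs[:, :, num_mels * r:].reshape(N, -1)

    assert decoder_mel_outputs.shape == (N, r * iters, num_mels)
    assert stop_token_outputs.shape == (N, r * iters)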
Beispiel #23
0
    def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
            mel_targets=None, linear_targets=None, loss_coeff=None,
            rnn_decoder_test_mode=False, is_randomly_initialized=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            self.batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

            # Encoder
            encoder_outputs = conv_and_lstm(
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units=hp.encoder_lstm_units,
                is_training=is_training,
                scope='encoder')  # [N, T_in, 512]

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool, shape=(), name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32, shape=[None, None, None], name="manual_alignments",
            )

            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
                LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 128]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                concat_cell,
                LSTMBlockCell(hp.decoder_lstm_units),
                LSTMBlockCell(hp.decoder_lstm_units)
            ], state_is_tuple=True)  # [N, T_in, 1024]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)
            (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry                                [N, T_out, M]
            decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels])

            # Postnet: predicts a residual
            postnet_outputs = postnet(
                decoder_outputs,
                layers=hp.postnet_conv_layers,
                conv_width=hp.postnet_conv_width,
                channels=hp.postnet_conv_channels,
                is_training=is_training)
            mel_outputs = decoder_outputs + postnet_outputs

            # Convert to linear using a similar architecture as the encoder:
            expand_outputs = conv_and_lstm(
                mel_outputs,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                lstm_units=hp.expand_lstm_units,
                is_training=is_training,
                scope='expand')  # [N, T_in, 512]
            linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_outputs = decoder_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  expand out:              %d' % expand_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
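
The alignments are collected once per decoder step, so alignment_history.stack() yields a [T_dec, N, T_enc] tensor and the transpose reorders it to [N, T_enc, T_dec], the layout usually plotted. A numpy sketch of the same permutation (all sizes invented for illustration):

    import numpy as np

    dec_steps, batch, enc_steps = 120, 2, 60
    stacked = np.random.rand(dec_steps, batch, enc_steps)  # like alignment_history.stack()

    alignments = np.transpose(stacked, [1, 2, 0])  # -> [N, T_enc, T_dec]
    assert alignments.shape == (batch, enc_steps, dec_steps)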
Beispiel #24
0
    def __init__(self, data_dirs, hparams, config, batches_per_group,
                 data_type, batch_size):
        super(DataFeederTacotron, self).__init__()

        self._hp = hparams
        self._step = 0
        self._offset = defaultdict(lambda: 2)
        self._batches_per_group = batches_per_group

        self.rng = np.random.RandomState(
            config.random_seed)  # random number generator
        self.data_type = data_type
        self.batch_size = batch_size

        self.min_tokens = hparams['min_tokens']  # 30
        self.min_n_frame = hparams['reduction_factor'] * hparams[
            'min_iters']  # 5*30
        self.max_n_frame = hparams['reduction_factor'] * hparams[
            'max_iters'] - hparams['reduction_factor']  # 5*200 - 5

        # Load metadata:
        self.path_dict = get_path_dict(
            data_dirs,
            self._hp,
            config,
            self.data_type,
            n_test=self.batch_size,
            rng=self.rng)  # data_dirs: ['datasets/moon\\data']

        self.data_dirs = list(self.path_dict.keys())  # ['datasets/moon\\data']
        self.data_dir_to_id = {
            data_dir: idx
            for idx, data_dir in enumerate(self.data_dirs)
        }  # {'datasets/moon\\data': 0}

        data_weight = {data_dir: 1.
                       for data_dir in self.data_dirs
                       }  # {'datasets/moon\\data': 1.0}

        weight_Z = sum(data_weight.values())

        self.data_ratio = {
            data_dir: weight / weight_Z
            for data_dir, weight in data_weight.items()
        }
        self.is_multi_speaker = len(self.data_dirs) > 1

        log("=" * 40)
        log(pprint.pformat(self.data_ratio, indent=4))
        log("=" * 40)

        if self.data_type == 'test':
            examples = []
            while True:
                for data_dir in self.data_dirs:
                    examples.append(self._get_next_example(data_dir))
                    # print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                    if len(examples) >= self.batch_size:
                        break
                if len(examples) >= self.batch_size:
                    break

            # at test time, the same examples are reused over and over
            self.static_batches = [
                examples for _ in range(self._batches_per_group)
            ]  # [examples, examples, ..., examples] <--- each example holds 2 pieces of data.

        else:
            self.static_batches = None

        # Read a group of examples:
        n = self.batch_size  # 32
        r = self._hp[
            'reduction_factor']  # 4 or 5; was used when computing min_n_frame and max_n_frame
        start = time.time()

        if self.static_batches is not None:  # 'test' mode uses static_batches, which were already built in __init__.
            batches = self.static_batches
        else:  # 'train'
            examples = []
            for data_dir in self.data_dirs:
                if self._hp['initial_data_greedy']:
                    if self._step < self._hp['initial_phase_step'] and any(
                            "krbook" in data_dir
                            for data_dir in self.data_dirs):
                        data_dir = [
                            data_dir for data_dir in self.data_dirs
                            if "krbook" in data_dir
                        ][0]

                if self._step < self._hp[
                        'initial_phase_step']:  # 'initial_phase_step': 8000
                    example = [
                        self._get_next_example(data_dir) for _ in range(
                            int(n * self._batches_per_group //
                                len(self.data_dirs)))
                    ]  # builds _batches_per_group (8 or 32) batches worth of data; each batch has size 2 or 32
                else:
                    example = [
                        self._get_next_example(data_dir) for _ in range(
                            int(n * self._batches_per_group *
                                self.data_ratio[data_dir]))
                    ]
                examples.extend(example)
            examples.sort(key=lambda x: x[-1])  # the last element is len(linear_target), so sort by it

        self.len = np.shape(examples)[0]
        examples_len = len(examples)
        self.input_data = [examples[i][0] for i in range(examples_len)]
        self.loss_coeff = [examples[i][1] for i in range(examples_len)]
        self.mel_target = [examples[i][2] for i in range(examples_len)]
        self.linear_target = [examples[i][3] for i in range(examples_len)]
        self.stop_token_target = [examples[i][4] for i in range(examples_len)]
        if self.is_multi_speaker:
            self.id = [examples[i][5] for i in range(examples_len)]
            self.linear_target_len = [
                examples[i][6] for i in range(examples_len)
            ]
        else:
            self.linear_target_len = [
                examples[i][5] for i in range(examples_len)
            ]
        log('Generated %d batches of size %d in %.03f sec' %
            (len(examples) // n, n, time.time() - start))
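
After the initial phase, each data directory contributes int(n * batches_per_group * data_ratio[data_dir]) examples per group. A plain-Python sketch of how the per-directory weights become draw counts (the directory names and weights below are made up):

    batch_size = 32
    batches_per_group = 8
    data_weight = {'datasets/moon/data': 1.0, 'datasets/son/data': 1.0}

    weight_Z = sum(data_weight.values())
    data_ratio = {d: w / weight_Z for d, w in data_weight.items()}
    draws = {d: int(batch_size * batches_per_group * ratio) for d, ratio in data_ratio.items()}

    print(data_ratio)  # {'datasets/moon/data': 0.5, 'datasets/son/data': 0.5}
    print(draws)       # 128 examples drawn from each directory per group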
Beispiel #25
0
    def initialize(self, inputs, input_lengths, mel_targets=None):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder
            enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training)
            # The paper doesn't specify what to do with the final encoder state,
            # so we send it to the attention mechanism as the source state
            # (a direct link between source and target cells)
            encoder_outputs, encoder_states = bidirectional_LSTM(
                enc_conv_outputs,
                input_lengths,
                'encoder_LSTM',
                is_training=is_training)

            # DecoderWrapper
            decoder_cell = TacotronDecoderWrapper(
                unidirectional_LSTM(is_training,
                                    layers=hp.num_decoder_layers,
                                    size=512), is_training)

            # AttentionWrapper on top of TacotronDecoderWrapper
            attention_decoder = AttentionWrapper(
                decoder_cell,
                LocationBasedAttention(hp.attention_dim, encoder_outputs),
                alignment_history=True,
                output_attention=False,
                name='attention_decoder_wrapper')

            # Pass the encoder final state, repeated num_decoder_layers times, to the num_decoder_layers-layer decoder
            decoder_init_state = attention_decoder.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=tuple(
                    encoder_states for _ in range(hp.num_decoder_layers)))

            # Define the helper for our decoder
            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            # We"ll only limit decoder time steps during inference (consult hparams.py to modify the value)
            max_iterations = None if is_training else hp.max_iters

            # Decode
            (decoder_output,
             _), final_decoder_state, self.stop_error = dynamic_decode(
                 CustomDecoder(attention_decoder, helper, decoder_init_state),
                 impute_finished=True,
                 maximum_iterations=max_iterations)

            # Compute residual using post-net
            residual = postnet(decoder_output, is_training)

            # Project residual to same dimension as mel spectrogram
            proj_dim = hp.num_mels
            projected_residual = projection(residual,
                                            shape=proj_dim,
                                            scope='residual_projection')

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            # Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(
                enc_conv_outputs.shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
Beispiel #26
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''
        Initializes the model for inference.
        :param inputs: [N, T_in], where N is batch_size, T_in is the number of steps in the input time series, and the values are character IDs
        :param input_lengths: [N], where N is batch_size and the values are the lengths of each input sequence
        :param mel_targets: [N, T_out, M], where N is batch_size, T_out is the number of output steps, M is num_mels, and the values are entries of the mel spectrogram
        :param linear_targets: [N, T_out, F], where N is batch_size, T_out is the number of output steps, F is num_freq, and the values are entries of the linear spectrogram
        :return:
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparam

            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N,T_in,embed_depth=256]

            # encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depth)  # [N,T_in,prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs, input_lengths, is_training,
                hp.encoder_depth)  # [N,T_in,encoder_depth=256]

            # attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_depth, encoder_outputs)

            # decoder
            multi_rnn_cell = MultiRNNCell(
                [
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N,T_in,decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step)
            decoder_cell = TacotronDecoderWrapper(is_training,
                                                  attention_mechanism,
                                                  multi_rnn_cell)

            if is_training:
                helper = TacoTrainHelper(inputs, mel_targets, hp.num_mels,
                                         hp.outputs_per_steps)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_steps)

            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(
                     OutputProjectionWrapper(
                         decoder_cell, hp.num_mels * hp.outputs_per_steps),
                     helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N,T_out/r,M*r]

            # Reshape outputs so that each entry corresponds to one output frame
            mel_outputs = tf.reshape(
                decoder_outputs, [batch_size, -1, hp.num_mels])  # [N,T_out,M]

            post_outputs = post_cbhg(
                mel_outputs, hp.num_mels, is_training,
                hp.postnet_depth)  # [N,T_out,postnet_depth=256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N,T_out,F]

            # Get alignment information from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('embedding: %d' % embedded_inputs.shape[-1])
            log('prenet out: %d' % prenet_outputs.shape[-1])
            log('encoder out: %d' % encoder_outputs.shape[-1])
            log('decoder out (%d frames): %d' %
                (hp.outputs_per_steps, decoder_outputs.shape[-1]))
            log('decoder out (1 frame): %d' % mel_outputs.shape[-1])
            log('postnet out: %d' % post_outputs.shape[-1])
            log('linear out: %d' % linear_outputs.shape[-1])
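
The last dense layer projects the post-CBHG features to hp.num_freq linear-spectrogram bins; the value 1025 seen in these repos corresponds, under the usual STFT convention, to n_fft // 2 + 1 for n_fft = 2048 (an assumption about the audio settings, not something stated in the code above):

    n_fft = 2048              # assumed analysis FFT size
    num_freq = n_fft // 2 + 1
    print(num_freq)           # 1025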
Beispiel #27
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   target_feats=None,
                   targets_length=None,
                   targets_stop_token=None,
                   is_training=False,
                   is_validation=False,
                   is_prediction=False):
        self.is_validation = is_validation
        self.is_prediction = is_prediction
        self.is_training = is_training
        with tf.variable_scope("centaur_encoder"):
            encoder_outputs, attention_bias = CentaurEncoder(
                is_training=is_training,
                src_vocab_size=self.params.src_vocab_size,
                embedding_size=self.params.embedding_size,
                output_size=self.params.output_size,
                conv_layers_num=self.params.encoder_conv_layers_num,
                cnn_dropout_prob=self.params.cnn_dropout_prob)(inputs)
        with tf.variable_scope("centaur_decoder"):
            (decoder_predictions, post_net_predictions, alignments,
             stop_token_logits, sequence_lengths, mag_pred,
             stop_token_predictions) = CentaurDecoder(
                num_mels=self.params.num_mels,
                num_freq=self.params.num_freq,
                conv_layers_num=self.params.decoder_conv_layers_num,
                reduction_factor=self.params.reduction_factor,
                decoder_hidden_size=self.params.decoder_hidden_size,
                prenet_hidden_size=self.params.prenet_hidden_size,
                attention_layers=self.params.attention_layers,
                attention_heads=self.params.attention_heads,
                window_size=self.params.window_size,
                attention_cnn_dropout_prob=self.params.attention_cnn_dropout_prob,
                kernel_size=self.params.kernel_size,
                is_training=is_training,
                is_prediction=is_prediction,
                is_validation=is_validation)(
                    targets=target_feats,
                    targets_length=targets_length,
                    encoder_outputs=encoder_outputs,
                    attention_bias=attention_bias,
                    batch_size_per_gpu=self.params.batch_size,
                    duration_max=self.params.max_iters)

        self.encoder_outputs = encoder_outputs
        self.alignments = alignments
        self.decoder_predictions = decoder_predictions
        self.post_net_predictions = post_net_predictions

        self.stop_token_predictions = stop_token_predictions
        self.mag_pred = mag_pred
        self.sequence_lengths = sequence_lengths
        self.inputs = inputs
        self.input_lengths = input_lengths
        self.target_feats = target_feats
        self.targets_stop_token = targets_stop_token
        self.targets_length = targets_length
        self.all_vars = tf.trainable_variables()

        log('Initialized Centaur model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Input:                    {}'.format(inputs.shape))
        log('  encoder out:              {}'.format(encoder_outputs.shape))
        log('  mel out:                  {}'.format(decoder_predictions.shape))
        log('  linear out:               {}'.format(mag_pred.shape))
        log('  <stop_token> out:         {}'.format(
            stop_token_predictions.shape))

        # 1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Centaur Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
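
The parameter count logged at the end is simply the sum of the element counts of all trainable variables. A minimal numpy sketch of the same arithmetic with invented variable shapes:

    import numpy as np

    # pretend these came from v.get_shape().as_list() for each trainable variable
    var_shapes = [[512, 256], [256], [1024, 1024], [1024]]

    num_params = np.sum([np.prod(s) for s in var_shapes])
    print('{:.3f} Million'.format(num_params / 1000000))  # 1.181 Million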
Beispiel #28
0
    def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, gta=False):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError('no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None:
            raise ValueError('Mel targets are provided without corresponding token_targets')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels, scope='encoder_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                           zoneout=hp.zoneout_rate, scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            # For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            # Decoder Parts
            # Attention Decoder Prenet
            prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet')
            # Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs,
                                                             mask_encoder=hp.mask_encoder,
                                                             memory_sequence_length=input_lengths,
                                                             smoothing=hp.smoothing)
            # Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units, zoneout=hp.zoneout_rate, scope='decoder_lstm')
            # Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
            # <stop_token> projection layer
            stop_projection = StopProjection(is_training, scope='stop_token_projection')

            # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(
                prenet,
                attention_mechanism,
                decoder_lstm,
                frame_projection,
                stop_projection,
                mask_finished=hp.mask_finished)

            # Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets,
                                                 hp.num_mels, hp.outputs_per_step, hp.teacher_forcing_ratio)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            # initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            # Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            # Decode
            (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                impute_finished=hp.impute_finished,
                maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

            # Postnet
            postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels, scope='postnet_convolutions')

            # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            # Grab alignments from the final decoder state
            alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
Beispiel #29
0
    def __init__(self, coordinator, data_dirs,
            hparams, config, batches_per_group, data_type, batch_size):
        super(DataFeeder, self).__init__()

        self._coord = coordinator
        self._hp = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._step = 0
        self._offset = defaultdict(lambda: 2)
        self._batches_per_group = batches_per_group

        self.rng = np.random.RandomState(config.random_seed)
        self.data_type = data_type
        self.batch_size = batch_size

        self.min_tokens = hparams.min_tokens
        self.min_n_frame = hparams.reduction_factor * hparams.min_iters
        self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
        self.skip_path_filter = config.skip_path_filter

        # Load metadata:
        self.path_dict = get_path_dict(
                data_dirs, self._hp, config, self.data_type,
                n_test=self.batch_size, rng=self.rng)

        self.data_dirs = list(self.path_dict.keys())
        self.data_dir_to_id = {
                data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}

        data_weight = {
                data_dir: 1. for data_dir in self.data_dirs
        }

        if self._hp.main_data_greedy_factor > 0 and \
                any(main_data in data_dir for data_dir in self.data_dirs \
                                         for main_data in self._hp.main_data):
            for main_data in self._hp.main_data:
                for data_dir in self.data_dirs:
                    if main_data in data_dir:
                        data_weight[data_dir] += self._hp.main_data_greedy_factor

        weight_Z = sum(data_weight.values())
        self.data_ratio = {
                data_dir: weight / weight_Z for data_dir, weight in data_weight.items()
        }

        log("="*40)
        log(pprint.pformat(self.data_ratio, indent=4))
        log("="*40)

        #audio_paths = [path.replace("/data/", "/audio/"). \
        #        replace(".npz", ".wav") for path in self.data_paths]
        #duration = get_durations(audio_paths, print_detail=False)

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.

        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None], 'loss_coeff'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
        ]

        # Create queue for buffering data:
        dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

        self.is_multi_speaker = len(self.data_dirs) > 1

        if self.is_multi_speaker:
            # One speaker id per utterance; named 'speaker_id' rather than reusing
            # the 'inputs' name already taken by the text placeholder above.
            self._placeholders.append(
                    tf.placeholder(tf.int32, [None], 'speaker_id'))
            dtypes.append(tf.int32)

        # tf.FIFOQueue's first argument is its capacity: buffer up to 8 enqueued
        # batches during training and a single batch otherwise.
        num_worker = 8 if self.data_type == 'train' else 1
        queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue')

        self._enqueue_op = queue.enqueue(self._placeholders)

        if self.is_multi_speaker:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue()
        else:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.loss_coeff.set_shape(self._placeholders[2].shape)
        self.mel_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)

        if self.is_multi_speaker:
            self.speaker_id.set_shape(self._placeholders[5].shape)
        else:
            self.speaker_id = None

        if self.data_type == 'test':
            # In test mode, draw examples round-robin from the data dirs until one
            # batch is filled, then reuse that same batch for every group.
            examples = []
            while True:
                for data_dir in self.data_dirs:
                    examples.append(self._get_next_example(data_dir))
                    #print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                    if len(examples) >= self.batch_size:
                        break
                if len(examples) >= self.batch_size:
                    break
            self.static_batches = [examples for _ in range(self._batches_per_group)]

        else:
            self.static_batches = None
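
The constructor above builds the classic TF1 placeholder-and-FIFOQueue pattern: a background thread pushes prepared batches through _enqueue_op while the training graph reads the dequeued tensors (inputs, input_lengths, loss_coeff, mel_targets, linear_targets, and optionally speaker_id). The sketch below shows one way such a feeder is typically driven. It is a minimal illustration only; the _session attribute, the start() call, and the hparams/config objects are assumptions about the surrounding training code rather than anything guaranteed by this listing.

    # Minimal usage sketch -- assumes DataFeeder subclasses threading.Thread and
    # that its run loop keeps enqueueing batches through self._session.
    import tensorflow as tf

    coord = tf.train.Coordinator()
    feeder = DataFeeder(coord, ['./datasets/krbook'],   # hypothetical data dir
                        hparams, config,                # assumed to exist in the project
                        batches_per_group=32,
                        data_type='train', batch_size=32)

    with tf.Session() as sess:
        feeder._session = sess   # assumption: the feeder enqueues via this session
        feeder.start()           # background thread starts filling the FIFOQueue
        # The training graph consumes the dequeued tensors directly:
        batch_inputs, batch_mels = sess.run([feeder.inputs, feeder.mel_targets])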
Example #30
0
	def initialize(self, inputs, input_lengths, mel_targets=None, gta=False):
		"""
		Initializes the model for inference

		Sets the "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			  of steps in the output time series, M is num_mels, and values are entries in the mel
			  spectrogram. Only needed for training and GTA synthesis.
			- gta: boolean; when True, mel_targets are teacher-forced into the decoder while the network
			  otherwise runs in inference mode (Ground Truth Aligned synthesis).
		"""
		with tf.variable_scope('inference') as scope:
			is_training = mel_targets is not None and not gta
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams

			# Embeddings
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
				initializer=tf.contrib.layers.xavier_initializer())
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

			#Encoder
			enc_conv_outputs = enc_conv_layers(embedded_inputs, is_training,
				kernel_size=hp.enc_conv_kernel_size, channels=hp.enc_conv_channels)    
			#Paper doesn't specify what to do with final encoder state
			#So we will simply drop it
			encoder_outputs, encoder_states = bidirectional_LSTM(enc_conv_outputs, input_lengths,
				'encoder_LSTM', is_training=is_training, size=hp.encoder_lstm_units,
				zoneout=hp.zoneout_rate)     

			#Attention
			attention_cell = AttentionWrapper(
				DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism
					zoneout_factor_cell=hp.zoneout_rate,							#based on original tacotron architecture
					zoneout_factor_output=hp.zoneout_rate), is_training),
				LocationSensitiveAttention(hp.attention_dim, encoder_outputs),
				alignment_history=True,
				output_attention=False,
				name='attention_cell')

			#Concat Prenet output with context vector
			concat_cell = ConcatPrenetAndAttentionWrapper(attention_cell)

			#Decoder layers (attention pre-net + 2 unidirectional LSTM Cells)
			decoder_cell = unidirectional_LSTM(concat_cell, is_training,
				layers=hp.decoder_layers, size=hp.decoder_lstm_units,
				zoneout=hp.zoneout_rate)

			#Concat LSTM output with context vector
			concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell)

			#Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation)
			output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step)

			#Define the helper for our decoder
			if is_training or gta:
				self.helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

			#We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
			max_iterations = None if is_training else hp.max_iters

			#initial decoder state
			decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Decode
			(decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode(
				CustomDecoder(output_cell, self.helper, decoder_init_state),
				impute_finished=True, #Cut out padded parts (enabled)
				maximum_iterations=max_iterations)

			# Reshape outputs to be one output per entry 
			decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels])

			#Compute residual using post-net
			residual = postnet(decoder_output, is_training,
				kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels)

			#Project residual to same dimension as mel spectrogram
			projected_residual = projection(residual,
				shape=hp.num_mels,
				scope='residual_projection')

			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.mel_outputs = mel_outputs
			self.mel_targets = mel_targets
			log('Initialized Tacotron model. Dimensions: ')
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_outputs.shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  mel out:                  {}'.format(mel_outputs.shape))
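
The initialize() method above wires the full Tacotron-2-style graph from the character embeddings through the post-net, and its docstring spells out the tensor shapes it expects. A hedged inference sketch follows; the Tacotron wrapper class, the hparams object, and the text_to_sequence() helper are illustrative assumptions rather than names confirmed by this listing, and only the initialize() signature and the mel_outputs attribute come from the code above.

    # Hedged inference sketch -- Tacotron, hparams and text_to_sequence are
    # placeholder names for the surrounding project code.
    import numpy as np
    import tensorflow as tf

    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    input_lengths = tf.placeholder(tf.int32, [None], name='input_lengths')

    model = Tacotron(hparams)                # assumed wrapper exposing initialize()
    model.initialize(inputs, input_lengths)  # no mel_targets, gta=False -> inference mode

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())   # or restore a trained checkpoint
        seq = np.asarray(text_to_sequence('Hello world.'), dtype=np.int32)
        mel = sess.run(model.mel_outputs, feed_dict={
                inputs: [seq],
                input_lengths: [len(seq)],
        })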