Example #1
def tacotron_synthesize(args, hparams, checkpoint, text=None, cwd=None):
    output_dir = 'tacotron_' + args.output_dir
    step = ''

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
        step = checkpoint_path.split('/')[-1].split('-')[-1].strip()
        #/home/spurs/tts/project/Tacotron-2/logs-Tacotron-2/taco_pretrained/tacotron_model.ckpt-61000
    except Exception as e:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint)) from e

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, hparams, text, step,
                        cwd)
    elif args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir, hparams)
    else:
        run_live(args, checkpoint_path, hparams)
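
A hedged usage sketch for tacotron_synthesize above. The attribute names on args are assumptions inferred from what this function and its callees read (output_dir, mode, model, mels_dir, GTA, input_dir); the original repo's actual CLI may differ, and hparams and the checkpoint directory must come from the surrounding project.

#Hypothetical driver, not part of the original repo.
import argparse

args = argparse.Namespace(
    output_dir='output/',             #becomes 'tacotron_output/'
    mode='eval',                      #'eval' | 'synthesis' | anything else runs live
    model='Tacotron-2',
    mels_dir='tacotron_output/eval',  #checked against eval_dir in run_eval
    GTA='True',                       #parsed as a string by run_synthesis
    input_dir='training_data/',
)
#eval_dir = tacotron_synthesize(args, hparams, 'logs-Tacotron-2/taco_pretrained',
#                               text=['Hello world.'])
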
Example #2
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self.batch_size
        r = self._hp.reduction_factor

        if self.static_batches is not None:
            batches = self.static_batches
        else:
            examples = []
            for data_dir in self.data_dirs:
                if self._hp.initial_data_greedy:
                    if self._step < self._hp.initial_phase_step and \
                            any("krbook" in d for d in self.data_dirs):
                        data_dir = [d for d in self.data_dirs if "krbook" in d][0]

                if self._step < self._hp.initial_phase_step:
                    example = [self._get_next_example(data_dir) \
                            for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))]
                else:
                    example = [self._get_next_example(data_dir) \
                            for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))]
                examples.extend(example)
            examples.sort(key=lambda x: x[-1])

            batches = [examples[i:i+n] for i in range(0, len(examples), n)]
            self.rng.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._step += 1
Example #3
    def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
        log('Constructing model: %s' % model_name)
        #Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (1, None), name='inputs')
        input_lengths = tf.placeholder(tf.int32, (1,), name='input_lengths')

        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels),
                                 name='mel_targets')
        split_infos = tf.placeholder(tf.int32,
                                     shape=(hparams.tacotron_num_gpus, None),
                                     name='split_infos')
        with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
            self.model = create_model(model_name, hparams)
            if gta:
                self.model.initialize(inputs, input_lengths, targets, gta=gta)
            else:
                self.model.initialize(inputs, input_lengths)

            self.mel_outputs = self.model.mel_outputs
            if hparams.predict_linear:
                self.linear_outputs = self.model.linear_outputs

            self.alignments = self.model.alignments
            self.stop_token_prediction = self.model.stop_token_prediction
            self.targets = targets

        self.gta = gta
        self._hparams = hparams
        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.targets = targets
        self.split_infos = split_infos

        log('Loading checkpoint: %s' % checkpoint_path)
        #Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
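
For reference, a minimal sketch of a single-utterance synthesis call against the graph loaded above. The text_to_sequence helper and its module path are assumptions based on the usual Tacotron-2 layout; they do not appear in this example.

#Hypothetical usage, not part of the original class.
import numpy as np
from tacotron.utils.text import text_to_sequence  #assumed module path

def synthesize_one(synth, text, cleaner_names=('english_cleaners',)):
    seq = np.asarray(text_to_sequence(text, list(cleaner_names)), dtype=np.int32)
    feed = {
        synth.inputs: [seq],              #shape (1, T_in), matches the placeholder
        synth.input_lengths: [len(seq)],  #shape (1,)
    }
    return synth.session.run(synth.mel_outputs, feed_dict=feed)
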
Example #4
    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._offset = 0

        # Load metadata
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename),
                                     'mels')
        self._linear_dir = os.path.join(os.path.dirname(metadata_filename),
                                        'linear')
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = sum([int(x[4])
                         for x in self._metadata]) * frame_shift_ms / 3600
            log('Loaded metadata for {} examples ({:.2f} hours)'.format(
                len(self._metadata), hours))

        # Create placeholders for inputs and targets. Don't specify batch size because we want
        # to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
            tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
            tf.placeholder(tf.float32,
                           shape=(None, None, hparams.num_mels),
                           name='mel_targets'),
            tf.placeholder(tf.int32, [None], 'mel_lengths'),
            tf.placeholder(tf.float32,
                           shape=(None, None),
                           name='token_targets'),
            tf.placeholder(tf.float32,
                           shape=(None, None, hparams.num_freq),
                           name='linear_targets'),
        ]

        # Create queue for buffering data
        queue = tf.FIFOQueue(
            8,
            [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32],
            name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths, self.token_targets, self.linear_targets = queue.dequeue(
        )
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.mel_lengths.set_shape(self._placeholders[3].shape)
        self.token_targets.set_shape(self._placeholders[4].shape)
        self.linear_targets.set_shape(self._placeholders[5].shape)
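
The placeholder -> enqueue -> dequeue -> set_shape pattern above can be exercised in isolation. A minimal, self-contained TF 1.x sketch with a toy tensor (not the Feeder's real shapes):

#Standalone illustration of the queue-feeding pattern (TF 1.x graph mode).
import numpy as np
import tensorflow as tf

ph = tf.placeholder(tf.int32, shape=(None, None), name='toy_inputs')
queue = tf.FIFOQueue(8, [tf.int32], name='toy_queue')
enqueue_op = queue.enqueue([ph])
dequeued = queue.dequeue()
dequeued.set_shape(ph.shape)  #dequeue() loses the static shape; restore it

with tf.Session() as sess:
    sess.run(enqueue_op, feed_dict={ph: np.arange(6).reshape(2, 3)})
    print(sess.run(dequeued))  #[[0 1 2] [3 4 5]]
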
Example #5
    def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
        log('Constructing model: %s' % model_name)
        #Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (1, None), name='inputs')
        input_lengths = tf.placeholder(tf.int32, (1,), name='input_lengths')

        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels),
                                 name='mel_targets')
        target_lengths = tf.placeholder(tf.int32, (1,), name='target_length')
        gta = True  #GTA synthesis is forced on in this loader regardless of the caller's argument

        #initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None,
        # linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False,
        # is_evaluating=False)

        with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(inputs=inputs,
                                  input_lengths=input_lengths,
                                  mel_targets=targets,
                                  targets_lengths=target_lengths,
                                  gta=gta,
                                  is_evaluating=True)

            self.mel_outputs = self.model.mel_outputs
            self.alignments = self.model.alignments

        self._hparams = hparams

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.targets = targets
        self.target_lengths = target_lengths

        log('Loading checkpoint: %s' % checkpoint_path)
        #Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example #6
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step
        examples = [
            self._get_next_example() for i in range(n * _batches_per_group)
        ]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
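
The sort-then-slice above is simple length bucketing: each batch holds utterances of similar output length, so little padding is wasted. A tiny pure-Python illustration with toy data:

#Toy examples whose last element is the output length, as in _get_next_example.
import numpy as np

examples = [('a', 7), ('b', 3), ('c', 5), ('d', 4), ('e', 6), ('f', 2)]
n = 2  #batch size

examples.sort(key=lambda x: x[-1])  #group similar lengths together
batches = [examples[i:i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)          #keep the batch order random for training
#each batch now spans a narrow length range, e.g. [('f', 2), ('b', 3)]
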
Example #7
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    GTA = (args.GTA == 'True')
    synth_dir = os.path.join(output_dir, 'gta' if GTA else 'natural')

    #Create output path if it doesn't exist
    os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, gta=GTA)
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(
            len(metadata), hours))

    #Set inputs batch wise
    metadata = [
        metadata[i:i + hparams.tacotron_synthesis_batch_size]
        for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
            basenames = [
                os.path.basename(m).replace('.npy', '').replace('mel-', '')
                for m in mel_filenames
            ]
            mel_output_filenames, speaker_ids = synth.synthesize(
                texts, basenames, synth_dir, None, mel_filenames)

            for elems in zip(wav_filenames, mel_filenames,
                             mel_output_filenames, speaker_ids, texts):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return os.path.join(synth_dir, 'map.txt')
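
Each map.txt row written above is pipe-separated, with the column order fixed by the zip() call: wav path, target mel path, predicted mel path, speaker id, text. A hedged read-back sketch (the path below is illustrative):

def read_map(map_path):
    #yields one (wav, mel_target, mel_pred, speaker_id, text) tuple per line
    with open(map_path, encoding='utf-8') as f:
        for line in f:
            yield tuple(line.strip().split('|'))

#for wav, mel_t, mel_p, spk, txt in read_map('tacotron_output/gta/map.txt'):
#    ...
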
Example #8
def get_path_dict(
        data_dirs, hparams, config,
        data_type, n_test=None,
        rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(
                    get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
            
            new_items = [(path, n) for path, n, n_tokens in items \
                    if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                #keep only items whose path matches none of the blacklist tokens
                new_items = [item for item in new_items \
                        if all(check not in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'. \
                    format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unknown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
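
A quick sanity check of the blacklist filter's exclusion semantics: an item survives only when none of the blacklist tokens occur in its path (filenames below are hypothetical):

blacklists = [".0000.", ".0001."]
items = [("son.0000.npz", 10), ("son.0002.npz", 12)]
kept = [item for item in items if all(check not in item[0] for check in blacklists)]
assert kept == [("son.0002.npz", 12)]  #the blacklisted clip is dropped
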
Example #9
def run_eval(args, checkpoint_path, output_dir, hparams, text, step, cwd):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    #Create output path if it doesn't exist
    #os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    #os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    log('Starting Synthesis')
    synth.synthesize(text, step, eval_dir, log_dir, None, cwd)

    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
Example #10
def run_live(args, checkpoint_path, hparams):
    #Log to Terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    #Generate fast greeting message
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    #Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)

        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. See you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
Example #11
def train(args, log_dir, hparams):
    log('\n#############################################################\n')
    log('Tacotron Train\n')
    log('#############################################################\n')
    tacotron_train(args, log_dir, hparams)
Example #12
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   mel_lengths=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   gta=False,
                   reference_mel=None):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear and linear_targets is None:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            #GTA is only used for predicting mels to train the WaveNet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)
            if hp.use_vae:
                if is_training:
                    reference_mel = mel_targets

                style_embeddings, mu, log_var = VAE(inputs=reference_mel,
                                                    input_lengths=mel_lengths,
                                                    filters=hp.filters,
                                                    kernel_size=(3, 3),
                                                    strides=(2, 2),
                                                    num_units=hp.vae_dim,
                                                    is_training=is_training,
                                                    scope='vae')

                self.mu = mu
                self.log_var = log_var
                style_embeddings = tf.layers.dense(style_embeddings,
                                                   hp.encoder_depth)
                style_embeddings = tf.expand_dims(style_embeddings, axis=1)
                style_embeddings = tf.tile(
                    style_embeddings,
                    [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 256]
                encoder_outputs = encoder_outputs + style_embeddings

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        kernel_size=hp.enc_conv_kernel_size,
                                        channels=hp.enc_conv_channels,
                                        scope='post_processing_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(
                    hp.num_freq,
                    scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.reference_mel = reference_mel
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.mel_lengths = mel_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
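
The VAE branch above produces one style vector per utterance, projects it to encoder depth, then tiles it across encoder timesteps before adding it to encoder_outputs. The same broadcast in plain numpy, with toy shapes:

import numpy as np

N, T_in, D = 2, 5, 256                            #toy batch, encoder steps, depth
encoder_outputs = np.zeros((N, T_in, D), dtype=np.float32)
style = np.ones((N, D), dtype=np.float32)         #one style vector per utterance
style = np.tile(style[:, None, :], (1, T_in, 1))  #-> [N, T_in, D]
combined = encoder_outputs + style
assert combined.shape == (N, T_in, D)
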
Example #13
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):

        hp = self._hparams
        batch_size = tf.shape(inputs)[0]
        gta = False  #GTA is force-disabled in this variant regardless of the caller's argument
        self.num_atten = 5

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        with tf.variable_scope('inference') as scope:
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            self.embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32)

            embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                     inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            self.enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism

            attention_mechanism = ForwardLocationSensitiveAttention(
                hp.attention_dim,
                self.encoder_outputs,
                hparams=hp,
                is_training=is_training or is_evaluating,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing)

            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_LSTM')
            #Frames Projection layer
            frame_projection = FrameProjection(
                hp.num_mels * hp.outputs_per_step,
                scope='linear_transform_projection')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets, hp,
                                                 gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp, input_lengths)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            self.decoder_output = tf.reshape(frames_prediction,
                                             [batch_size, -1, hp.num_mels])
            self.stop_token_prediction = tf.reshape(stop_token_prediction,
                                                    [batch_size, -1])

            if hp.clip_outputs:
                self.decoder_output = tf.minimum(
                    tf.maximum(self.decoder_output,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(self.decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            self.projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            self.mel_outputs = self.decoder_output + self.projected_residual

            if hp.clip_outputs:
                self.mel_outputs = tf.minimum(
                    tf.maximum(self.mel_outputs,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            if hp.predict_linear:
                # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                post_cbhg = CBHG(hp.cbhg_kernels,
                                 hp.cbhg_conv_channels,
                                 hp.cbhg_pool_size,
                                 [hp.cbhg_projection, hp.num_mels],
                                 hp.cbhg_projection_kernel_size,
                                 hp.cbhg_highwaynet_layers,
                                 hp.cbhg_highway_units,
                                 hp.cbhg_rnn_units,
                                 hp.batch_norm_position,
                                 is_training,
                                 name='CBHG_postnet')

                #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                self.post_outputs = post_cbhg(self.mel_outputs, None)

                #Linear projection of extracted features to make linear spectrogram
                linear_specs_projection = FrameProjection(
                    hp.num_freq, scope='cbhg_linear_specs_projection')

                #[batch_size, decoder_steps(linear_frames), num_freq]
                self.linear_outputs = linear_specs_projection(
                    self.post_outputs)

                if hp.clip_outputs:
                    self.linear_outputs = tf.minimum(
                        tf.maximum(self.linear_outputs,
                                   T2_output_range[0] - hp.lower_bound_decay),
                        T2_output_range[1])

            #Grab alignments from the final decoder state
            self.alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            log('initialisation done.')

        if is_training:
            self.ratio = self.helper._ratio

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.targets_lengths = targets_lengths
        self.stop_token_targets = stop_token_targets
        self.gta = gta
        self.all_vars = tf.trainable_variables()
        self.is_training = is_training
        self.is_evaluating = is_evaluating
        self.fine_tune_params = [
            v for v in self.all_vars
            if not ('inputs_embedding' in v.name or 'encoder_' in v.name)
        ]

        self.final_params = self.all_vars if not hp.tacotron_fine_tuning else self.fine_tune_params

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        log('  embedding:                {}'.format(embedded_inputs.shape))
        log('  enc conv out:             {}'.format(
            self.enc_conv_output_shape))
        log('  encoder out:              {}'.format(
            self.encoder_outputs.shape))
        log('  decoder out:              {}'.format(self.decoder_output.shape))
        log('  residual out:             {}'.format(residual.shape))
        log('  projected residual out:   {}'.format(
            self.projected_residual.shape))
        log('  mel out:                  {}'.format(self.mel_outputs.shape))
        if hp.predict_linear:
            log('  linear out:               {}'.format(
                self.linear_outputs.shape))

        log('  <stop_token> out:         {}'.format(
            self.stop_token_prediction.shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
        log('  Fine-tune parameters      {:.3f} Million.'.format(
            np.sum([
                np.prod(v.get_shape().as_list()) for v in self.fine_tune_params
            ]) / 1000000))
        log('  Final parameters          {:.3f} Million.'.format(
            np.sum(
                [np.prod(v.get_shape().as_list())
                 for v in self.final_params]) / 1000000))
Example #14
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   gta=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
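
On the alignment handling at the end of initialize: alignment_history.stack() yields [decoder_steps, N, encoder_steps], and the [1, 2, 0] transpose reorders it to [N, encoder_steps, decoder_steps], the layout used for alignment plots. In numpy terms, with toy shapes:

import numpy as np

T_out, N, T_in = 4, 2, 7
stacked = np.zeros((T_out, N, T_in))           #like alignment_history.stack()
alignments = np.transpose(stacked, (1, 2, 0))  #-> [N, T_in, T_out]
assert alignments.shape == (N, T_in, T_out)
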
Example #15
    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._train_offset = 0
        self._test_offset = 0

        # Load metadata
        self._mel_dir = os.path.dirname(metadata_filename)
        self._linear_dir = os.path.dirname(metadata_filename)
        dura = 0
        self._metadata = []
        with open(metadata_filename, encoding='utf-8') as f:
            for line in f:
                #audio-000001.npy|mel-000001.npy|46200|168|卡尔普陪外孙玩滑梯。|k a3 er3 p u3 p ei2 w ai4 s un1 w an2 h ua2 t i1 。
                line = line.strip().split('|')
                mel = line[1].strip()
                dura += int(line[3])
                pyin = line[-1].strip()
                self._metadata.append([mel, pyin])

            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            hours = dura * frame_shift_ms / 3600
            log('Loaded metadata for {} examples ({:.2f} hours)'.format(
                len(self._metadata), hours))

        self._train_meta = self._metadata
        log('Loaded {} training examples'.format(len(self._train_meta)))

        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        #Mark finished sequences with 1s
        self._token_pad = 1.

        # Create placeholders for inputs and targets. Don't specify batch size because we want
        # to be able to feed different batch sizes at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
            tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
            tf.placeholder(tf.float32,
                           shape=(None, None, hparams.num_mels),
                           name='mel_targets'),
            tf.placeholder(tf.float32,
                           shape=(None, None),
                           name='token_targets'),
            tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'),
        ]

        # Create queue for buffering data
        queue = tf.FIFOQueue(
            8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32],
            name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.targets_lengths = queue.dequeue(
        )

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.token_targets.set_shape(self._placeholders[3].shape)
        self.targets_lengths.set_shape(self._placeholders[4].shape)
Example #16
    def __init__(self, coordinator, data_dirs,
            hparams, config, batches_per_group, data_type, batch_size):
        super(DataFeeder, self).__init__()

        self._coord = coordinator
        self._hp = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._step = 0
        self._offset = defaultdict(lambda: 2)
        self._batches_per_group = batches_per_group

        self.rng = np.random.RandomState(config.random_seed)
        self.data_type = data_type
        self.batch_size = batch_size

        self.min_tokens = hparams.min_tokens
        self.min_n_frame = hparams.reduction_factor * hparams.min_iters
        self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
        self.skip_path_filter = config.skip_path_filter

        # Load metadata:
        self.path_dict = get_path_dict(
                data_dirs, self._hp, config, self.data_type,
                n_test=self.batch_size, rng=self.rng)

        self.data_dirs = list(self.path_dict.keys())
        self.data_dir_to_id = {
                data_dir: idx for idx, data_dir in enumerate(self.data_dirs)}

        data_weight = {
                data_dir: 1. for data_dir in self.data_dirs
        }

        if self._hp.main_data_greedy_factor > 0 and \
                any(main_data in data_dir for data_dir in self.data_dirs \
                                         for main_data in self._hp.main_data):
            for main_data in self._hp.main_data:
                for data_dir in self.data_dirs:
                    if main_data in data_dir:
                        data_weight[data_dir] += self._hp.main_data_greedy_factor

        weight_Z = sum(data_weight.values())
        self.data_ratio = {
                data_dir: weight / weight_Z for data_dir, weight in data_weight.items()
        }

        log("="*40)
        log(pprint.pformat(self.data_ratio, indent=4))
        log("="*40)

        #audio_paths = [path.replace("/data/", "/audio/"). \
        #        replace(".npz", ".wav") for path in self.data_paths]
        #duration = get_durations(audio_paths, print_detail=False)

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.

        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None], 'loss_coeff'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'),
        ]

        # Create queue for buffering data:
        dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

        self.is_multi_speaker = len(self.data_dirs) > 1

        if self.is_multi_speaker:
            self._placeholders.append(
                    tf.placeholder(tf.int32, [None], 'speaker_id'))
            dtypes.append(tf.int32)

        num_worker = 8 if self.data_type == 'train' else 1
        queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue')

        self._enqueue_op = queue.enqueue(self._placeholders)

        if self.is_multi_speaker:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue()
        else:
            self.inputs, self.input_lengths, self.loss_coeff, \
                    self.mel_targets, self.linear_targets = queue.dequeue()

        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.loss_coeff.set_shape(self._placeholders[2].shape)
        self.mel_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)

        if self.is_multi_speaker:
            self.speaker_id.set_shape(self._placeholders[5].shape)
        else:
            self.speaker_id = None

        if self.data_type == 'test':
            examples = []
            while True:
                for data_dir in self.data_dirs:
                    examples.append(self._get_next_example(data_dir))
                    #print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                    if len(examples) >= self.batch_size:
                        break
                if len(examples) >= self.batch_size:
                    break
            self.static_batches = [examples for _ in range(self._batches_per_group)]

        else:
            self.static_batches = None
Example #17
    def initialize(self, inputs, input_lengths, mel_targets=None,
                   stop_token_targets=None, linear_targets=None,
                   targets_lengths=None, gta=False, global_step=None,
                   is_training=False, is_evaluating=False):
        """
        Initializes the model for inference

        sets "mel_outputs" and "alignments" fields.

        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError('no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError('Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
        if gta and linear_targets is not None:
            raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
        if is_training and is_evaluating:
            raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            if hp.tacotron_curriculum_dropout_rate:
                assert global_step is not None
                self.dropout_rate = self._curriculum_dropout(
                    hp.tacotron_dropout_rate,
                    hp.tacotron_curriculum_dropout_gamma,
                    global_step)
            else:
                self.dropout_rate = tf.convert_to_tensor(
                    hp.tacotron_dropout_rate)

            if hp.tacotron_curriculum_zoneout_rate:
                assert global_step is not None
                self.zoneout_rate = self._curriculum_dropout(
                    hp.tacotron_zoneout_rate,
                    hp.tacotron_curriculum_zoneout_gamma,
                    global_step)
            else:
                self.zoneout_rate = tf.convert_to_tensor(
                    hp.tacotron_zoneout_rate)
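            # _curriculum_dropout (defined elsewhere in the class) presumably anneals
            # the rate as a function of global_step, in the spirit of Curriculum
            # Dropout (Morerio et al., 2017); the exact schedule is not shown here.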

            assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            #GTA is only used for predicting mels to train the WaveNet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hp.enc_conv_kernel_size,
                                    hp.enc_conv_channels,
                                    hp.enc_conv_num_layers,
                                    self.dropout_rate,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training, size=hp.encoder_lstm_units,
                    zoneout=self.zoneout_rate, scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape


            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=self.dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
                mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                size=hp.decoder_lstm_units, zoneout=self.zoneout_rate, scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(
                prenet,
                attention_mechanism,
                decoder_lstm,
                frame_projection,
                stop_projection)


            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                if mel_targets is not None and stop_token_targets is not None:
                    self.helper = TacoTrainingHelper(
                        batch_size, mel_targets, stop_token_targets, hp, gta,
                        is_evaluating, global_step)
                else:
                    if gta:
                        log('Warning: gta set to True but mel_targets or '
                            + 'stop_token_targets not provided'
                            + ', falling back to natural inference')
                    self.helper = TacoTestHelper(batch_size, hp)
            else:
                self.helper = TacoTestHelper(batch_size, hp)


            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                impute_finished=False,
                maximum_iterations=max_iters,
                swap_memory=hp.tacotron_swap_with_cpu)


            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])


            #Postnet
            postnet = Postnet(is_training,
                              hp.postnet_kernel_size,
                              hp.postnet_channels,
                              hp.postnet_num_layers,
                              self.dropout_rate,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
            projected_residual = residual_projection(residual)


            #Compute the mel spectrogram
            mel_outputs = tf.add(decoder_output, projected_residual,
                                 name='mel_outputs')


            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        hp.enc_conv_kernel_size,
                                        hp.enc_conv_channels,
                                        hp.enc_conv_num_layers,
                                        self.dropout_rate,
                                        scope='post_processing_convolutions'),
                    EncoderRNN(is_training, size=hp.encoder_lstm_units,
                        zoneout=self.zoneout_rate, scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
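            #alignment_history.stack() is [decoder_steps, batch_size, encoder_steps];
            #transposing by [1, 2, 0] yields [batch_size, encoder_steps, decoder_steps]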
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0],
                name='alignments')

            self.optimize = None
            self.loss = None
            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))
Exemple #18
0
import tensorflow as tf
from tacotron.utils.symbols import symbols
from tacotron.utils.infolog import log
from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper
from tacotron.models.modules import *
from tacotron.models.zoneout_LSTM import ZoneoutLSTMCell
from tensorflow.contrib.seq2seq import dynamic_decode
from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from tacotron.models.custom_decoder import CustomDecoder

# Compare the (major, minor) version tuple; concatenating the digits (e.g.
# int('1.4.12'.replace('.', '')) == 1412 > 160) misclassifies some versions.
if tuple(int(v) for v in tf.__version__.split('.')[:2]) < (1, 6):
    log('using old attention Tensorflow structure (1.5.0 and earlier)')
    from tacotron.models.attention_old import LocationSensitiveAttention
else:
    log('using new attention Tensorflow structure (1.6.0 and later)')
    from tacotron.models.attention import LocationSensitiveAttention


class Tacotron():
    """Tacotron-2 feature prediction model."""
    def __init__(self, hparams):
        self._hparams = hparams

    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   gta=False):
        """