Example #1
def tacotron_synthesize(args, hparams, checkpoint):
    output_dir = 'tacotron_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        # Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:
            raise AssertionError(
                'Cannot restore checkpoint: {}, did you train a model?'.format(
                    checkpoint))
        try:
            # Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(
                checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except:
            raise RuntimeError(
                'Failed to load checkpoint at {}'.format(checkpoint))
    return run_synthesis(args, checkpoint_path, output_dir, hparams)
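A minimal usage sketch (not from the repository): tacotron_synthesize expects an argparse-style args object with an output_dir field, the repo's hparams object, and a checkpoint directory. The flag names and default paths below are placeholders.

# Hypothetical wiring, for illustration only; the repo's own synthesize.py builds args itself.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--output_dir', default='output/')
parser.add_argument('--checkpoint', default='logs-Tacotron-2/taco_pretrained/')
args = parser.parse_args([])  # use defaults here; a real CLI would call parse_args()

# hparams would come from the repo's hparams module; tacotron_synthesize then resolves the
# checkpoint state and hands off to run_synthesis:
# tacotron_synthesize(args, hparams, args.checkpoint)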
Example #2
    def load(self, checkpoint_path, hparams, model_name='WaveNet'):
        log('Constructing model: {}'.format(model_name))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=[1, None, hparams.num_mels],
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None

        with tf.variable_scope('model') as scope:
            self.model = Wavenet(hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=None,
                                  synthesis_length=self.synthesis_length)

            self._hparams = hparams
            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            load_averaged_model(self.session, sh_saver, checkpoint_path)
Example #3
def run_inference(checkpoint_path, output_dir, hparams, sentences):
    inference_dir = os.path.join(output_dir, 'inference')
    log_dir = os.path.join(output_dir, 'logs-inference')
    os.makedirs(inference_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
    log('running inference..')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, GTA=False)
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size] for i in range(
            0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]
    ### save synthesized info to map.txt and folder
    with open(os.path.join(inference_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(text))
            ]
            mel_filename = synth.synthesize(text, basenames, inference_dir,
                                            log_dir, None)
            file.write('{}|{}\n'.format(text, mel_filename))
    log('synthesized mel spectrograms of \"{}\" at {}'.format(
        sentences, inference_dir))

    return inference_dir
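The sentence batching above is plain list slicing; a self-contained illustration of the same idiom, with an arbitrary batch size standing in for hparams.tacotron_synthesis_batch_size:

sentences = ['s1', 's2', 's3', 's4', 's5']
batch_size = 2  # stand-in for hparams.tacotron_synthesis_batch_size
batches = [sentences[i:i + batch_size] for i in range(0, len(sentences), batch_size)]
print(batches)  # [['s1', 's2'], ['s3', 's4'], ['s5']] -- the last batch may be smaller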
Example #4
    def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir,
                   log_dir):
        hparams = self._hparams
        local_cond, global_cond = self._check_conditions()
        #Get True length of audio to be synthesized: audio_len = mel_len * hop_size
        audio_lengths = [
            len(x) * get_hop_size(self._hparams) for x in mel_spectrograms
        ]
        #Prepare local condition batch
        maxlen = max([len(x) for x in mel_spectrograms])
        #[-max, max] or [0,max]
        T2_output_range = (
            -self._hparams.max_abs_value,
            self._hparams.max_abs_value) if self._hparams.symmetric_mels else (
                0, self._hparams.max_abs_value)
        c_batch = np.stack([
            _pad_inputs(x, maxlen, _pad=T2_output_range[0])
            for x in mel_spectrograms
        ]).astype(np.float32)
        if self._hparams.normalize_for_wavenet:
            #rerange to [0, 1]
            c_batch = np.interp(c_batch, T2_output_range, (0, 1))
        g = None if speaker_ids is None else np.asarray(
            speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
        feed_dict = {}
        if local_cond:
            feed_dict[self.local_conditions] = c_batch
        else:
            feed_dict[self.synthesis_length] = 100
        if global_cond:
            feed_dict[self.global_conditions] = g

        #Generate wavs and clip extra padding to select Real speech parts

        generated_wavs = self.session.run(
            self.model.y_hat,
            feed_dict=feed_dict)  #### todo: problem here <<<<<<==========

        generated_wavs = [
            generated_wav[:length]
            for generated_wav, length in zip(generated_wavs, audio_lengths)
        ]
        audio_filenames = []
        for i, generated_wav in enumerate(generated_wavs):
            #Save wav to disk
            audio_filename = os.path.join(
                out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
            save_wavenet_wav(generated_wav,
                             audio_filename,
                             sr=hparams.sample_rate)
            audio_filenames.append(audio_filename)
            #Save waveplot to disk
            if log_dir is not None:
                plot_filename = os.path.join(
                    log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
                log('debug: synthesizer.py line 99, plot_filename={}'.format(
                    plot_filename))
                waveplot(plot_filename, generated_wav, None, hparams)
        return audio_filenames
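The normalize_for_wavenet branch relies on np.interp to rescale mel values from the Tacotron output range to [0, 1]; a standalone sketch of that mapping, where a max_abs_value of 4.0 and symmetric mels are only assumed for illustration:

import numpy as np

max_abs = 4.0  # assumed hparams.max_abs_value, symmetric_mels=True case
mels = np.array([[-4.0, -2.0, 0.0, 2.0, 4.0]], dtype=np.float32)
rescaled = np.interp(mels, (-max_abs, max_abs), (0, 1))
print(rescaled)  # [[0.   0.25 0.5  0.75 1.  ]]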
Example #5
def tacotron_synthesize(args, hparams, checkpoint):
    output_dir = args.output_dir
    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))
    return run_synthesis(args, checkpoint_path, output_dir, hparams)
Example #6
 def make_test_batches(self):
     start = time.time()
     # Read a group of examples
     n = self._hparams.tacotron_batch_size
     r = self._hparams.outputs_per_step
     # Test on entire test set
     examples = [self._get_test_groups() for i in range(len(self._test_meta))]
     # Bucket examples based on similar output sequence length for efficiency
     examples.sort(key=lambda x: x[-1])
     batches = [examples[i: i + n] for i in range(0, len(examples), n)]
     np.random.shuffle(batches)
     log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
     return batches, r
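The bucketing trick is easiest to see in isolation: sorting by the last tuple element (the target length) before slicing keeps similarly-sized examples in the same batch, which reduces padding, while the final shuffle only randomizes batch order. A toy illustration with made-up (id, output_length) pairs:

import numpy as np

examples = [('a', 30), ('b', 10), ('c', 25), ('d', 12)]  # (id, output_length) pairs
n = 2
examples.sort(key=lambda x: x[-1])                       # bucket by similar length
batches = [examples[i:i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)                               # randomize batch order only
print(batches)  # e.g. [[('c', 25), ('a', 30)], [('b', 10), ('d', 12)]]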
Example #7
def tacotron_inference(args, hparams, checkpoint, sentences):
    output_dir = args.output_dir
    if sentences is None:
        raise RuntimeError(
            'Inference mode requires input sentence(s), make sure you put sentences in sentences.txt!'
        )

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))
    return run_inference(checkpoint_path, output_dir, hparams, sentences)
Example #8
 def _enqueue_next_train_group(self):
     while not self._coord.should_stop():
         start = time.time()
         # Read a group of examples
         n = self._hparams.tacotron_batch_size
         r = self._hparams.outputs_per_step
         examples = [self._get_next_example() for i in range(n * _batches_per_group)]
         # Bucket examples based on similar output sequence length for efficiency
         examples.sort(key=lambda x: x[-1])
         batches = [examples[i: i + n] for i in range(0, len(examples), n)]
         np.random.shuffle(batches)
         log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
         for batch in batches:
             feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
             self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example #9
    def make_test_batches(self):
        start = time.time()

        #Read one example for evaluation
        n = 1

        #Test on entire test set (one sample at an evaluation step)
        examples = [
            self._get_test_groups() for i in range(len(self._test_meta))
        ]
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches
Example #10
    def load(self, checkpoint_path, hparams, GTA=False, reference_mel=None, model_name='Tacotron'):
        log('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
        input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
        targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets')

        targets_lengths = tf.placeholder(tf.int32, (None, hparams.num_mels), name='targets_lengths')
        if reference_mel is not None:
            reference_mel = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'reference_mel')

        split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos')
        with tf.variable_scope('model'):
            self.model = Tacotron(hparams)
            if GTA:
                self.model.initialize(inputs=inputs, input_lengths=input_lengths, mel_targets=targets, GTA=GTA, split_infos=split_infos,reference_mel=reference_mel)
            else:
                self.model.initialize(inputs=inputs, input_lengths=input_lengths, split_infos=split_infos,reference_mel=reference_mel)
            self.mel_outputs = self.model.tower_mel_outputs
            self.alignment = self.model.tower_alignments
            self.stop_token = self.model.tower_stop_token_prediction
            self.targets = targets
            self.encoder_outputs = self.model.encoder_outputs

        self.GTA = GTA
        self.hparams = hparams

        log('Loading checkpoint: %s' % checkpoint_path)
        # Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)

        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)


        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = targets
        self.split_infos = split_infos
Example #11
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams):
    log('\nSaving intermediate states at step {}'.format(global_step))
    idx = 0
    y_hat, y, length = sess.run([model.y_hat_log[idx], model.y_log[idx], model.input_lengths[idx]])

    # mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))

    # Save audio
    librosa.output.write_wav(pred_wav_path, y_hat, sr=hparams.sample_rate)
    librosa.output.write_wav(target_wav_path, y, sr=hparams.sample_rate)

    # Save figure
    waveplot(plot_path, y_hat, y, hparams)
Example #12
    def add_loss(self):
        '''Adds loss computation to the graph. Supposes that the initialize function has already been called.
        '''
        with tf.variable_scope('loss') as scope:
            if self.is_training:
                if is_mulaw_quantize(self._hparams.input_type):
                    self.loss = MaskedCrossEntropyLoss(self.y_hat_q[:, :-1, :],
                                                       self.y[:, 1:],
                                                       mask=self.mask)
                else:
                    if self._hparams.out_channels == 2:
                        self.loss = GaussianMaximumLikelihoodEstimation(
                            self.y_hat[:, :, :-1],
                            self.y[:, 1:, :],
                            hparams=self._hparams,
                            mask=self.mask)
                    else:

                        # self.loss = DiscretizedMixtureLogisticLoss(self.y_hat[:, :, :-1], self.y, hparams=self._hparams, mask=self.mask)
                        self.loss = DiscretizedMixtureLogisticLoss(
                            self.y_hat[:, :, :-1],
                            self.y[:, 1:, :],
                            hparams=self._hparams,
                            mask=self.mask)
            elif self.is_evaluating:
                if is_mulaw_quantize(self._hparams.input_type):
                    self.eval_loss = MaskedCrossEntropyLoss(
                        self.y_hat_eval,
                        self.y_eval,
                        lengths=[self.eval_length])
                    log('debug: wavenet.py line 411, self.y_hat_eval={}'.
                        format(self.y_hat_eval))
                    log('debug: wavenet.py line 411, self.y_eval={}'.format(
                        self.y_eval))
                    log('debug: wavenet.py line 411, self.eval_length={}'.
                        format(self.eval_length))
                    log('debug: wavenet.py line 411, self.eval_loss={}'.format(
                        self.eval_loss))
                else:
                    if self._hparams.out_channels == 2:
                        self.eval_loss = GaussianMaximumLikelihoodEstimation(
                            self.y_hat_eval,
                            self.y_eval,
                            hparams=self._hparams,
                            lengths=[self.eval_length])
                    else:
                        self.eval_loss = DiscretizedMixtureLogisticLoss(
                            self.y_hat_eval,
                            self.y_eval,
                            hparams=self._hparams,
                            lengths=[self.eval_length])
Example #13
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer,
              hparams):
    '''Evaluate model during training.
    Supposes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss = sess.run(
        [model.y_hat, model.y_target, model.eval_loss])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'
        .format(len(y_target), duration,
                len(y_target) / duration))

    pred_wav_path = os.path.join(wav_dir,
                                 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir,
                                   'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir,
                             'step-{}-waveplot.png'.format(global_step))

    # Save Audio
    wavfile.write(pred_wav_path, hparams.sample_rate, y_hat)
    wavfile.write(target_wav_path, hparams.sample_rate, y_target)

    # Save figure
    util.waveplot(plot_path, y_hat, y_target, model._hparams)
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))
    if summary_writer is not None:
        log('Writing eval summary!')
        add_test_stats(summary_writer, global_step, loss)
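For reference, scipy.io.wavfile.write as used above accepts a float32 waveform in [-1, 1] directly; a minimal standalone sketch, with 22050 Hz assumed only for illustration (the real value comes from hparams.sample_rate):

import numpy as np
from scipy.io import wavfile

sample_rate = 22050  # assumed sample rate for this sketch
t = np.arange(sample_rate) / sample_rate
waveform = (0.5 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)  # 1 s, 440 Hz tone
wavfile.write('step-demo-pred.wav', sample_rate, waveform)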
Example #14
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    GTA = (args.GTA == 'True')
    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')
        # Create output path if it doesn't exist
        os.makedirs(synth_dir, exist_ok=True)
    else:
        synth_dir = os.path.join(output_dir, 'natural')
        # Create output path if it doesn't exist
        os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, GTA=GTA)
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(
            len(metadata), hours))
    time.sleep(1)
    log('starting synthesis..')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')
    with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            text = meta[5]
            mel_filename = os.path.join(mel_dir, meta[1])
            wav_filename = os.path.join(wav_dir, meta[0])
            mel_output_filename = synth.synthesize(text, i + 1, synth_dir,
                                                   None, mel_filename)

            file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename,
                                              mel_output_filename, text))
    print('done!')
    time.sleep(1)
    print('Predicted mel spectrograms are saved in {}'.format(synth_dir))
    print('Exiting...')
    time.sleep(3)
    return os.path.join(synth_dir, 'map.txt')
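The hours estimate above follows from the hop size: each mel frame advances hop_size samples, so frames * hop_size / sample_rate gives seconds. A worked example with assumed values (hop_size and sample_rate are illustrative, not read from the repo's hparams):

hop_size = 275            # assumed hparams.hop_size
sample_rate = 22050       # assumed hparams.sample_rate
total_frames = 2_000_000  # sum of the per-utterance frame counts stored in train.txt
frame_shift = hop_size / sample_rate      # ~0.0125 s per frame
hours = total_frames * frame_shift / 3600
print('{:.2f} hours'.format(hours))       # ~6.93 hours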
Example #15
def run_inference(checkpoint_path, output_dir, hparams, sentences):
    print('creating folders for inference..')
    inference_dir = os.path.join(output_dir, 'inference')
    log_dir = os.path.join(output_dir, 'logs-inference')
    os.makedirs(inference_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
    time.sleep(1)
    print('done!')
    time.sleep(1)
    print('running inference..')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    with open(os.path.join(inference_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            mel_filename = synth.synthesize(text, i + 1, inference_dir, log_dir, None)

            file.write('{}|{}\n'.format(text, mel_filename))
    log('synthesized mel spectrograms of \"{}\" at {}'.format(sentences,inference_dir))
    return inference_dir
Example #16
    def load(self, checkpoint_path, hparams, GTA=False, model_name='Tacotron'):
        log('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
        with tf.variable_scope('model') as scope:
            self.model = Tacotron(hparams)
            if GTA:
                self.model.initialize(inputs, input_lengths, targets, GTA=GTA)
            else:
                self.model.initialize(inputs, input_lengths)
            self.mel_outputs = self.model.mel_outputs
            self.alignment = self.model.alignments[0]

        self.gta = GTA
        self._hparams = hparams

        log('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example #17
    def load(self, checkpoint_path, hparams, model_name='wavenet'):
        log('Constructing model: {}'.format(model_name))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=[None, None, hparams.num_mels],
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(None, 1),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None

        with tf.variable_scope('model') as scope:
            self.model = wavenet(hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=None,
                                  synthesis_length=self.synthesis_length)

            self._hparams = hparams
            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            # Memory allocation on the GPU as needed
            config = tf.ConfigProto()
            config.allow_soft_placement = True
            config.gpu_options.allow_growth = True

            self.session = tf.Session(config=config)
            self.session.run(tf.global_variables_initializer())

            load_averaged_model(self.session, sh_saver, checkpoint_path)
Example #18
def train(log_dir, args, hparams, input_path):
    save_dir = os.path.join(log_dir, 'wave_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    ### load checkpoint
    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')
    input_path = os.path.join(args.base_dir, input_path)

    log('Checkpoint_path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.wavenet_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, args.base_dir, hparams)

    # Set up model
    training_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, training_step)
    eval_model = model_test_mode(args, feeder, hparams, training_step)

    # Track step time and loss over sliding windows
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    sh_saver = create_shadow_saver(model, training_step)
    log('wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps), end='\n==================================================================\n')

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    with tf.Session(config = config) as sess:
        try:
            ###initialize variables
            sess.run(tf.global_variables_initializer())
            #### restore model from checkpoint
            if args.restore:
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                        load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training..', slack=True)
            ### start Feeder thread from session
            feeder.start_threads(sess)

            #### looping over epochs (training steps)
            while not coord.should_stop() and step < args.wavenet_train_steps:
                ### save current time (to measure step duration)
                start_time = time.time()
                step, y_hat, loss, opt = sess.run([training_step, model.y_hat, model.loss, model.optimize])
                #### add step duration to the time window
                time_window.append(time.time() - start_time)
                ### add loss to the loss window
                loss_window.append(loss)

                #### print info to console
                message = 'Step = {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

                ###### exit if loss exploded
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')
                #### save checkpoint when the checkpoint interval is reached
                if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
                    save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams)
                    save_checkpoint(sess, sh_saver, checkpoint_path, training_step)
                ### save evaluation results when the eval interval is reached
                if step % args.eval_interval == 0:
                    log('Evaluating at step {}'.format(step))
                    eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir, summary_writer=None,
                              hparams=model._hparams)

            log('wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True)
            return save_dir
        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            ### close data feeder object to free memory
            coord.request_stop(e)
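ValueWindow is the repository's own small helper for running averages of step time and loss; a minimal stand-in with the same interface (append and an average property) might look like the sketch below, with the caveat that the real implementation may differ in details.

class ValueWindow:
    """Keeps the last `window_size` values and exposes their mean."""

    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # keep at most window_size values, dropping the oldest
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)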
Example #19
def wavenet_synthesize(args, hparams, checkpoint):
    output_dir = 'wavenet_' + args.output_dir
    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint))
    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)
    if args.model == 'Tacotron':
        raise RuntimeError(
            'Please run Tacotron synthesis from Tacotron folder, not here..')
    else:
        ### get mel files (tacotron inference results) from the input mels_dir
        mel_files = [
            os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir)
            if f.split('.')[-1] == 'npy'
        ]
        texts = None
        ### create result folders
        os.makedirs(log_dir, exist_ok=True)
        os.makedirs(wav_dir, exist_ok=True)
        log('Starting wavenet synthesis! (this will take a while..)')
        ### split the mel file list into batches of wavenet_synthesis_batch_size (from hparams)
        mel_files = [
            mel_files[i:i + hparams.wavenet_synthesis_batch_size] for i in
            range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)
        ]
        log('debug: synthesize.py line 128')
        ### open map.txt file and write down result
        ii = 0
        iii = 0
        with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
            log('debug: synthesize.py line 131')
            #### loop over mel file batches (each batch is a list of .npy paths)
            for i, mel_batch in enumerate(tqdm(mel_files)):
                log('debug: synthesize.py outer loop, ii={}'.format(ii))
                ii = ii + 1
                #### load numpy matrix of mel file
                mel_spectros = [np.load(mel) for mel in mel_batch]
                log('debug: synthesize.py line 136, mel_spectros={}'.format(
                    mel_spectros))
                ### get npy file name
                basenames = [
                    os.path.basename(mel).replace('.npy', '')
                    for mel in mel_batch
                ]
                log('debug: synthesize.py line 139, basenames={}'.format(
                    basenames))
                ### generate audio and save in wav_dir
                audio_files = synth.synthesize(mel_spectros, None, basenames,
                                               wav_dir, log_dir)
                log('debug: synthesize.py line 142, audio_files={}'.format(
                    audio_files))
                speaker_logs = ['<no_g>'] * len(mel_batch)
                log('debug: synthesize.py line 144, speaker_logs={}'.format(
                    speaker_logs))
                ###write down result

                for j, mel_file in enumerate(mel_batch):
                    if texts is None:
                        file.write('{}|{}|{}\n'.format(mel_file, audio_files[j],
                                                       speaker_logs[j]))
                        log('debug: synthesize.py inner loop, iii={}'.format(iii))
                        iii = iii + 1
                    else:
                        file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file,
                                                          audio_files[j],
                                                          speaker_logs[j]))
                        log('debug: synthesize.py inner loop, iii={}'.format(iii))
                        iii = iii + 1
        log('synthesized audio waveforms at {}'.format(wav_dir))
Example #20
    def initialize(self,
                   y,
                   c,
                   g,
                   input_lengths,
                   x=None,
                   synthesis_length=None):
        '''Initialize wavenet graph for train, eval and test cases.
        '''
        hparams = self._hparams
        self.is_training = x is not None
        self.is_evaluating = not self.is_training and y is not None
        #Set all convolutions to corresponding mode
        self.set_mode(self.is_training)

        log('Initializing wavenet model.  Dimensions (? = dynamic shape): ')
        log('  Train mode:                {}'.format(self.is_training))
        log('  Eval mode:                 {}'.format(self.is_evaluating))
        log('  Synthesis mode:            {}'.format(not (
            self.is_training or self.is_evaluating)))
        with tf.variable_scope('inference') as scope:
            #Training
            log('wavenet model current mode: {}'.format(self.is_training))

            if self.is_training:
                batch_size = tf.shape(x)[0]
                #[batch_size, time_length, 1]
                self.mask = self.get_mask(
                    input_lengths,
                    maxlen=tf.shape(x)[-1])  #To be used in loss computation
                #[batch_size, channels, time_length]
                y_hat = self.step(
                    x, c, g, softmax=False
                )  #softmax is automatically computed inside softmax_cross_entropy if needed

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length, channels]
                    self.y_hat_q = tf.transpose(y_hat, [0, 2, 1])

                self.y_hat = y_hat
                self.y = y
                self.input_lengths = input_lengths

                #Add mean and scale stats if using Gaussian distribution output (there would be too many logistics if using MoL)
                if self._hparams.out_channels == 2:
                    self.means = self.y_hat[:, 0, :]
                    self.log_scales = self.y_hat[:, 1, :]
                else:
                    self.means = None

                #Graph extension for log saving
                #[batch_size, time_length]
                shape_control = (batch_size, tf.shape(x)[-1], 1)
                with tf.control_dependencies(
                    [tf.assert_equal(tf.shape(y), shape_control)]):
                    y_log = tf.squeeze(y, [-1])
                    if is_mulaw_quantize(hparams.input_type):
                        self.y = y_log

                y_hat_log = tf.cond(tf.equal(tf.rank(y_hat), 4),
                                    lambda: tf.squeeze(y_hat, [-1]),
                                    lambda: y_hat)
                y_hat_log = tf.reshape(y_hat_log,
                                       [batch_size, hparams.out_channels, -1])

                if is_mulaw_quantize(hparams.input_type):
                    #[batch_size, time_length]
                    y_hat_log = tf.argmax(tf.nn.softmax(y_hat_log, axis=1), 1)

                    y_hat_log = inv_mulaw_quantize(y_hat_log,
                                                   hparams.quantize_channels)
                    y_log = inv_mulaw_quantize(y_log,
                                               hparams.quantize_channels)

                else:
                    #[batch_size, time_length]
                    if hparams.out_channels == 2:
                        y_hat_log = sample_from_gaussian(
                            y_hat_log,
                            log_scale_min_gauss=hparams.log_scale_min_gauss)
                    else:
                        y_hat_log = sample_from_discretized_mix_logistic(
                            y_hat_log, log_scale_min=hparams.log_scale_min)

                    if is_mulaw(hparams.input_type):
                        y_hat_log = inv_mulaw(y_hat_log,
                                              hparams.quantize_channels)
                        y_log = inv_mulaw(y_log, hparams.quantize_channels)

                self.y_hat_log = y_hat_log
                self.y_log = y_log

                log('  inputs:                    {}'.format(x.shape))
                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_log.shape))
                log('  outputs:                   {}'.format(y_hat_log.shape))

            #evaluating
            elif self.is_evaluating:
                #[time_length, ]
                idx = 0
                length = input_lengths[idx]
                y_target = tf.reshape(y[idx], [-1])[:length]

                if c is not None:
                    c = tf.expand_dims(c[idx, :, :length], axis=0)
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3)]):
                        c = tf.identity(c, name='eval_assert_c_rank_op')
                if g is not None:
                    g = tf.expand_dims(g[idx], axis=0)

                batch_size = tf.shape(c)[0]

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                #[channels, ]
                if is_mulaw_quantize(hparams.input_type):
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                #Fast eval
                y_hat = self.incremental(initial_input,
                                         c=c,
                                         g=g,
                                         time_length=length,
                                         softmax=False,
                                         quantize=True,
                                         log_scale_min=hparams.log_scale_min)

                #Save targets and length for eval loss computation
                if is_mulaw_quantize(hparams.input_type):
                    self.y_eval = tf.reshape(y[idx], [1, -1])[:, :length]
                else:
                    self.y_eval = tf.expand_dims(y[idx], axis=0)[:, :length, :]
                self.eval_length = length

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1), [-1])
                    y_hat = inv_mulaw_quantize(y_hat,
                                               hparams.quantize_channels)
                    y_target = inv_mulaw_quantize(y_target,
                                                  hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = inv_mulaw(tf.reshape(y_hat, [-1]),
                                      hparams.quantize_channels)
                    y_target = inv_mulaw(y_target, hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [-1])

                self.y_hat = y_hat
                self.y_target = y_target

                if self.local_conditioning_enabled():
                    log('  local_condition:           {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:          {}'.format(g.shape))
                log('  targets:                   {}'.format(y_target.shape))
                log('  outputs:                   {}'.format(y_hat.shape))

            #synthesizing
            else:
                batch_size = tf.shape(c)[0]
                if c is None:
                    assert synthesis_length is not None
                else:
                    #[batch_size, local_condition_time, local_condition_dimension(num_mels)]
                    message = (
                        'Expected 3 dimension shape [batch_size(1), time_length, {}] for local condition features but found {}'
                        .format(hparams.cin_channels, c.shape))
                    with tf.control_dependencies(
                        [tf.assert_equal(tf.rank(c), 3, message=message)]):
                        c = tf.identity(c, name='synthesis_assert_c_rank_op')

                    Tc = tf.shape(c)[1]
                    upsample_factor = get_hop_size(self._hparams)

                    #Overwrite length with respect to local condition features
                    synthesis_length = Tc * upsample_factor

                    #[batch_size, local_condition_dimension, local_condition_time]
                    #time_length will be corrected using the upsample network
                    c = tf.transpose(c, [0, 2, 1])

                #Start silence frame
                if is_mulaw_quantize(hparams.input_type):
                    initial_value = mulaw_quantize(0,
                                                   hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    initial_value = mulaw(0.0, hparams.quantize_channels)
                else:
                    initial_value = 0.0

                if is_mulaw_quantize(hparams.input_type):
                    assert initial_value >= 0 and initial_value < hparams.quantize_channels
                    initial_input = tf.one_hot(indices=initial_value,
                                               depth=hparams.quantize_channels,
                                               dtype=tf.float32)
                    initial_input = tf.tile(
                        tf.reshape(initial_input,
                                   [1, 1, hparams.quantize_channels]),
                        [batch_size, 1, 1])
                else:
                    initial_input = tf.ones([batch_size, 1, 1],
                                            tf.float32) * initial_value

                y_hat = self.incremental(initial_input,
                                         c=c,
                                         g=g,
                                         time_length=synthesis_length,
                                         softmax=False,
                                         quantize=True,
                                         log_scale_min=hparams.log_scale_min)

                if is_mulaw_quantize(hparams.input_type):
                    y_hat = tf.reshape(tf.argmax(y_hat, axis=1),
                                       [batch_size, -1])
                    y_hat = inv_mulaw_quantize(y_hat,
                                               hparams.quantize_channels)
                elif is_mulaw(hparams.input_type):
                    y_hat = inv_mulaw(tf.reshape(y_hat, [batch_size, -1]),
                                      hparams.quantize_channels)
                else:
                    y_hat = tf.reshape(y_hat, [batch_size, -1])

                self.y_hat = y_hat

                if self.local_conditioning_enabled():
                    log('  local_condition:            {}'.format(c.shape))
                if self.has_speaker_embedding():
                    log('  global_condition:           {}'.format(g.shape))
                log('  outputs:                    {}'.format(y_hat.shape))

        self.variables = tf.trainable_variables()
        self.ema = tf.train.ExponentialMovingAverage(
            decay=hparams.wavenet_ema_decay)
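mulaw, inv_mulaw and their quantized variants used above are the repository's own implementations of standard mu-law companding; for orientation, the underlying formula (shown here with mu = quantize_channels - 1 = 255, an assumed default) is:

import numpy as np

def mulaw(x, mu=255):
    # compress a waveform in [-1, 1] with the mu-law transform
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw(y, mu=255):
    # invert the mu-law transform
    return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

x = 0.5
print(inv_mulaw(mulaw(x)))  # ~0.5, round-trips up to float error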
Example #21
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    '''
    Generate mel spectrograms from text using the trained model.
    :param args: runtime params
    :param checkpoint_path: path to the pretrained model checkpoint
    :param output_dir: output dir to save spectrograms (can be derived from args)
    :param hparams: hyperparameters
    :return: path to the generated map.txt
    '''

    GTA = (args.GTA == 'True')
    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')
        # Create output path if it doesn't exist
        os.makedirs(synth_dir, exist_ok=True)
    else:
        synth_dir = os.path.join(output_dir, 'natural')
        # Create output path if it doesn't exist
        os.makedirs(synth_dir, exist_ok=True)

    metadata_filename = os.path.join(args.input_dir, 'train.txt')
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, GTA=GTA)

    ### read data from train.txt file <-- this file is generated after preprocessing
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(
            len(metadata), hours))

    ## generate batches from metadata
    metadata = [metadata[i:i + 512] for i in range(0, len(metadata), 512)
                ]  ### batch size hard-coded to 512 instead of hparams.tacotron_synthesis_batch_size
    log('starting synthesis..')
    mel_dir = os.path.join(args.input_dir, 'mels')
    wav_dir = os.path.join(args.input_dir, 'audio')

    #### batch synthesis (still needs more work)
    with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
        for i, meta in enumerate(tqdm(metadata)):
            texts = [m[5] for m in meta]
            mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
            wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
            basenames = [
                os.path.basename(m).replace('.npy', '').replace('mel-', '')
                for m in mel_filenames
            ]
            mel_output_filenames, speaker_ids = synth.synthesize(
                texts, basenames, synth_dir, None, mel_filenames)
            for elems in zip(wav_filenames, mel_filenames,
                             mel_output_filenames, speaker_ids, texts):
                file.write('|'.join([str(x) for x in elems]) + '\n')

    # with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
    #     for i, meta in enumerate(tqdm(metadata)):
    #         text = meta[5]
    #         mel_filename = os.path.join(mel_dir, meta[1])
    #         wav_filename = os.path.join(wav_dir, meta[0])
    #         mel_output_filename, speaker_id = synth.synthesize(text, i + 1, synth_dir, None, mel_filename)
    #         file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text))
    log('Predicted mel spectrograms are saved in {}'.format(synth_dir))
    return os.path.join(synth_dir, 'map.txt')
Example #22
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   GTA=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
        		Initializes the model for inference

        		sets "mel_outputs" and "alignments" fields.

        		Args:
        			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        			  steps in the input time series, and values are character IDs
        			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        			of each sequence in inputs.
        			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        			of steps in the output time series, M is num_mels, and values are entries in the mel
        			spectrogram. Only needed for training.
        		"""

        ### checking for conditions
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not GTA:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not GTA and self.hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if GTA and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self.hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        ####### declare variables
        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hparams = self.hparams
            assert hparams.tacotron_teacher_forcing_mode in ('constant',
                                                             'scheduled')
            if hparams.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            ### get symbol to create embedding lookup table
            if self.hparams.hangul_type == 1:
                hangul_symbol = hangul_symbol_1
            elif self.hparams.hangul_type == 2:
                hangul_symbol = hangul_symbol_2
            elif self.hparams.hangul_type == 3:
                hangul_symbol = hangul_symbol_3
            elif self.hparams.hangul_type == 4:
                hangul_symbol = hangul_symbol_4
            else:
                hangul_symbol = hangul_symbol_5

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            # create embedding lookup table with shape [number of symbols, embedding dimension (declared in hparams)]
            embedding_table = tf.get_variable(
                'inputs_embedding',
                [len(hangul_symbol), hparams.embedding_dim],
                dtype=tf.float32)
            ### inputs is a tensor of ID sequences (created using text_to_sequence),
            # loaded through the feeder class (_metadata variable) from train.txt in the training_data folder
            ## embedded_inputs is a Tensor with the same dtype as the embedding_table Tensor
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)
            ##########################
            ## create Encoder object##
            ##########################
            encoder_cell = Encoder.EncoderCell(
                EncoderConvolution=Encoder.EncoderConvolution(
                    is_training=is_training,
                    hparams=hparams,
                    scope='encoder_convolutions'),
                EncoderLSTM=Encoder.EncoderLSTM(is_training=is_training,
                                                size=256,
                                                zoneout=0.1,
                                                scope='encoder_lstm'))
            # extract Encoder model output
            encoder_outputs = encoder_cell(embedded_inputs,
                                           input_lengths=input_lengths)

            # store convolution output shape for visualization
            enc_conv_output_shape = encoder_cell.conv_output_shape

            ##########################
            ## create Decoder object##
            ##########################
            decoder_cell = Decoder.TacotronDecoderCell(
                prenet=Decoder.Prenet(is_training=is_training,
                                      layers_sizes=[256, 256],
                                      drop_rate=hparams.dropout_rate,
                                      scope='decoder_prenet'),
                attention_mechanism=Decoder.LocationSensitiveAttention(
                    num_units=hparams.attention_dim,
                    memory=encoder_outputs,
                    hparams=hparams,
                    mask_encoder=True,
                    memory_sequence_length=input_lengths,
                    cumulate_weights=True),
                rnn_cell=Decoder.DecoderRNN(is_training=is_training,
                                            layers=2,
                                            zoneout=hparams.zoneout_rate,
                                            scope='decoder_lstm'),
                frame_projection=Decoder.FrameProjection(
                    shape=hparams.num_mels * 2, scope='linear_transform'),
                stop_projection=Decoder.StopProjection(
                    is_training=is_training or is_evaluating,
                    shape=2,
                    scope='stop_token_projection'),
            )
            ##initiate the first state of decoder
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)
            # Define the helper for our decoder
            if is_training or is_evaluating or GTA:
                self.helper = TacoTrainingHelper(batch_size, mel_targets,
                                                 stop_token_targets, hparams,
                                                 GTA, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hparams)

            # Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 Decoder.CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=hparams.max_iters,
                 swap_memory=hparams.tacotron_swap_with_cpu)

            # Only use max iterations at synthesis time
            max_iters = hparams.max_iters if not (is_training
                                                  or is_evaluating) else None
            # Reshape outputs to be one output per entry
            # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hparams.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            # Postnet
            postnet = Postnet.Postnet(is_training,
                                      hparams=hparams,
                                      scope='postnet_convolutions')

            # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            # Project residual to same dimension as mel spectrogram
            # ==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = Decoder.FrameProjection(
                hparams.num_mels, scope='postnet_projection')
            projected_residual = residual_projection(residual)

            # Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            # GTA synthesis is only used to produce mels for training the wavenet vocoder,
            # so we omit post processing when doing GTA synthesis
            post_condition = hparams.predict_linear and not GTA
            if post_condition:
                # Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                # Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = Encoder.EncoderCell(
                    Encoder.EncoderConvolution(
                        is_training,
                        hparams=hparams,
                        scope='post_processing_convolutions'),
                    Encoder.EncoderLSTM(is_training,
                                        size=hparams.enc_lstm_hidden_size,
                                        zoneout=hparams.zoneout_rate,
                                        scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = Decoder.FrameProjection(
                    hparams.num_freq,
                    scope='post_processing_projection')(expand_outputs)
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            # Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs

            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(GTA))
            log('  Synthesis mode:           {}'.format(not (
                is_training or is_evaluating)))
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
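A note on the reshape right after dynamic_decode above: the decoder emits r stacked mel frames per step (the reduction factor), and tf.reshape unfolds them into one frame per row before the postnet residual is added. Below is a minimal NumPy sketch of that shape flow, using made-up toy dimensions rather than the real hparams:

import numpy as np

# Toy dimensions for illustration only (not the real hparams)
batch_size, decoder_steps, num_mels, r = 2, 3, 4, 2

# Decoder output: r mel frames stacked per decoder step
frames_prediction = np.random.rand(batch_size, decoder_steps, num_mels * r)

# Unfold to one mel frame per row, as tf.reshape does above
decoder_output = frames_prediction.reshape(batch_size, -1, num_mels)
assert decoder_output.shape == (batch_size, decoder_steps * r, num_mels)

# The postnet residual is projected back to num_mels and added element-wise
projected_residual = np.random.rand(batch_size, decoder_steps * r, num_mels)
mel_outputs = decoder_output + projected_residual
assert mel_outputs.shape == decoder_output.shape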
Example No. 23
0
    def __init__(self, coordinator, metadata_filename, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._train_offset = 0
        self._test_offset = 0
        # Load metadata
        # Directory containing the mel-spectrogram numpy matrices
        self._mel_dir = os.path.join(os.path.dirname(metadata_filename),
                                     'mels')
        # Directory containing the linear-spectrogram numpy matrices
        self._linear_dir = os.path.join(os.path.dirname(metadata_filename),
                                        'linear')
        # Load the text metadata stored in the train.txt file
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|')
                              for line in f]  # the core metadata list, one entry per utterance
            # Compute the total audio length (for logging)
            # Seconds of audio per hop (note: despite the name, this value is in seconds, not milliseconds)
            frame_shift_ms = hparams.hop_size / hparams.sample_rate
            # Total duration in hours, using the frame count stored at index 4 of each metadata line
            hours = sum([int(x[4])
                         for x in self._metadata]) * frame_shift_ms / (3600)
            log('Loaded metadata for {} examples ({:.2f} hours)'.format(
                len(self._metadata), hours))

        # Train/test split:
        #   training examples -> _train_meta
        #   test examples     -> _test_meta
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None
        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size
                     is not None else hparams.tacotron_test_batches *
                     hparams.tacotron_batch_size)
        indices = np.arange(len(
            self._metadata))  # integer indices [0, len(metadata))
        # Split the indices into train and test sets
        train_indices, test_indices = train_test_split(
            indices,
            test_size=test_size,
            random_state=hparams.tacotron_data_random_state)
        # Round the test set size to a multiple of batch_size; any indices
        # beyond the rounded length are returned to the training set
        len_test_indices = self._round_up(len(test_indices),
                                          hparams.tacotron_batch_size)
        print('len test indices {}'.format(len_test_indices))
        # Leftover test indices (returned to the training set below)
        extra_test = test_indices[len_test_indices:]
        # Truncate test_indices to the rounded length
        test_indices = test_indices[:len_test_indices]
        # Extend train_indices with the leftover test indices
        train_indices = np.concatenate([train_indices, extra_test])
        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])
        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        # pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        # explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        # to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -(hparams.max_abs_value + .1)
        else:
            self._target_pad = -0.1
        # Mark finished sequences with 1s
        self._token_pad = 1.

        with tf.device('/cpu:0'):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
                tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_mels),
                               name='mel_targets'),
                tf.placeholder(tf.float32,
                               shape=(None, None),
                               name='token_targets'),
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_freq),
                               name='linear_targets'),
                tf.placeholder(tf.int32,
                               shape=(None, ),
                               name='targets_lengths'),
            ]

            # Create queue for buffering data
            queue = tf.FIFOQueue(8, [
                tf.int32, tf.int32, tf.float32, tf.float32, tf.float32,
                tf.int32
            ],
                                 name='input_queue')
            self._enqueue_op = queue.enqueue(self._placeholders)
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths = queue.dequeue(
            )
            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.linear_targets.set_shape(self._placeholders[4].shape)
            self.targets_lengths.set_shape(self._placeholders[5].shape)

            # Create eval queue for buffering eval data
            eval_queue = tf.FIFOQueue(1, [
                tf.int32, tf.int32, tf.float32, tf.float32, tf.float32,
                tf.int32
            ],
                                      name='eval_queue')
            self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \
            self.eval_linear_targets, self.eval_targets_lengths = eval_queue.dequeue()
            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_linear_targets.set_shape(self._placeholders[4].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
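The Feeder above calls a self._round_up helper that is not shown in this snippet. Below is a hedged sketch of what such a rounding pair typically looks like; the names and exact implementation are assumptions, since only the call to _round_up appears in the code above:

def _round_up(x, multiple):
    # Smallest multiple of `multiple` that is >= x
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder


def _round_down(x, multiple):
    # Largest multiple of `multiple` that is <= x
    remainder = x % multiple
    return x if remainder == 0 else x - remainder


# With a batch size of 32 and 100 raw test indices:
assert _round_up(100, 32) == 128   # would pad the test set up to 4 batches
assert _round_down(100, 32) == 96  # keeps 3 full batches, frees 4 items

With the slicing used above (extra_test = test_indices[len_test_indices:]), rounding down is what actually moves leftover utterances back into the training set; rounding up leaves extra_test empty, so the test set size may not end up as an exact multiple of the batch size.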
Example No. 24
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   GTA=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None,
                   reference_mel=None):
        """
                Initializes the model for inference

                sets "mel_outputs" and "alignments" fields.

                Args:
                    - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
                      steps in the input time series, and values are character IDs
                    - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
                    of each sequence in inputs.
                    - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                    of steps in the output time series, M is num_mels, and values are entries in the mel
                    spectrogram. Only needed for training.
                """

        # Sanity-check the provided targets against the requested mode
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not GTA:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not GTA and self.hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if GTA and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self.hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        # Select the Hangul symbol set used to build the embedding lookup table
        if self.hparams.hangul_type == 1:
            hangul_symbol = hangul_symbol_1
        elif self.hparams.hangul_type == 2:
            hangul_symbol = hangul_symbol_2
        elif self.hparams.hangul_type == 3:
            hangul_symbol = hangul_symbol_3
        elif self.hparams.hangul_type == 4:
            hangul_symbol = hangul_symbol_4
        else:
            hangul_symbol = hangul_symbol_5

        split_device = '/cpu:0'
        with tf.device(split_device):
            hp = self.hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []
            # TODO: reference audio for the style encoder (filled with the mel targets during training)
            tower_ref_audio = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))
                if is_training:
                    ## if training, add mel_targets as ref_audio
                    tower_ref_audio.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)
        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []
        # tower_ref_encoder = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    ## reference embedding:
                    ##if there are input reference audio
                    # if tower_ref_audio[i] is not None:
                    # 	print(tower_ref_audio[i])
                    # 	ref_encoder = reference_encoder(inputs=tower_ref_audio[i], is_training=is_training)
                    # 	self.tower_ref_output.append(ref_encoder)
                    # 	log('Tacotron.py line 180 ref_encoder.shape: {}'.format(ref_encoder.shape))
                    # 	ref_attention = Attention.MultiheadsAttention(
                    # 		query=tf.expand_dims(ref_encoder, axis=1),
                    # 		value=tf.tanh(tf.tile(tf.expand_dims(self.style_tokens, axis=0), [batch_size,1,1])),
                    # 		attention_heads=self.hparams.attention_heads,
                    # 		num_units=128,
                    # 		normalize=True
                    # 	)
                    # 	style_embedding = ref_attention.multi_heads_attention()
                    # 	log('Tacotron.py line 188 style_embedding.shape: {}'.format(style_embedding.shape))
                    # else: ### if there is not input reference audio, use random
                    # 	rand_weight = tf.nn.softmax(tf.random_uniform([hp.attention_heads, hp.num_tokens], maxval=1.0, dtype=tf.float32),name='random_weight_gst')
                    # 	style_embedding = tf.reshape(tf.matmul(rand_weight,tf.nn.tanh(self.style_tokens), [1,1]+[hp.attention_heads+self.style_tokens.get_shape().as_list()[1]]))
                    #
                    # 	log('Tacotron.py line 193 style_embedding.shape: {}'.format(style_embedding.shape))

                    if hp.use_gst:
                        # Global style tokens (GST)
                        gst_tokens = tf.get_variable(
                            'style_tokens',
                            [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                            dtype=tf.float32,
                            initializer=tf.truncated_normal_initializer(
                                stddev=0.5))
                        self.gst_tokens = gst_tokens

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding',
                        [len(hangul_symbol), hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    self.embedded_inputs_ = embedded_inputs
                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = Encoder.EncoderCell(
                        Encoder.EncoderConvolution(
                            is_training,
                            hparams=hp,
                            scope='encoder_convolutions'),
                        Encoder.EncoderLSTM(is_training,
                                            size=hp.enc_lstm_hidden_size,
                                            zoneout=hp.zoneout_rate,
                                            scope='encoder_LSTM'))
                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])
                    # Append the reference (style) embedding to the encoder outputs
                    if is_training:
                        # During training, reference_mel is None, so fall back to the target mels
                        reference_mel = mel_targets
                    if reference_mel is not None:
                        # Reference encoder
                        refnet_outputs = reference_encoder(
                            reference_mel,
                            filters=hp.reference_filters,
                            kernel_size=(3, 3),
                            strides=(2, 2),
                            encoder_cell=GRUCell(hp.reference_depth),
                            is_training=is_training)  # [N, 128]
                        self.refnet_outputs = refnet_outputs
                        if hp.use_gst:
                            style_attention = Attention.MultiheadAttention(
                                query=tf.expand_dims(refnet_outputs,
                                                     axis=1),  # [N, 1, 128]
                                value=tf.tanh(
                                    tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                            [batch_size, 1, 1])),
                                # [N, hp.num_gst, 256/hp.num_heads]
                                num_heads=hp.num_heads,
                                num_units=hp.style_att_dim,
                                attention_type=hp.style_att_type)
                            style_embeddings = style_attention.multi_head_attention(
                            )  # [N, 1, 256]
                        else:
                            style_embeddings = tf.expand_dims(
                                refnet_outputs, axis=1)  # [N, 1, 128]
                        style_embeddings = tf.tile(
                            style_embeddings,
                            [1, shape_list(encoder_outputs)[1], 1
                             ])  # [N, T_in, 128]
                    else:
                        print("Use random weight for GST.")
                        random_weights = tf.random_uniform(
                            [hp.num_heads, hp.num_gst],
                            maxval=1.0,
                            dtype=tf.float32)
                        random_weights = tf.nn.softmax(random_weights,
                                                       name="random_weights")
                        style_embeddings = tf.matmul(random_weights,
                                                     tf.nn.tanh(gst_tokens))
                        style_embeddings = tf.reshape(
                            style_embeddings, [1, 1] + [
                                hp.num_heads *
                                gst_tokens.get_shape().as_list()[1]
                            ])
                        style_embeddings = tf.tile(style_embeddings, [
                            shape_list(encoder_outputs)[0],
                            shape_list(encoder_outputs)[1], 1
                        ])  # [N, T_in, 128]
                    encoder_outputs = tf.concat(
                        [encoder_outputs, style_embeddings], axis=-1)
                    self.encoder_outputs = encoder_outputs
                    print('encoder_outputs.shape after style concat: {}'.format(
                        encoder_outputs.shape))

                    # For shape visualization purposes
                    enc_conv_output_shape = self.encoder_outputs.shape
                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Decoder.Prenet(is_training,
                                            layers_sizes=hp.prenet_layers,
                                            drop_rate=hp.dropout_rate,
                                            scope='decoder_prenet')

                    print('memory.shape {}'.format(encoder_outputs.shape))
                    attention_mechanism = Attention.LocationSensitiveAttention(
                        num_units=hp.attention_dim,
                        memory=encoder_outputs,
                        hparams=hp,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = Decoder.DecoderRNN(
                        is_training=is_training,
                        layers=2,
                        size=hp.decoder_lstm_units,
                        zoneout=hp.zoneout_rate,
                        scope='decoder_LSTM')
                    # Frames Projection layer
                    frame_projection = Decoder.FrameProjection(
                        shape=hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    # <stop_token> projection layer
                    stop_projection = Decoder.StopProjection(
                        is_training,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = Decoder.TacotronDecoderCell(
                        prenet, attention_mechanism, decoder_lstm,
                        frame_projection, stop_projection)
                    # Define the helper for our decoder
                    if is_training or is_evaluating or GTA:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, GTA,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         Decoder.CustomDecoder(decoder_cell, self.helper,
                                               decoder_init_state),
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)
                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])
                    if hp.clip_outputs:
                        decoder_output = tf.minimum(
                            tf.maximum(
                                decoder_output,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    # Postnet
                    postnet = Postnet.Postnet(is_training,
                                              hparams=hp,
                                              scope='postnet_convolutions')

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    print('decoder_output.shape {}'.format(
                        decoder_output.shape))

                    residual = postnet(decoder_output)
                    print('residual.shape {}'.format(residual.shape))

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = Decoder.FrameProjection(
                        hp.num_mels, scope='postnet_projection')

                    projected_residual = residual_projection(residual)

                    print('projected_residual.shape {}'.format(
                        projected_residual.shape))

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual
                    print('mel_outputs.shape {}'.format(mel_outputs.shape))

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(
                            tf.maximum(
                                mel_outputs,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)
            log('initialization done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets
        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))
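When no reference mel is supplied at inference time, the code above weights the style tokens with a random softmax and tiles the result across every encoder timestep. Below is a minimal NumPy sketch of that shape flow, with made-up toy dimensions standing in for hp.num_heads, hp.num_gst and the token depth:

import numpy as np

# Toy dimensions for illustration only (not the real hparams)
num_heads, num_gst, token_depth = 4, 10, 64
batch_size, t_in = 2, 7

gst_tokens = np.random.normal(scale=0.5, size=(num_gst, token_depth))

# Random softmax weighting over the style tokens, one row per attention head
logits = np.random.uniform(size=(num_heads, num_gst))
random_weights = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)

# [num_heads, num_gst] @ [num_gst, token_depth] -> [num_heads, token_depth]
style = random_weights @ np.tanh(gst_tokens)

# Flatten the heads, then broadcast over the batch and encoder timesteps,
# mirroring the tf.reshape / tf.tile calls above
style = style.reshape(1, 1, num_heads * token_depth)
style_embeddings = np.tile(style, (batch_size, t_in, 1))
assert style_embeddings.shape == (batch_size, t_in, num_heads * token_depth)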