Example no. 1
    def _enqueue_next_train_group(self):
        while not self._coord.should_stop():
            start = time.time()

            # Read a group of examples
            n = self._hparams.tacotron_batch_size
            r = self._hparams.outputs_per_step
            examples = [
                self._get_next_example() for i in range(n * _batches_per_group)
            ]

            # Bucket examples based on similar output sequence length for efficiency
            examples.sort(key=lambda x: x[-1])
            batches = [examples[i:i + n] for i in range(0, len(examples), n)]
            np.random.shuffle(batches)

            log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(
                len(batches), n,
                time.time() - start))
            for batch in batches:
                feed_dict = dict(
                    zip(self._placeholders, self._prepare_batch(batch, r)))
                if not self._coord.should_stop():
                    #log("enque op started (train).")
                    self._session.run(self._enqueue_op, feed_dict=feed_dict)
                    #log("enque op finished (train).")
        log("_enqueue_next_train_group finished.")
def run_synthesis(args, checkpoint, hparams):

  try:
    checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
    log('loaded model at {}'.format(checkpoint_path))
  except Exception as e:
    raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) from e

  if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
    raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
      hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

  if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
    raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))

  gta = args.GTA == 'True'
  synth_dir = get_synth_dir(args.caching_dir, gta)
  gta_map_file = get_gta_map_file(synth_dir)
  #Create output path if it doesn't exist
  os.makedirs(synth_dir, exist_ok=True)

  metadata_path = get_train_txt(args.caching_dir)
  metadata = load_meta(metadata_path)
  log(hparams_debug_string())
  synth = Synthesizer(args.caching_dir)
  synth.load(checkpoint_path, hparams, gta=gta)
  frame_shift_s = hparams.hop_size / hparams.sample_rate  # frame shift in seconds
  hours = sum(int(x[2]) for x in metadata) * frame_shift_s / 3600
  log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

  #Set inputs batch wise
  metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]

  log('Starting Synthesis')

  txt_dir = get_txt_dir(args.caching_dir)
  mel_dir = get_mel_dir(args.caching_dir)
  wav_dir = get_wav_dir(args.caching_dir)

  symbol_file = get_symbols_file(args.caching_dir)
  conv = get_from_file(symbol_file)
  with open(gta_map_file, 'w') as file:
    for i, meta in enumerate(tqdm(metadata)):
      # Note: this synthesizes only every 10th batch; drop the check to cover the full set.
      if i % 10 == 0:
        text_paths = [os.path.join(txt_dir, "{}.npy".format(m[0])) for m in meta]
        text_symbols = [np.load(pth) for pth in text_paths]
        # trim ~ at the end
        texts = [conv.sequence_to_original_text(x) for x in text_symbols]
        #texts = [m[5] for m in meta]
        mel_filenames = [os.path.join(mel_dir, "{}.npy".format(m[0])) for m in meta]
        wav_filenames = [os.path.join(wav_dir, "{}.npy".format(m[0])) for m in meta]
        basenames = [m[0] for m in meta]
        mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames)

        for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
          file.write('|'.join([str(x) for x in elems]) + '\n')

  log('synthesized mel spectrograms at {}'.format(synth_dir))
  return gta_map_file
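For reference, each line written to gta_map_file above holds five pipe-separated fields. A minimal reader sketch; load_gta_map is a hypothetical helper, and it assumes the text field itself contains no '|':

def load_gta_map(gta_map_file):
    # Each line: wav_path|mel_path|mel_output_path|speaker_id|text
    entries = []
    with open(gta_map_file, encoding='utf-8') as f:
        for line in f:
            wav, mel, mel_out, speaker, text = line.rstrip('\n').split('|', 4)
            entries.append((wav, mel, mel_out, speaker, text))
    return entries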
Example no. 3
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caching_dir',
                        default='/datasets/models/tacotron/cache')
    parser.add_argument(
        '--hparams',
        default='',
        help=
        'Hyperparameter overrides as a comma-separated list of name=value pairs'
    )
    accepted_modes = ['eval', 'synthesis', 'live']
    parser.add_argument(
        '--mode',
        default='eval',
        help='mode of run: can be one of {}'.format(accepted_modes))
    parser.add_argument(
        '--GTA',
        default='True',
        help=
        'Ground truth aligned synthesis, defaults to True, only considered in synthesis mode'
    )
    parser.add_argument(
        '--text_list',
        default='',
        help=
        'Text file containing the list of texts to be synthesized. Only valid if mode=eval'
    )
    parser.add_argument(
        '--speaker_id',
        default=None,
        help=
        'Defines the speaker ids to use when running standalone WaveNet on a folder of mels. This must be a comma-separated list of ids'
    )
    args = parser.parse_args()

    if args.mode not in accepted_modes:
        raise ValueError('accepted modes are: {}, found {}'.format(
            accepted_modes, args.mode))

    if args.GTA not in ('True', 'False'):
        raise ValueError('GTA option must be either True or False')

    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    from src.tac.training.wav_training import get_log_dir
    from src.tac.training.wav_training import get_save_dir
    wavenet_log_dir = get_log_dir(args.caching_dir)
    wavenet_pretrained = get_save_dir(wavenet_log_dir)

    log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
    run_synthesis(args, wavenet_pretrained, args.caching_dir, modified_hp)
    log('Tacotron-2 TTS synthesis complete!')
Example no. 4
def run_eval(args, checkpoint, hparams, sentences):
    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except Exception as e:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint)) from e

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    output_dir = get_synthesis_output_dir(args.caching_dir)

    eval_dir = get_evals_dir(args.caching_dir)
    log_dir = os.path.join(output_dir, 'logs-eval')

    #if args.model == 'Tacotron-2':
    #assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    #Set inputs batch wise
    sentences = [
        sentences[i:i + hparams.tacotron_synthesis_batch_size] for i in range(
            0, len(sentences), hparams.tacotron_synthesis_batch_size)
    ]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = [
                'batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))
            ]
            mel_filenames, speaker_ids = synth.synthesize(
                texts, basenames, eval_dir, log_dir, None)

            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
Example no. 5
    def make_test_batches(self):
        start = time.time()

        #Read one example for evaluation
        n = 1

        #Test on entire test set (one sample at an evaluation step)
        examples = [
            self._get_test_groups() for i in range(len(self._test_meta))
        ]
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches
Example no. 6
    def load(self, checkpoint_path, hparams):
        log('Constructing model: {}'.format('WaveNet'))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=(None, None, hparams.num_mels),
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(None, 1),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None
        self.targets = tf.placeholder(
            tf.float32, shape=(1, None, 1), name='audio_targets'
        ) if hparams.wavenet_synth_debug else None  #Debug only with 1 wav
        self.input_lengths = tf.placeholder(
            tf.int32, shape=(1, ),
            name='input_lengths') if hparams.wavenet_synth_debug else None
        self.synth_debug = hparams.wavenet_synth_debug

        with tf.variable_scope('WaveNet_model') as scope:
            self.model = create_model(hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=self.input_lengths,
                                  synthesis_length=self.synthesis_length,
                                  test_inputs=self.targets)

            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            #Memory allocation on the GPU as needed
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            config.allow_soft_placement = True

            self.session = tf.Session(config=config)
            self.session.run(tf.global_variables_initializer())

        load_averaged_model(self.session, sh_saver, checkpoint_path)
Example no. 7
    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.tacotron_batch_size
        r = self._hparams.outputs_per_step

        #Test on entire test set
        examples = [
            self._get_test_groups() for i in range(len(self._test_meta))
        ]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches, r
def run_live(args, checkpoint, hparams):
    # Used when args.mode is neither 'eval' nor 'synthesis'
    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except Exception as e:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint)) from e

    if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
        raise ValueError(
            'Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
        raise ValueError(
            'Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'
            .format(hparams.tacotron_synthesis_batch_size,
                    hparams.tacotron_num_gpus))

    #Log to Terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer(args.caching_dir)
    synth.load(checkpoint_path, hparams)

    #Generate fast greeting message
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    #Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)

        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. See you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
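generate_fast is called above but not defined in this excerpt. A plausible minimal sketch (assumed behavior: synthesize one utterance without keeping basenames or writing eval logs; the real helper may differ):

def generate_fast(model, text):
    # One-off synthesis for the interactive loop; no output dirs are kept.
    model.synthesize([text], None, None, None, None)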
Example no. 9
    def _enqueue_next_test_group(self):
        #Create test batches once and evaluate on them for all test steps
        test_batches, r = self.make_test_batches()
        while not self._coord.should_stop():
            for batch in test_batches:
                feed_dict = dict(
                    zip(self._placeholders, self._prepare_batch(batch, r)))
                if not self._coord.should_stop():
                    log("enqueue op started (test).")
                    self._session.run(self._eval_enqueue_op,
                                      feed_dict=feed_dict)
                    log("enqueue op finished (test).")
        log("_enqueue_next_test_group finished.")
def run():
  parser = argparse.ArgumentParser()
  parser.add_argument('--caching_dir', default='/datasets/models/tacotron/cache')
  parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training')
  parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode')
  parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
  parser.add_argument('--hparams', default='', help='Hyperparameter overrides as a comma-separated list of name=value pairs')

  args = parser.parse_args()
  modified_hp = hparams.parse(args.hparams)
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
  log_dir = get_log_dir(args.caching_dir)
  os.makedirs(log_dir, exist_ok=True)
  infolog_path = get_infolog_path(log_dir)
  infolog.init(infolog_path, 'tacotron')

  log('\n##########################################################\n')
  log('Tacotron GTA Synthesis\n')
  log('##########################################################\n')
  tacotron_pretrained = get_save_dir(log_dir)
  run_synthesis(args, tacotron_pretrained, modified_hp)
Example no. 11
def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
    log('\nSaving intermediate states at step {}'.format(global_step))
    idx = 0
    y_hat, y, loss, length, input_mel, upsampled_features = sess.run([
        model.tower_y_hat_log[0][idx], model.tower_y_log[0][idx], model.loss,
        model.tower_input_lengths[0][idx], model.tower_c[0][idx],
        model.tower_upsampled_local_features[0][idx]
    ])

    #mask by length
    y_hat[length:] = 0
    y[length:] = 0

    #Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir,
                                 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir,
                                   'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir,
                             'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(
        plot_dir,
        'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(
        plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    #Save figure
    util.waveplot(plot_path,
                  y_hat,
                  y,
                  hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(
                      model_name, time_string(), global_step, loss))

    #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
    #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
    T2_output_range = (-hparams.max_abs_value,
                       hparams.max_abs_value) if hparams.symmetric_mels else (
                           0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(
        generated_mel,
        mel_path,
        title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'
        .format(global_step, loss),
        target_spectrogram=input_mel.T)
    util.plot_spectrogram(
        upsampled_features.T,
        upsampled_path,
        title='Upsampled Local Condition features, step={}, loss={:.5f}'.
        format(global_step, loss),
        auto_aspect=True)

    #Save audio
    save_wavenet_wav(y_hat,
                     pred_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y,
                     target_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
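The masking step above zeroes a single example past its true length. The same idea for a whole batch, as a minimal sketch (mask_by_length is a hypothetical helper):

import numpy as np

def mask_by_length(batch, lengths, pad_value=0):
    # Overwrite everything past each sequence's true length, row by row.
    out = np.array(batch, copy=True)
    for i, length in enumerate(lengths):
        out[i, length:] = pad_value
    return out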
Example no. 12
    def __init__(self, coordinator, caching_dir, hparams):
        super(Feeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._train_offset = 0
        self._test_offset = 0

        # Load metadata

        self._txt_dir = get_txt_dir(caching_dir)
        self._mel_dir = get_mel_dir(caching_dir)
        self._linear_dir = get_lin_dir(caching_dir)
        metadata_path = get_train_txt(caching_dir)
        self._metadata = load_meta(metadata_path)

        #self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
        #self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
        # with open(metadata_filename, encoding='utf-8') as f:
        # 	self._metadata = [line.strip().split('|') for line in f]
        frame_shift_s = hparams.hop_size / hparams.sample_rate  # frame shift in seconds
        hours = sum(int(x[2])
                    for x in self._metadata) * frame_shift_s / 3600
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(
            len(self._metadata), hours))

        #Train test split
        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches is not None

        test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size
                     is not None else hparams.tacotron_test_batches *
                     hparams.tacotron_batch_size)
        indices = np.arange(len(self._metadata))
        train_indices, test_indices = train_test_split(
            indices,
            test_size=test_size,
            random_state=hparams.tacotron_data_random_state)

        #Make sure test_indices is a multiple of batch_size else round down
        len_test_indices = self._round_down(len(test_indices),
                                            hparams.tacotron_batch_size)
        extra_test = test_indices[len_test_indices:]
        test_indices = test_indices[:len_test_indices]
        train_indices = np.concatenate([train_indices, extra_test])

        self._train_meta = list(np.array(self._metadata)[train_indices])
        self._test_meta = list(np.array(self._metadata)[test_indices])

        self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size
        log('test_steps = {}'.format(self.test_steps))

        if hparams.tacotron_test_size is None:
            assert hparams.tacotron_test_batches == self.test_steps

        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.
        #Mark finished sequences with 1s
        self._token_pad = 1.

        with tf.device('/cpu:0'):
            # Create placeholders for inputs and targets. Don't specify batch size because we want
            # to be able to feed different batch sizes at eval time.
            self._placeholders = [
                tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
                tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_mels),
                               name='mel_targets'),
                tf.placeholder(tf.float32,
                               shape=(None, None),
                               name='token_targets'),
                tf.placeholder(tf.float32,
                               shape=(None, None, hparams.num_freq),
                               name='linear_targets'),
                tf.placeholder(tf.int32,
                               shape=(None, ),
                               name='targets_lengths'),
                tf.placeholder(tf.int32,
                               shape=(hparams.tacotron_num_gpus, None),
                               name='split_infos'),
            ]

            # Create queue for buffering data
            self.input_queue = tf.FIFOQueue(8, [
                tf.int32, tf.int32, tf.float32, tf.float32, tf.float32,
                tf.int32, tf.int32
            ],
                                            name='input_queue')
            self._enqueue_op = self.input_queue.enqueue(self._placeholders)
            log("dequeue input_queuue")
            self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths, self.split_infos = self.input_queue.dequeue(
            )

            self.inputs.set_shape(self._placeholders[0].shape)
            self.input_lengths.set_shape(self._placeholders[1].shape)
            self.mel_targets.set_shape(self._placeholders[2].shape)
            self.token_targets.set_shape(self._placeholders[3].shape)
            self.linear_targets.set_shape(self._placeholders[4].shape)
            self.targets_lengths.set_shape(self._placeholders[5].shape)
            self.split_infos.set_shape(self._placeholders[6].shape)

            # Create eval queue for buffering eval data
            self.eval_queue = tf.FIFOQueue(1, [
                tf.int32, tf.int32, tf.float32, tf.float32, tf.float32,
                tf.int32, tf.int32
            ],
                                           name='eval_queue')
            #TODO: there is a bug with the session here
            self._eval_enqueue_op = self.eval_queue.enqueue(self._placeholders)
            log("dequeue eval")
            self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \
             self.eval_linear_targets, self.eval_targets_lengths, self.eval_split_infos = self.eval_queue.dequeue()

            self.eval_inputs.set_shape(self._placeholders[0].shape)
            self.eval_input_lengths.set_shape(self._placeholders[1].shape)
            self.eval_mel_targets.set_shape(self._placeholders[2].shape)
            self.eval_token_targets.set_shape(self._placeholders[3].shape)
            self.eval_linear_targets.set_shape(self._placeholders[4].shape)
            self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
            self.eval_split_infos.set_shape(self._placeholders[6].shape)
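The _round_down helper called in the train/test split above is not part of this excerpt. A minimal sketch of what it presumably does, i.e. round down to a whole multiple of the batch size:

    def _round_down(self, x, multiple):
        # e.g. _round_down(103, 32) == 96, so the test split fills whole batches.
        remainder = x % multiple
        return x if remainder == 0 else x - remainder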
Example no. 13
    def initialize(self,
                   inputs,
                   input_lengths,
                   symbols_count: int,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False,
                   split_infos=None):
        """
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(
                input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(
                targets_lengths,
                num_or_size_splits=hp.tacotron_num_gpus,
                axis=0) if targets_lengths is not None else targets_lengths

            p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]],
                                  lout_int)
            p_mel_targets = tf.py_func(
                split_func, [mel_targets, split_infos[:, 1]],
                lout_float) if mel_targets is not None else mel_targets
            p_stop_token_targets = tf.py_func(
                split_func, [stop_token_targets, split_infos[:, 2]], lout_float
            ) if stop_token_targets is not None else stop_token_targets
            p_linear_targets = tf.py_func(
                split_func, [linear_targets, split_infos[:, 3]],
                lout_float) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i],
                                   [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(
                        tf.reshape(p_linear_targets[i],
                                   [batch_size, -1, linear_channels]))

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(
                    tf.train.replica_device_setter(ps_tasks=1,
                                                   ps_device="/cpu:0",
                                                   worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                                'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [symbols_count, hp.embedding_dim],
                        dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(
                        self.embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training,
                                            hparams=hp,
                                            scope='encoder_convolutions'),
                        EncoderRNN(is_training,
                                   size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate,
                                   scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs,
                                                   tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training,
                                    layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate,
                                    scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(
                        hp.attention_dim,
                        encoder_outputs,
                        hparams=hp,
                        is_training=is_training,
                        mask_encoder=hp.mask_encoder,
                        memory_sequence_length=tf.reshape(
                            tower_input_lengths[i], [-1]),
                        smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training,
                                              layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(
                        hp.num_mels * hp.outputs_per_step,
                        scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(
                        is_training or is_evaluating,
                        shape=hp.outputs_per_step,
                        scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(prenet,
                                                       attention_mechanism,
                                                       decoder_lstm,
                                                       frame_projection,
                                                       stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(
                            batch_size, tower_mel_targets[i], hp, gta,
                            is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(
                        batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (
                        is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                         CustomDecoder(decoder_cell, self.helper,
                                       decoder_init_state),
                         impute_finished=False,
                         maximum_iterations=max_iters,
                         swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction,
                                                [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction,
                                                       [batch_size, -1])

                    if hp.clip_outputs:
                        decoder_output = tf.minimum(
                            tf.maximum(
                                decoder_output,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    #Postnet
                    postnet = Postnet(is_training,
                                      hparams=hp,
                                      scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(
                        hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(
                            tf.maximum(
                                mel_outputs,
                                T2_output_range[0] - hp.lower_bound_decay),
                            T2_output_range[1])

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels,
                                         hp.cbhg_conv_channels,
                                         hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size,
                                         hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units,
                                         hp.cbhg_rnn_units,
                                         hp.batch_norm_position,
                                         is_training,
                                         name='CBHG_postnet')

                        #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        #Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(
                            hp.num_freq, scope='cbhg_linear_specs_projection')

                        #[batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                        if hp.clip_outputs:
                            linear_outputs = tf.minimum(
                                tf.maximum(
                                    linear_outputs,
                                    T2_output_range[0] - hp.lower_bound_decay),
                                T2_output_range[1])

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(
                        final_decoder_state.alignment_history.stack(),
                        [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(
                        stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        for i in range(hp.tacotron_num_gpus):
            log('  device:                   {}'.format(i))
            log('  embedding:                {}'.format(
                tower_embedded_inputs[i].shape))
            log('  enc conv out:             {}'.format(
                tower_enc_conv_output_shape[i]))
            log('  encoder out:              {}'.format(
                tower_encoder_outputs[i].shape))
            log('  decoder out:              {}'.format(
                self.tower_decoder_output[i].shape))
            log('  residual out:             {}'.format(
                tower_residual[i].shape))
            log('  projected residual out:   {}'.format(
                tower_projected_residual[i].shape))
            log('  mel out:                  {}'.format(
                self.tower_mel_outputs[i].shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    self.tower_linear_outputs[i].shape))
            log('  <stop_token> out:         {}'.format(
                self.tower_stop_token_prediction[i].shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
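split_func, passed to tf.py_func above, is defined outside this excerpt. A sketch of the assumed behavior: cut each tensor along the time axis into one chunk per GPU, with the chunk widths taken from the corresponding split_infos column:

def split_func(x, split_pos):
    # x: numpy array [batch, time(, channels)]; split_pos: per-GPU chunk widths.
    chunks, start = [], 0
    for size in split_pos:
        chunks.append(x[:, start:start + size])
        start += size
    return chunks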
Example no. 14
    def load(self, checkpoint_path, hparams, gta=False):
        log('Constructing model: Tacotron')
        #Force the batch size to be known in order to use attention masking in batch synthesis
        inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
        input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')  # (None,) declares rank 1; bare (None) would leave the shape fully unknown
        targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels),
                                 name='mel_targets')
        split_infos = tf.placeholder(tf.int32,
                                     shape=(hparams.tacotron_num_gpus, None),
                                     name='split_infos')
        with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
            self.model = Tacotron(hparams)
            symbols_count = self._symbol_converter.get_symbols_count()
            if gta:
                self.model.initialize(inputs,
                                      input_lengths,
                                      symbols_count,
                                      targets,
                                      gta=gta,
                                      split_infos=split_infos)
            else:
                self.model.initialize(inputs,
                                      input_lengths,
                                      symbols_count,
                                      split_infos=split_infos)

            self.mel_outputs = self.model.tower_mel_outputs
            self.linear_outputs = self.model.tower_linear_outputs if (
                hparams.predict_linear and not gta) else None
            self.alignments = self.model.tower_alignments
            self.stop_token_prediction = self.model.tower_stop_token_prediction
            self.charsets = targets

        if hparams.GL_on_GPU:
            self.GLGPU_mel_inputs = tf.placeholder(tf.float32,
                                                   (None, hparams.num_mels),
                                                   name='GLGPU_mel_inputs')
            self.GLGPU_lin_inputs = tf.placeholder(tf.float32,
                                                   (None, hparams.num_freq),
                                                   name='GLGPU_lin_inputs')

            self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
                self.GLGPU_mel_inputs, hparams)
            self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
                self.GLGPU_lin_inputs, hparams)

        self.gta = gta
        self._hparams = hparams
        #pad input sequences with the <pad_token> 0 ( _ )
        self._pad = 0
        #explicitly setting the padding to a value that doesn't originally exist in the spectrogram
        #to avoid any possible conflicts, without affecting the output range of the model too much
        if hparams.symmetric_mels:
            self._target_pad = -hparams.max_abs_value
        else:
            self._target_pad = 0.

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.targets = targets
        self.split_infos = split_infos

        log('Loading checkpoint: %s' % checkpoint_path)
        #Memory allocation on the GPUs as needed
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        self.session = tf.Session(config=config)
        self.session.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)
Example no. 15
def run(testrun: bool = False):
    import argparse
    parser = argparse.ArgumentParser()

    train_steps = 5000
    checkpoint_interval = 100

    if testrun:
        train_steps = 20
        checkpoint_interval = 1

    parser.add_argument('--caching_dir',
                        default='/datasets/models/tacotron/cache')
    parser.add_argument(
        '--GTA',
        default='True',
        help=
        'Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode'
    )
    parser.add_argument('--tf_log_level',
                        type=int,
                        default=1,
                        help='Tensorflow C++ log level.')
    parser.add_argument(
        '--hparams',
        default='',
        help=
        'Hyperparameter overrides as a comma-separated list of name=value pairs'
    )
    # Caution: argparse's type=bool treats any non-empty string (including
    # 'False') as True; see the str2bool sketch after this function.
    parser.add_argument('--restore',
                        type=bool,
                        default=False,
                        help='Set this to False to do a fresh training')
    parser.add_argument('--checkpoint_interval',
                        type=int,
                        default=checkpoint_interval,
                        help='Steps between writing checkpoints')  # 2500
    parser.add_argument('--eval_interval',
                        type=int,
                        default=100000,
                        help='Steps between eval on test data')
    parser.add_argument('--summary_interval',
                        type=int,
                        default=10000,
                        help='Steps between running summary ops')
    parser.add_argument(
        '--embedding_interval',
        type=int,
        default=10000,
        help='Steps between updating embeddings projection visualization')
    parser.add_argument('--wavenet_train_steps',
                        type=int,
                        default=train_steps,
                        help='total number of wavenet training steps')

    args = parser.parse_args()
    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
    log_dir = get_log_dir(args.caching_dir)
    os.makedirs(log_dir, exist_ok=True)
    infolog_path = get_infolog_path(log_dir)
    infolog.init(infolog_path, 'tacotron')

    log('\n##########################################################\n')
    log('Wavenet Train\n')
    log('##########################################################\n')

    train(log_dir, args, modified_hp)
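As the comment on --restore above notes, argparse's type=bool does not parse boolean strings: bool('False') is True. A strict parser sketch; str2bool is a hypothetical helper, not part of the original code:

import argparse

def str2bool(value):
    # Compare against the two accepted strings instead of relying on bool().
    if value in ('True', 'False'):
        return value == 'True'
    raise argparse.ArgumentTypeError(
        "expected 'True' or 'False', got {!r}".format(value))

# Usage sketch: parser.add_argument('--restore', type=str2bool, default=False)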
Example no. 16
def run_synthesis(args, checkpoint, caching_dir, hparams):
    output_dir = get_output_dir(caching_dir)

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except Exception as e:
        raise RuntimeError(
            'Failed to load checkpoint at {}'.format(checkpoint)) from e

    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')

    #We assume the user provides the correct folder for the chosen training method
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    #if args.model == 'Tacotron-2':
    #If running all Tacotron-2, synthesize audio from evaluated mels
    evals_dir = get_evals_dir(args.caching_dir)
    metadata_filename = os.path.join(evals_dir, 'map.txt')

    with open(metadata_filename, encoding='utf-8') as f:
        metadata = np.array([line.strip().split('|') for line in f])

    speaker_ids = metadata[:, 2]
    mel_files = metadata[:, 1]
    texts = metadata[:, 0]

    speaker_ids = None if (speaker_ids == '<no_g>').all() else speaker_ids
    # else:
    #   #else Get all npy files in input_dir (supposing they are mels)
    #   mel_files  = sorted([os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy'])
    #   speaker_ids = None if args.speaker_id is None else args.speaker_id.replace(' ', '').split(',')
    #   if speaker_ids is not None:
    #     assert len(speaker_ids) == len(mel_files)

    #  texts = None

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    mel_files = [
        mel_files[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)
    ]
    speaker_ids = None if speaker_ids is None else [
        speaker_ids[i:i + hparams.wavenet_synthesis_batch_size] for i in range(
            0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)
    ]
    texts = None if texts is None else [
        texts[i:i + hparams.wavenet_synthesis_batch_size]
        for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)
    ]

    with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
        for i, mel_batch in enumerate(tqdm(mel_files)):
            mel_spectros = [np.load(mel_file) for mel_file in mel_batch]

            basenames = [
                os.path.basename(mel_file).replace('.npy', '')
                for mel_file in mel_batch
            ]
            speaker_id_batch = None if speaker_ids is None else speaker_ids[i]
            audio_files = synth.synthesize(mel_spectros, speaker_id_batch,
                                           basenames, wav_dir, log_dir)

            speaker_logs = ['<no_g>'] * len(
                mel_batch) if speaker_id_batch is None else speaker_id_batch

            for j, mel_file in enumerate(mel_batch):
                if texts is None:
                    file.write('{}|{}|{}\n'.format(mel_file, audio_files[j],
                                                   speaker_logs[j]))
                else:
                    file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file,
                                                      audio_files[j],
                                                      speaker_logs[j]))

    log('synthesized audio waveforms at {}'.format(wav_dir))
Example no. 17
def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer,
              hparams, model_name):
    '''Evaluate the model during training.
    Assumes that model variables are averaged.
    '''
    start_time = time.time()
    y_hat, y_target, loss, input_mel, upsampled_features = sess.run([
        model.tower_y_hat[0], model.tower_y_target[0], model.eval_loss,
        model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0]
    ])
    duration = time.time() - start_time
    log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'
        .format(len(y_target), duration,
                len(y_target) / duration))

    #Make audio and plot paths
    pred_wav_path = os.path.join(wav_dir,
                                 'step-{}-pred.wav'.format(global_step))
    target_wav_path = os.path.join(wav_dir,
                                   'step-{}-real.wav'.format(global_step))
    plot_path = os.path.join(plot_dir,
                             'step-{}-waveplot.png'.format(global_step))
    mel_path = os.path.join(
        plot_dir,
        'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
    upsampled_path = os.path.join(
        plot_dir, 'step-{}-upsampled-features.png'.format(global_step))

    #Save figure
    util.waveplot(plot_path,
                  y_hat,
                  y_target,
                  model._hparams,
                  title='{}, {}, step={}, loss={:.5f}'.format(
                      model_name, time_string(), global_step, loss))
    log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))

    #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
    #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
    T2_output_range = (-hparams.max_abs_value,
                       hparams.max_abs_value) if hparams.symmetric_mels else (
                           0, hparams.max_abs_value)
    generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
    util.plot_spectrogram(
        generated_mel,
        mel_path,
        title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'
        .format(global_step, loss),
        target_spectrogram=input_mel.T)
    util.plot_spectrogram(
        upsampled_features.T,
        upsampled_path,
        title='Upsampled Local Condition features, step={}, loss={:.5f}'.
        format(global_step, loss),
        auto_aspect=True)

    #Save Audio
    save_wavenet_wav(y_hat,
                     pred_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)
    save_wavenet_wav(y_target,
                     target_wav_path,
                     sr=hparams.sample_rate,
                     inv_preemphasize=hparams.preemphasize,
                     k=hparams.preemphasis)

    #Write eval summary to tensorboard
    log('Writing eval summary!')
    add_test_stats(summary_writer, global_step, loss, hparams=hparams)
Example no. 18
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--caching_dir',
                        default='/datasets/models/tacotron/cache')
    parser.add_argument('--checkpoint',
                        default='pretrained/',
                        help='Path to model checkpoint')
    parser.add_argument(
        '--hparams',
        default='',
        help=
        'Hyperparameter overrides as a comma-separated list of name=value pairs'
    )
    #parser.add_argument('--name', help='Name of logging directory if the two models were trained together.')
    #parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately')
    #parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. If trained separately')
    #parser.add_argument('--model', default='Tacotron-2')
    #parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets')
    #parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet')
    #parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
    accepted_modes = ['eval', 'synthesis', 'live']
    parser.add_argument(
        '--mode',
        default='eval',
        help='mode of run: can be one of {}'.format(accepted_modes))
    parser.add_argument(
        '--GTA',
        default='True',
        help=
        'Ground truth aligned synthesis, defaults to True, only considered in synthesis mode'
    )
    parser.add_argument(
        '--text_list',
        default='',
        help=
        'Text file containing the list of texts to be synthesized. Only valid if mode=eval'
    )
    parser.add_argument(
        '--speaker_id',
        default=None,
        help=
        'Speaker ids to use when running a standalone WaveNet on a folder of mels. Must be a comma-separated list of ids'
    )
    args = parser.parse_args()

    if args.mode not in accepted_modes:
        raise ValueError('accepted modes are: {}, found {}'.format(
            accepted_modes, args.mode))

    if args.GTA not in ('True', 'False'):
        raise ValueError('GTA option must be either True or False')

    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    from src.tac.training.tacotron_training import get_log_dir
    from src.tac.training.tacotron_training import get_save_dir
    taco_log_dir = get_log_dir(args.caching_dir)
    tacotron_pretrained = get_save_dir(taco_log_dir)

    #run_name = args.name or args.tacotron_name or args.model
    #taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint)

    sentences = get_sentences(args)
    log('Synthesizing mel-spectrograms from text..')
    run_eval(args, tacotron_pretrained, modified_hp, sentences)
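The --hparams flag above is a comma-separated name=value list that hparams.parse applies on top of the defaults near the top of run(). For example (the override values here are illustrative):

# Override two hyperparameters without touching the defaults file.
modified_hp = hparams.parse('tacotron_synthesis_batch_size=4,tacotron_num_gpus=1')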
Example no. 19
def train(log_dir, args, hparams):
    save_dir = get_save_dir(log_dir)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')

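    # GTA = ground-truth-aligned: mels generated by Tacotron under teacher
    # forcing, so they align frame-for-frame with the target audio.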
    gta = args.GTA == 'True'
    synth_dir = get_synth_dir(args.caching_dir, gta)
    gta_map_file = get_gta_map_file(synth_dir)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(gta_map_file))
    log('Using model: {}'.format('WaveNet'))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.wavenet_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, gta_map_file, hparams)

    #Set up model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Speaker Embeddings metadata
    if hparams.speakers_path is not None:
        speaker_embedding_meta = hparams.speakers_path

    else:
        speaker_embedding_meta = os.path.join(meta_folder,
                                              'SpeakerEmbeddings.tsv')
        if not os.path.isfile(speaker_embedding_meta):
            with open(speaker_embedding_meta, 'w', encoding='utf-8') as f:
                for speaker in hparams.speakers:
                    f.write('{}\n'.format(speaker))

        speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..')

    #Bookkeeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
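    # create_shadow_saver presumably checkpoints exponential-moving-average
    # ("shadow") copies of the weights, which WaveNet typically evaluates with.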
    sh_saver = create_shadow_saver(model, global_step)

    log('Wavenet training set to a maximum of {} steps'.format(
        args.wavenet_train_steps))

    #Let GPU memory grow as needed instead of pre-allocating it all
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    run_init = False

    #Train
    sess = tf.Session(config=config)

    summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
    sess.run(tf.global_variables_initializer())

    #saved model restoring
    if args.restore:
        # Restore saved model if the user requested it, default = True
        try:
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path),
                    slack=True)
                load_averaged_model(sess, sh_saver,
                                    checkpoint_state.model_checkpoint_path)
            else:
                log('No model to load at {}'.format(save_dir), slack=True)
                if hparams.wavenet_weight_normalization:
                    run_init = True

        except tf.errors.OutOfRangeError as e:
            log('Cannot restore checkpoint: {}'.format(e), slack=True)
    else:
        log('Starting new training!', slack=True)
        if hparams.wavenet_weight_normalization:
            run_init = True

    if run_init:
        log('\nWeight normalization enabled for a fresh training run. '
            'Running a data-dependent initialization forward pass..')
        #Create init_model
        init_model, _ = model_train_mode(args,
                                         feeder,
                                         hparams,
                                         global_step,
                                         init=True)

    #Start the feeder threads
    feeder.start_threads(sess)

    if run_init:
        #Run one forward pass for model parameters initialization (make prediction on init_batch)
        _ = sess.run(init_model.tower_y_hat)
        log('Data dependent initialization done. Starting training!')

    #Training loop
    while not coord.should_stop() and step < args.wavenet_train_steps:
        start_time = time.time()
        step, loss, opt = sess.run([global_step, model.loss, model.optimize])
        time_window.append(time.time() - start_time)
        loss_window.append(loss)

        message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
            step, time_window.average, loss, loss_window.average)
        log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

        if np.isnan(loss) or loss > 100:
            log('Loss exploded to {:.5f} at step {}'.format(loss, step))
            raise Exception('Loss exploded')

        if step % args.summary_interval == 0:
            log('\nWriting summary at step {}'.format(step))
            summary_writer.add_summary(sess.run(stats), step)

        if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
            save_log(sess,
                     step,
                     model,
                     plot_dir,
                     wav_dir,
                     hparams=hparams,
                     model_name='WaveNet')
            save_checkpoint(sess, sh_saver, checkpoint_path, global_step)

        if step % args.eval_interval == 0:
            log('\nEvaluating at step {}'.format(step))
            eval_step(sess,
                      step,
                      eval_model,
                      eval_plot_dir,
                      eval_wav_dir,
                      summary_writer=summary_writer,
                      hparams=model._hparams,
                      model_name='WaveNet')

        if hparams.gin_channels > 0 and (step % args.embedding_interval == 0
                                         or step == args.wavenet_train_steps
                                         or step == 1):
            #Get current checkpoint state
            checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            #Update Projector
            log('\nSaving Model Speaker Embeddings visualization..')
            add_embedding_stats(summary_writer, [model.embedding_table.name],
                                [speaker_embedding_meta],
                                checkpoint_state.model_checkpoint_path)
            log('WaveNet Speaker embeddings have been updated on tensorboard!')

    log('Wavenet training complete after {} global steps'.format(
        args.wavenet_train_steps),
        slack=True)
    coord.request_stop()
    coord.wait_for_stop()

    try:
        sess.close()
        tf.reset_default_graph()
    except Exception as e:
        # Catch Exception rather than using a bare except, which would also
        # swallow KeyboardInterrupt and SystemExit.
        log('Error while closing the session: {}'.format(e))

    sleep(0.5)
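The ValueWindow instances used in the training loop above are simple running-average trackers. Below is a minimal sketch consistent with how they are used here, i.e. append() plus an average property over the last N values (an illustration, not necessarily the repo's exact class):

class ValueWindow:
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Keep only the most recent window_size values.
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)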