    def __init__(self, coordinator, metadata_filename, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._offset = 0

        # Load metadata:
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
            hours = sum(int(x[2]) for x in self._metadata) * hparams.frame_shift_ms / (3600 * 1000)
            log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets')
        ]

        # Create queue for buffering data:
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32],
                             name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.linear_targets.set_shape(self._placeholders[3].shape)

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                    % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None
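For context, this feeder runs as a background thread; a minimal usage sketch (assuming the start_in_session helper and training-loop conventions of the usual Tacotron train.py, which are not shown in this example) looks like:

# Hedged usage sketch: how the DataFeeder above is typically wired into a
# session. start_in_session and the loop shape are assumptions based on the
# common Tacotron training script, not part of this snippet.
import tensorflow as tf

coord = tf.train.Coordinator()
with tf.variable_scope('datafeeder'):
    feeder = DataFeeder(coord, metadata_filename, hparams)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feeder.start_in_session(sess)  # spawns the enqueue thread
    while not coord.should_stop():
        inputs, lengths = sess.run([feeder.inputs, feeder.input_lengths])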
Example 2
  def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename, encoding='utf-8') as f:
      self._metadata = [line.strip().split('|') for line in f]
      hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
      log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
      tf.placeholder(tf.int32, [None, None], 'inputs'),
      tf.placeholder(tf.int32, [None], 'input_lengths'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)
Example 3
    def __init__(self, coordinator, metadata_filename, hparams):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._offset = 0
        self._datadir = os.path.dirname(metadata_filename)  # directory containing train.txt

        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
            hours = sum(int(x[2]) for x in self._metadata) * hparams.frame_shift_ms / (3600 * 1000)
            log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets')
        ]

        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32],
                             name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()  # first-in, first-out
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.mel_targets.set_shape(self._placeholders[2].shape)
        self.linear_targets.set_shape(self._placeholders[3].shape)
Example 4
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self._hparams.batch_size
        r = self._hparams.outputs_per_step
        examples = [
            self._get_next_example() for i in range(n * _batches_per_group)
        ]

        if self.cache_targets and self._num_cached != len(
                self._cached_mel_targets):
            self._num_cached = len(self._cached_mel_targets)
            log('Cached %d targets' % self._num_cached)

        # Bucket examples based on similar output sequence length for efficiency:
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        random.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' %
            (len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
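_prepare_batch is referenced above but defined elsewhere; a minimal sketch of what it must do, assuming each example is an (input_ids, mel_target, linear_target, length) tuple and targets are right-padded to a multiple of outputs_per_step:

# Hedged sketch of the _prepare_batch helper; the padding scheme and tuple
# layout are assumptions based on the surrounding examples.
import numpy as np

def _round_up(length, multiple):
    remainder = length % multiple
    return length if remainder == 0 else length + multiple - remainder

def _prepare_batch(batch, outputs_per_step):
    inputs = [np.asarray(x[0]) for x in batch]
    input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)
    max_in = max(input_lengths)
    padded_inputs = np.stack([np.pad(x, (0, max_in - len(x))) for x in inputs])

    mels = [x[1] for x in batch]
    linears = [x[2] for x in batch]
    max_out = _round_up(max(m.shape[0] for m in mels), outputs_per_step)

    def _pad2d(t):
        return np.pad(t, ((0, max_out - t.shape[0]), (0, 0)))

    return (padded_inputs, input_lengths,
            np.stack([_pad2d(m) for m in mels]),
            np.stack([_pad2d(l) for l in linears]))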
Example 5
def wavenet_synthesize(args, hparams, checkpoint):
    output_dir = 'wavenet_' + args.output_dir

    try:
        checkpoint_path = tf.train.get_checkpoint_state(
            checkpoint).model_checkpoint_path
        log('loaded model at {}'.format(checkpoint_path))
    except AttributeError:
        #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa)
        if 'Both' in checkpoint:
            checkpoint = checkpoint.replace('Both', 'Tacotron-2')
        elif 'Tacotron-2' in checkpoint:
            checkpoint = checkpoint.replace('Tacotron-2', 'Both')
        else:  #Synthesizing separately
            raise AssertionError(
                'Cannot restore checkpoint: {}, did you train a model?'.format(
                    checkpoint))

        try:
            #Try loading again
            checkpoint_path = tf.train.get_checkpoint_state(
                checkpoint).model_checkpoint_path
            log('loaded model at {}'.format(checkpoint_path))
        except AttributeError:
            raise RuntimeError(
                'Failed to load checkpoint at {}'.format(checkpoint))

    run_synthesis(args, checkpoint_path, output_dir, hparams)
Example 6
    def initialize(self,
                   input_lengths,
                   linear_targets,
                   ppgs=None,
                   mel_targets=None,
                   speakers=None):

        with tf.variable_scope('inference') as scope:
            is_training = ppgs is not None
            hp = self._hparams

            # Pre-net: [batch, time, feat]
            # encoder_steps = tf.gather(input_lengths, tf.argmax(input_lengths))
            prenet_outputs = prenet(linear_targets, is_training,
                                    hp.prenet_depths)
            post_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                        is_training, hp.encoder_depth,
                                        'cbhg_ppgs')  # [80->128]
            logits = tf.layers.dense(post_outputs,
                                     hp.num_ppgs,
                                     name='pred_ppgs')  # [128->80]
            pred_ppgs = tf.nn.softmax(logits, name='ppgs')

            self.speakers = speakers
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.input_lengths = input_lengths
            self.ppgs = ppgs

            self.logits = logits
            self.pred_ppgs = pred_ppgs

            log('Initialized NNet1 model. Dimensions: ')
            log('  pred_ppgs:               {}'.format(pred_ppgs.shape))
Example 7
    def __init__(self, metadata_filename, hparams):
        self._hparams = hparams

        # Load metadata:
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename, encoding='utf-8-sig') as f:
            self._metadata = [line.strip().split('|') for line in f]
            hours = sum((int(x[2]) for x in self._metadata)) * \
                hparams.frame_shift_ms / (3600 * 1000)
            log('Loaded metadata for %d examples (%.2f hours)' %
                (len(self._metadata), hours))

        # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # synthesis (useful for proper nouns, etc.)
        if hparams.use_cmudict:
            cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
            if not os.path.isfile(cmudict_path):
                raise Exception(
                    'If use_cmudict=True, you must download ' +
                    'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                    % cmudict_path)
            self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
            log('Loaded CMUDict with %d unambiguous entries' %
                len(self._cmudict))
        else:
            self._cmudict = None
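The hours figure is total target frames times the frame shift, converted from milliseconds. With illustrative numbers (not from any real dataset):

# Illustrative arithmetic only: e.g. 13,100 clips totalling 5,291,520 frames
# at a 12.5 ms frame shift.
total_frames = 5291520
frame_shift_ms = 12.5
hours = total_frames * frame_shift_ms / (3600 * 1000)  # ~18.37 hours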
Example 8
def run_eval(args, checkpoint_path, output_dir, sentences, reference_mel):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    assert os.path.normpath(eval_dir) == os.path.normpath(
        args.mels_dir)  #mels_dir = wavenet_input_dir

    #Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, reference_mel=reference_mel)
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, text in enumerate(tqdm(sentences)):
            start = time.time()
            mel_filename = synth.synthesize(text,
                                            i + 1,
                                            eval_dir,
                                            log_dir,
                                            None,
                                            reference_mel=reference_mel)

            file.write('{}|{}\n'.format(text, mel_filename))
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
Example 9
    def update(self, hparams):
        with tf.variable_scope('inference') as scope:
            self._hparams = hparams
            hp = self._hparams
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(self.output_cell, self.helper,
                              self.decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [self.batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs,
                                     hp.num_mels,
                                     is_training=False,
                                     is_updating=True)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq,
                                             reuse=True)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            log('Updated Tacotron model.')
Example 10
    def load(self, checkpoint_path, hparams, model_name='WaveNet'):
        log('Constructing model: {}'.format(model_name))
        self._hparams = hparams
        local_cond, global_cond = self._check_conditions()

        self.local_conditions = tf.placeholder(
            tf.float32,
            shape=[1, None, hparams.num_mels],
            name='local_condition_features') if local_cond else None
        self.global_conditions = tf.placeholder(
            tf.int32, shape=(),
            name='global_condition_features') if global_cond else None
        self.synthesis_length = tf.placeholder(
            tf.int32, shape=(),
            name='synthesis_length') if not local_cond else None

        with tf.variable_scope('model') as scope:
            self.model = create_model(model_name, hparams)
            self.model.initialize(y=None,
                                  c=self.local_conditions,
                                  g=self.global_conditions,
                                  input_lengths=None,
                                  synthesis_length=self.synthesis_length)

            self._hparams = hparams
            sh_saver = create_shadow_saver(self.model)

            log('Loading checkpoint: {}'.format(checkpoint_path))
            self.session = tf.Session()
            self.session.run(tf.global_variables_initializer())
            load_averaged_model(self.session, sh_saver, checkpoint_path)
Example 11
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self._hparams.batch_size
        r = self._hparams.reduction_factor

        if self.static_batches is not None:
            batches = self.static_batches
        else:
            examples = []
            for data_dir in self._datadir:
                if self._hparams.initial_data_greedy:
                    if self._step < self._hparams.initial_phase_step and \
                            any("krbook" in data_dir for data_dir in self._datadir):
                        data_dir = [
                            data_dir for data_dir in self._datadir
                            if "krbook" in data_dir
                        ][0]

                if self._step < self._hparams.initial_phase_step:
                    example = [self._get_next_example(data_dir) \
                            for _ in range(int(n * self._batches_per_group // len(self._datadir)))]
                else:
                    example = [self._get_next_example(data_dir) \
                            for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))]
                examples.extend(example)
            examples.sort(key=lambda x: x[-1])

            batches = [examples[i:i + n] for i in range(0, len(examples), n)]
            self.rng.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' %
            (len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(
                zip(self._placeholders,
                    _prepare_batch(batch, r, self.rng, self.data_type)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
            self._step += 1

Example 12
	def make_test_batches(self):
		start = time.time()

		#Read one example for evaluation
		n = 1

		#Test on entire test set (one sample at an evaluation step)
		examples = [self._get_test_groups() for i in range(len(self._test_meta))]
		batches = [examples[i: i+n] for i in range(0, len(examples), n)]
		np.random.shuffle(batches)

		log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
		return batches
Example 13
def synthesize(args, hparams, gst_checkpoint, wave_checkpoint, sentences, reference_mel):
	log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name))
	log('Synthesizing mel-spectrograms from text..')
	wavenet_in_dir = gst_synthesize(args, gst_checkpoint, sentences, reference_mel)
	log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
	wavenet_synthesize(args, hparams, wave_checkpoint)
	log('Tacotron-2 TTS synthesis complete!')
Example 14
def run_synthesis(args, checkpoint_path, output_dir):

    _p_cmudict = 0.5
    GTA = (args.GTA == 'True')
    if GTA:
        synth_dir = os.path.join(output_dir, 'gta')

        #Create output path if it doesn't exist
        os.makedirs(synth_dir, exist_ok=True)
    else:
        synth_dir = os.path.join(output_dir, 'natural')

        #Create output path if it doesn't exist
        os.makedirs(synth_dir, exist_ok=True)
    metadata_filename = os.path.join(args.input_dir, 'train.txt')

    if hparams.use_cmudict:
        cmudict_path = os.path.join(os.path.dirname(metadata_filename),
                                    'cmudict-0.7b')
        if not os.path.isfile(cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download the CMU dictionary first. '
                'Run:\n wget -P %s http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b'
                % os.path.dirname(metadata_filename))
        _cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        log('Loaded CMUDict with %d unambiguous entries' % len(_cmudict))
    else:
        _cmudict = None

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, gta=GTA)
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

        log('starting synthesis')
        mel_dir = os.path.join(args.input_dir, 'mels')
        wav_dir = os.path.join(args.input_dir, 'linear')

        with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
            for i, meta in enumerate(tqdm(metadata)):

                _punctuation_re = re.compile(r'([\.,"\-_:]+)')
                text = re.sub(_punctuation_re, r' \1 ', meta[3])
                if _cmudict and random.random() < _p_cmudict:
                    text = ' '.join([
                        maybe_get_arpabet(_cmudict, word)
                        for word in text.split(' ')
                    ])
                mel_filename = os.path.join(mel_dir, meta[1])
                wav_filename = os.path.join(wav_dir, meta[0])
                mel_output_filename = synth.synthesize(text, i + 1, synth_dir,
                                                       None, mel_filename)
                file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename,
                                                  mel_output_filename, text))
    log('synthesized mel spectrograms at {}'.format(synth_dir))
    return os.path.join(synth_dir, 'map.txt')
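maybe_get_arpabet is used above but defined elsewhere; a hedged sketch, assuming the common convention of wrapping ARPAbet pronunciations in curly braces for the text frontend:

# Hedged sketch of maybe_get_arpabet. The curly-brace convention matches the
# usual Tacotron text frontend; the exact helper in this repo may differ.
def maybe_get_arpabet(cmudict, word):
    pron = cmudict.lookup(word)
    return '{%s}' % pron[0] if pron is not None else word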
Example 15
    def _enqueue_next_group(self):
        start = time.time()
        n = self._hparams.batch_size
        r = self._hparams.outputs_per_step
        examples = [
            self._get_next_example() for i in range(n * _batches_per_group)
        ]
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        random.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' %
            (len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example 16
  def _enqueue_next_group(self):
    start = time.time()

    # Read a group of examples:
    n = self._hparams.batch_size
    r = self._hparams.outputs_per_step
    examples = [self._get_next_example() for i in range(n * _batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency:
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i+n] for i in range(0, len(examples), n)]
    random.shuffle(batches)

    log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
    for batch in batches:
      feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
      self._session.run(self._enqueue_op, feed_dict=feed_dict)
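The sort-chunk-shuffle pattern above keeps each batch's sequence lengths similar (less padding) while still randomizing batch order; a self-contained toy illustration:

# Toy illustration of the bucketing above (made-up data).
import random

examples = [('a', 3), ('b', 10), ('c', 4), ('d', 9)]  # (payload, length)
n = 2
examples.sort(key=lambda x: x[-1])  # neighbors now have similar lengths
batches = [examples[i:i + n] for i in range(0, len(examples), n)]
random.shuffle(batches)  # batch order stays random across the group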
Example 17
def gst_synthesize(args, checkpoint, sentences=None, reference_mel=None):
    output_dir = "gst_" + args.output_dir
    checkpoint_path = tf.train.get_checkpoint_state(
        checkpoint).model_checkpoint_path

    log('loaded model at {}'.format(checkpoint_path))

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    hparams.parse(args.hparams)
    if args.mode == 'eval':
        return run_eval(args, checkpoint_path, output_dir, sentences,
                        reference_mel)
    elif args.mode == 'synthesis':
        return run_synthesis(args, checkpoint_path, output_dir)
    else:
        run_live(args, checkpoint_path)
Example 18
	def _enqueue_next_train_group(self):
		while not self._coord.should_stop():
			start = time.time()

			# Read a group of examples
			n = self._hparams.wavenet_batch_size
			examples = [self._get_next_example() for i in range(n * _batches_per_group)]

			# Bucket examples based on similar output length for efficiency
			examples.sort(key=lambda x: x[-1])
			batches = [examples[i: i+n] for i in range(0, len(examples), n)]
			np.random.shuffle(batches)

			log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
			for batch in batches:
				feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch)))
				self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example 19
  def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    self._offset = 0

    # Load metadata:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename, encoding='utf-8') as f:
      self._metadata = [line.strip().split('|') for line in f]
      hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
      log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
      tf.placeholder(tf.int32, [None, None], 'inputs'),
      tf.placeholder(tf.int32, [None], 'input_lengths'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
      cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
      if not os.path.isfile(cmudict_path):
        raise Exception('If use_cmudict=True, you must download ' +
          'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'  % cmudict_path)
      self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
      log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
      self._cmudict = None
Example 20
  def _enqueue_next_group(self):
    start = time.time()

    # Read a group of examples:
    n = self._hparams.batch_size
    r = self._hparams.outputs_per_step
    examples = [self._get_next_example() for i in range(n * _batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency:
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i+n] for i in range(0, len(examples), n)]
    # batches: one group of _batches_per_group lists, each holding n examples
    random.shuffle(batches)

    log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
    for batch in batches:
      feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
      self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example 21
    def make_test_batches(self):
        start = time.time()

        # Read a group of examples
        n = self._hparams.batch_size
        r = self._hparams.outputs_per_step

        #Test on entire test set
        examples = [
            self._get_test_groups() for i in range(len(self._test_meta))
        ]

        # Bucket examples based on similar output sequence length for efficiency
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        np.random.shuffle(batches)

        log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(
            len(batches), n,
            time.time() - start))
        return batches, r
Example 22
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self._hparams.batch_size
        r = self._hparams.outputs_per_step
        examples = [
            self._get_next_example() for i in range(n * _batches_per_group)
        ]

        # Bucket examples based on similar output sequence length for efficiency:
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        random.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' %
            (len(batches), n, time.time() - start))
        for batch in batches:
            # sample in batch> x[0]: input_fea, x[1]: lpc_target, x[2]: stop_token_target, x[3]: lpc_target_len.
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example 23
    def _enqueue_next_group(self):
        start = time.time()

        # Read a group of examples:
        n = self._hparams.batch_size
        r = self._hparams.outputs_per_step
        examples = []
        for data_path in self.data_paths:
            example = [
                self._get_next_example(data_path) for i in range(
                    int(n * _batches_per_group // len(self.data_paths)))
            ]
            examples.extend(example)

        # Bucket examples based on similar output sequence length for efficiency:
        examples.sort(key=lambda x: x[-1])
        batches = [examples[i:i + n] for i in range(0, len(examples), n)]
        random.shuffle(batches)

        log('Generated %d batches of size %d in %.03f sec' %
            (len(batches), n, time.time() - start))
        for batch in batches:
            feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)
Example 24
def get_path_dict(data_dirs,
                  hparams,
                  config,
                  data_type,
                  n_test=None,
                  rng=np.random.RandomState(123)):

    # Load metadata:
    path_dict = {}
    for data_dir in data_dirs:
        paths = glob("{}/*.npz".format(data_dir))

        if data_type == 'train':
            rng.shuffle(paths)

        if not config.skip_path_filter:
            items = parallel_run(get_frame,
                                 paths,
                                 desc="filter_by_min_max_frame_batch",
                                 parallel=True)

            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

            new_items = [(path, n) for path, n, n_tokens in items \
                    if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens]

            if any(check in data_dir for check in ["son", "yuinna"]):
                blacklists = [".0000.", ".0001.", "NB11479580.0001"]
                new_items = [item for item in new_items \
                        if all(check not in item[0] for check in blacklists)]

            new_paths = [path for path, n in new_items]
            new_n_frames = [n for path, n in new_items]

            hours = frames_to_hours(new_n_frames)

            log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'. \
                    format(data_dir, len(new_n_frames), hours))
            log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames)))
            log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames)))
        else:
            new_paths = paths

        if data_type == 'train':
            new_paths = new_paths[:-n_test]
        elif data_type == 'test':
            new_paths = new_paths[-n_test:]
        else:
            raise Exception(" [!] Unkown data_type: {}".format(data_type))

        path_dict[data_dir] = new_paths

    return path_dict
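frames_to_hours is referenced above but not shown; a minimal sketch, assuming the same frame-shift convention as the DataFeeder examples (the 12.5 ms default is an assumption):

# Hedged sketch of frames_to_hours; the frame_shift_ms default is assumed.
def frames_to_hours(n_frames, frame_shift_ms=12.5):
    return sum(n_frames) * frame_shift_ms / (3600 * 1000)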
Example 25
    def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):

        with tf.variable_scope('embedding') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, embed_depth=512]

        with tf.variable_scope('encoder') as scope:
            x = embedded_inputs
            for i in range(hp.encoder_stack_size):
                x = tf.layers.conv1d(x,
                                     filters=hp.encoder_conv_filter,
                                     kernel_size=hp.encoder_conv_kernel,
                                     padding='same',
                                     activation=tf.nn.relu)
                x = tf.layers.batch_normalization(x, training=is_training)

            lstm_fw = LSTMCell(hp.encoder_lstm_hidden_dim)
            lstm_bw = LSTMCell(hp.encoder_lstm_hidden_dim)

            encoder_conv_output = x
            outputs, states = tf.nn.bidirectional_dynamic_rnn(lstm_fw,
                                                              lstm_bw,
                                                              encoder_conv_output,
                                                              sequence_length=input_lengths,
                                                              dtype=tf.float32) # [N, T_in, 512]
            encoder_output = tf.concat(outputs, axis=2)

        # with tf.variable_scope('decoder') as scope:
        self.inputs = inputs
        self.input_lengths = input_lengths
        # self.mel_outputs = mel_outputs
        # self.linear_outputs = linear_outputs
        # self.alignments = alignments
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        log('Initialized Tacotron model. Dimensions: ')
        log('  embedding:               %d' % embedded_inputs.shape[-1])
        log('  encoder out:             %d' % encoder_output.shape[-1])
Example 26
def run_synthesis(args, checkpoint_path, output_dir, hparams):
    log_dir = os.path.join(output_dir, 'plots')
    wav_dir = os.path.join(output_dir, 'wavs')

    #We suppose user will provide correct folder depending on training method
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    metadata_filename = os.path.join(args.mels_dir, 'map.txt')
    with open(metadata_filename, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
        frame_shift_ms = hparams.hop_size / hparams.sample_rate
        hours = sum([int(x[-1]) for x in metadata]) * frame_shift_ms / (3600)
        log('Loaded metadata for {} examples ({:.2f} hours)'.format(
            len(metadata), hours))

    metadata = np.array(metadata)
    mel_files = metadata[:, 1]
    texts = metadata[:, 0]

    log('Starting synthesis! (this will take a while..)')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)

    with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
        for i, mel_file in enumerate(tqdm(mel_files)):
            mel_spectro = np.load(mel_file)
            audio_file = synth.synthesize(mel_spectro, None, i + 1, wav_dir,
                                          log_dir)

            if texts is None:
                file.write('{}|{}\n'.format(mel_file, audio_file))
            else:
                file.write('{}|{}|{}\n'.format(texts[i], mel_file, audio_file))

    log('synthesized audio waveforms at {}'.format(wav_dir))
Example 27
def run_live(args, checkpoint_path):
    #Log to Terminal without keeping any records in files
    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams)

    #Generate fast greeting message
    greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
    log(greetings)
    generate_fast(synth, greetings)

    #Interaction loop
    while True:
        try:
            text = input()
            generate_fast(synth, text)

        except KeyboardInterrupt:
            leave = 'Thank you for testing our features. see you soon.'
            log(leave)
            generate_fast(synth, leave)
            sleep(2)
            break
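generate_fast is called above but defined elsewhere; a hedged sketch, assuming it is a thin wrapper that synthesizes without writing any files:

# Hedged sketch of generate_fast: synthesize with all output paths disabled
# so nothing is written to disk. The positional None arguments mirror the
# Synthesizer.synthesize signature used elsewhere in these examples.
def generate_fast(model, text):
    model.synthesize(text, None, None, None, None)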
Example 28
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            # embedding_table = tf.get_variable(
            #   'embedding', [len(symbols), 256], dtype=tf.float32,
            #   initializer=tf.truncated_normal_initializer(stddev=0.5))
            # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]
            # embedded_inputs = inputs

            # Encoder
            # n_fft = (self._hparams.num_src_freq - 1) * 2
            # in_layer_size = n_fft
            in_layer_size = self._hparams.num_src_freq
            prenet_outputs = prenet(inputs,
                                    is_training,
                                    layer_sizes=[in_layer_size,
                                                 128])  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  input:                   %d' % inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example 29
  def initialize(self, inputs, input_lengths, inputs_jp=None, mel_targets=None, linear_targets=None ):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      is_teacher_force_generating = mel_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      # embedding_table = tf.get_variable(
      #   'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
      #   initializer=tf.truncated_normal_initializer(stddev=0.5))
      # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]
      
      if hp.use_gst:
        #Global style tokens (GST)
        gst_tokens = tf.get_variable(
          'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32,
          initializer=tf.truncated_normal_initializer(stddev=0.5))
        self.gst_tokens = gst_tokens
 
      # Encoder
      # prenet_outputs = prenet(embedded_inputs, is_training)
      prenet_outputs = prenet(inputs, is_training)
      # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]
      


      if inputs_jp  is not None:
        # Reference encoder
        refnet_outputs = reference_encoder(
          inputs_jp,
          filters=hp.reference_filters, 
          kernel_size=(3,3),
          strides=(2,2),
          encoder_cell=GRUCell(hp.reference_depth),
          is_training=is_training)                                                 # [N, 128]
        self.refnet_outputs = refnet_outputs                                       

        if hp.use_gst:
          # Style attention
          style_attention = MultiheadAttention(
            tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
            tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

          style_embeddings = style_attention.multi_head_attention()                   # [N, 1, 256]
        else:
          style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
      else:
        print("Use random weight for GST.")
        random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
        random_weights = tf.nn.softmax(random_weights, name="random_weights")
        style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
        style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

      # Add style embedding to every text encoder state
      style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128]
      encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

      # Attention
      attention_cell = AttentionWrapper(
        GRUCell(hp.attention_depth),
        BahdanauAttention(hp.attention_depth, encoder_outputs, memory_sequence_length=input_lengths),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
          OutputProjectionWrapper(concat_cell, hp.rnn_depth),
          ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
          ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)                                                  # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training or is_teacher_force_generating:
        helper = TacoTrainingHelper(inputs, mel_targets, hp)
      else:
        helper = TacoTestHelper(batch_size, hp)

      (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hp.max_iters)                                        # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)           # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)               # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.encoder_outputs = encoder_outputs
      self.style_embeddings = style_embeddings
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      self.inputs_jp = inputs_jp
      log('Initialized Tacotron model. Dimensions: ')
      log('  style embedding:         %d' % style_embeddings.shape[-1])
      log('  prenet out:              %d' % prenet_outputs.shape[-1])
      log('  encoder out:             %d' % encoder_outputs.shape[-1])
      log('  attention out:           %d' % attention_cell.output_size)
      log('  concat attn & out:       %d' % concat_cell.output_size)
      log('  decoder cell out:        %d' % decoder_cell.output_size)
      log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      log('  postnet out:             %d' % post_outputs.shape[-1])
      log('  linear out:              %d' % linear_outputs.shape[-1])
Example 30
    def __init__(self, coordinator, metadata_filename, hparams, config,
                 batches_per_group, data_type, batch_size):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        self._step = 0
        self._offset = defaultdict(lambda: 2)
        self._batches_per_group = batches_per_group

        self.rng = np.random.RandomState(config.random_seed)
        self.data_type = data_type
        self.batch_size = batch_size

        self.min_tokens = hparams.min_tokens
        self.min_n_frame = hparams.reduction_factor * hparams.min_iters
        self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor
        self.skip_path_filter = config.skip_path_filter

        # Load metadata:
        self._datadir = os.path.dirname(metadata_filename)
        with open(metadata_filename, encoding='utf-8') as f:
            self._metadata = [line.strip().split('|') for line in f]
            hours = sum(int(x[2]) for x in self._metadata) * hparams.frame_shift_ms / (3600 * 1000)
            log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

        # Create placeholders for inputs and targets. Don't specify batch size because we want to
        # be able to feed different sized batches at eval time.
        self._placeholders = [
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            tf.placeholder(tf.float32, [None], 'loss_coeff'),
            tf.placeholder(tf.float32, [None, None, hparams.num_mels],
                           'mel_targets'),
            tf.placeholder(tf.float32, [None, None, hparams.num_freq],
                           'linear_targets')
        ]

        # Create queue for buffering data:
        dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32]

        # Speaker IDs arrive as a sixth input; name the placeholder distinctly:
        self._placeholders.append(tf.placeholder(tf.int32, [None], 'speaker_id'))
        dtypes.append(tf.int32)
        num_worker = 8 if self.data_type == 'train' else 1

        queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue')
        self._enqueue_op = queue.enqueue(self._placeholders)
        self.inputs, self.input_lengths, self.loss_coeff, self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue()
        self.inputs.set_shape(self._placeholders[0].shape)
        self.input_lengths.set_shape(self._placeholders[1].shape)
        self.loss_coeff.set_shape(self._placeholders[2].shape)
        self.mel_targets.set_shape(self._placeholders[3].shape)
        self.linear_targets.set_shape(self._placeholders[4].shape)
        self.speaker_id.set_shape(self._placeholders[5].shape)
        self._cmudict = None

        # # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
        # # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
        # # synthesis (useful for proper nouns, etc.)
        # if hparams.use_cmudict:
        #     cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
        #     if not os.path.isfile(cmudict_path):
        #         raise Exception('If use_cmudict=True, you must download ' +
        #                         'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path)
        #     self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
        #     log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
        # else:
        #     self._cmudict = None

        if self.data_type == 'test':
            examples = []
            while True:
                for data_dir in self._datadir:
                    examples.append(self._get_next_example(data_dir))
                    #print(data_dir, text.sequence_to_text(examples[-1][0], False, True))
                    if len(examples) >= self.batch_size:
                        break
                if len(examples) >= self.batch_size:
                    break
            self.static_batches = [
                examples for _ in range(self._batches_per_group)
            ]

        else:
            self.static_batches = None
Example 31
    def build(self, rgb):
        """
        load variable from npy to build the VGG
        :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1]
        """

        start_time = time.time()
        log('Building VGG19. Started at: %ds' % start_time)
        rgb_scaled = rgb * 255.0

        # Convert RGB to BGR
        red, green, blue = tf.split(axis=3,
                                    num_or_size_splits=3,
                                    value=rgb_scaled)
        assert red.get_shape().as_list()[1:] == [224, 224, 1]
        assert green.get_shape().as_list()[1:] == [224, 224, 1]
        assert blue.get_shape().as_list()[1:] == [224, 224, 1]
        bgr = tf.concat(axis=3,
                        values=[
                            blue - VGG_MEAN[0],
                            green - VGG_MEAN[1],
                            red - VGG_MEAN[2],
                        ])
        assert bgr.get_shape().as_list()[1:] == [224, 224, 3]

        self.conv1_1 = self.conv_layer(bgr, "conv1_1")
        self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2")
        self.pool1 = self.max_pool(self.conv1_2, 'pool1')

        self.conv2_1 = self.conv_layer(self.pool1, "conv2_1")
        self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2")
        self.pool2 = self.max_pool(self.conv2_2, 'pool2')

        self.conv3_1 = self.conv_layer(self.pool2, "conv3_1")
        self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2")
        self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3")
        self.conv3_4 = self.conv_layer(self.conv3_3, "conv3_4")
        self.pool3 = self.max_pool(self.conv3_4, 'pool3')

        self.conv4_1 = self.conv_layer(self.pool3, "conv4_1")
        self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2")
        self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3")
        self.conv4_4 = self.conv_layer(self.conv4_3, "conv4_4")
        self.pool4 = self.max_pool(self.conv4_4, 'pool4')

        self.conv5_1 = self.conv_layer(self.pool4, "conv5_1")
        self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2")
        self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3")
        self.conv5_4 = self.conv_layer(self.conv5_3, "conv5_4")
        self.pool5 = self.max_pool(self.conv5_4, 'pool5')

        self.fc6 = self.fc_layer(self.pool5, "fc6")
        assert self.fc6.get_shape().as_list()[1:] == [4096]
        self.relu6 = tf.nn.relu(self.fc6)

        self.fc7 = self.fc_layer(self.relu6, "fc7")
        self.relu7 = tf.nn.relu(self.fc7)

        self.fc8 = self.fc_layer(self.relu7, "fc8")

        log("finished building VGG19 in %ds" % (time.time() - start_time))

        return self.fc8
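
        # --- Illustrative usage sketch (not part of the original example) ---
        # The class and weight-file names below are assumptions; build() only
        # needs an RGB tensor of shape [N, 224, 224, 3] scaled to [0, 1].
        #
        #   import numpy as np
        #   import tensorflow as tf
        #
        #   images = tf.placeholder(tf.float32, [None, 224, 224, 3], 'images')
        #   vgg = Vgg19('vgg19.npy')    # hypothetical constructor
        #   logits = vgg.build(images)  # fc8 logits
        #   with tf.Session() as sess:
        #       sess.run(tf.global_variables_initializer())
        #       rgb = np.random.rand(1, 224, 224, 3).astype(np.float32)
        #       print(sess.run(logits, feed_dict={images: rgb}).shape)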
Esempio n. 32
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
      reference_mel: float32 Tensor with shape [N, T_ref, M] holding a reference mel spectrogram
        for the reference encoder. Defaults to mel_targets during training; at inference it
        selects the speaking style (if omitted, a fixed style token is used instead).
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            # Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens',
                [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens
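            # Each of hp.num_heads attention heads attends over hp.num_gst
            # tokens of size style_embed_depth // num_heads; concatenating the
            # heads restores the full style_embed_depth. With illustrative
            # values num_gst=10, num_heads=4, style_embed_depth=256, the bank
            # is [10, 64] and the final style embedding is 256-dim.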

            # Encoder

            encoder_outputs = encoder(embedded_inputs, input_lengths,
                                      is_training, 512, 5,
                                      256)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=hp.ref_filters,
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(hp.ref_depth),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                    [batch_size, 1, 1])),  # [N, num_gst, style_embed_depth/num_heads]
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                embedded_tokens = style_attention.multi_head_attention()  # [N, 1, 256]
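                # embedded_tokens is the attention-weighted combination of the
                # style tokens: one style vector per utterance,
                # [N, 1, style_embed_depth].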

            else:
                # No reference mel was given: weight the token bank with a
                # fixed one-hot vector (softmaxed below) that selects style
                # token hp.gst_index for every head.
                random_weights = tf.constant(
                    hp.num_heads * [[0] * (hp.gst_index - 1) + [1] + [0] *
                                    (hp.num_gst - hp.gst_index)],
                    dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               name="random_weights")
                # gst_tokens = tf.tile(gst_tokens, [1, hp.num_heads])
                embedded_tokens = tf.matmul(random_weights,
                                            tf.nn.tanh(gst_tokens))
                embedded_tokens = hp.gst_scale * embedded_tokens
                embedded_tokens = tf.reshape(
                    embedded_tokens, [1, 1] +
                    [hp.num_heads * gst_tokens.get_shape().as_list()[1]])
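                # Result: a fixed style embedding of shape
                # [1, 1, num_heads * token_dim]; the tile below repeats it over
                # the time axis (inference is typically run with batch size 1).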

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                embedded_tokens,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            encoder_outputs = tf.concat([encoder_outputs, style_embeddings],
                                        axis=-1)

            # Attention
            attention_mechanism = LocationSensitiveAttention(
                128,
                encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=True,
                memory_sequence_length=input_lengths,
                smoothing=False,
                cumulate_weights=True)
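            # Location-sensitive attention conditions each decoder step on the
            # cumulative alignments so far (cumulate_weights=True), which
            # encourages the roughly monotonic text-to-speech alignment.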
            decoder_lstm = [
                ZoneoutLSTMCell(1024,
                                is_training,
                                zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]

            decoder_lstm = MultiRNNCell(decoder_lstm, state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32)  # not in TensorFlow 1

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False)
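            # alignment_history=True records per-step alignments (stacked for
            # plotting below); output_attention=False returns the LSTM output
            # rather than the attention context as the cell output.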

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            # dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(
                attention_cell, (hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp)
            else:
                helper = TacoTestHelper(batch_size, hp)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Postnet: five 1-D conv layers (tanh on all but the last) whose
            # output is projected to a residual over the decoder mel outputs.
            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != 4 else None
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual  # residual refinement of the mels

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            linear_outputs = tf.layers.dense(
                post_outputs,
                hp.num_freq)  # [N, T_out, F], F = num_freq (e.g. 1025)

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])
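            # alignment_history stacks to [T_out, N, T_in]; the transpose above
            # yields [N, T_in, T_out] for alignment plots.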

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  text embedding:          %d' % embedded_inputs.shape[-1])
            log('  style embedding:         %d' % style_embeddings.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            # log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
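
            # --- Illustrative usage sketch (not part of the original example) ---
            # Hypothetical driver code; the wrapping class name is an assumption.
            #
            #   model = Tacotron(hparams)
            #   text_in = tf.placeholder(tf.int32, [1, None], 'inputs')
            #   lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
            #   ref_mel = tf.placeholder(tf.float32, [1, None, hparams.num_mels])
            #   model.initialize(text_in, lengths, reference_mel=ref_mel)
            #   # model.linear_outputs now holds the predicted spectrogram.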