Example 1
	def _get_next_example(self):
		"""Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
		"""
		if self._train_offset >= len(self._train_meta):
			self._train_offset = 0
			np.random.shuffle(self._train_meta)

		meta = self._train_meta[self._train_offset]
		self._train_offset += 1

		text = meta[6]

		input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
		mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
		#Create a parallel sequence of zeros to represent a not-yet-finished sequence
		token_target = np.asarray([0.] * (len(mel_target) - 1))
		embed_target = np.load(os.path.join(self._embed_dir, meta[3]))
		linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
		return input_data, mel_target, token_target, linear_target, embed_target, len(mel_target)
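
For orientation, here is a self-contained sketch of the metadata line this feeder appears to consume. The field order (audio | mel | linear | embed | timesteps | frames | text) is an assumption inferred from the indices used above, not confirmed by the source:

    # Hypothetical train.txt line; field order inferred from the indices above:
    # meta[1] -> mel file, meta[2] -> linear file, meta[3] -> embed file, meta[6] -> text.
    line = "audio-001.npy|mel-001.npy|linear-001.npy|embed-001.npy|110250|420|hello world"
    meta = line.strip().split("|")
    assert meta[1] == "mel-001.npy"
    assert meta[6] == "hello world"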
Example 2
    def my_synthesize(self, speaker_embeds, texts):
        """
        Lighter synthesis function that directly returns the mel spectrograms.
        """

        # Prepare the input
        cleaner_names = [x.strip() for x in self._hparams.cleaners.split(",")]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]
        input_seqs, max_seq_len = self._prepare_inputs(seqs)
        split_infos = [[max_seq_len, 0, 0, 0]]
        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            self.split_infos: np.asarray(split_infos, dtype=np.int32),
            self.speaker_embeddings: speaker_embeds
        }

        # Forward it
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)
        # Single-GPU path: drop the leading per-device dimension
        mels, alignments, stop_tokens = list(mels[0]), alignments[0], stop_tokens[0]

        # Trim the output
        for i in range(len(mels)):
            try:
                target_length = list(np.round(stop_tokens[i])).index(1)
                mels[i] = mels[i][:target_length, :]
            except ValueError:
                # If no stop token is predicted, leave the output untrimmed
                continue

        return [mel.T for mel in mels], alignments
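
A minimal usage sketch, assuming `synth` is an already-constructed instance of the class defining my_synthesize (with a live TensorFlow session) and that speaker embeddings are 256-dimensional; the names and sizes here are assumptions:

    import numpy as np

    # Hypothetical: `synth` is an instance of the synthesizer class shown above.
    speaker_embed = np.random.rand(256).astype(np.float32)  # placeholder embedding
    mels, alignments = synth.my_synthesize([speaker_embed], ["Hello world."])
    print(mels[0].shape)  # (n_mels, T): each mel is returned transposed (mel.T)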
Example 3
    def __getitem__(self, index):
        # Sometimes index may be a list of 2 (not sure why this happens)
        # If that is the case, return a single item corresponding to first element in index
        if isinstance(index, list):
            index = index[0]

        mel_path, embed_path = self.samples_fpaths[index]
        mel = np.load(mel_path).T.astype(np.float32)

        # Load the embed
        embed = np.load(embed_path)

        # Get the text and clean it
        text = text_to_sequence(self.samples_texts[index],
                                self.hparams.tts_cleaner_names)

        # Convert the list returned by text_to_sequence to a numpy array
        text = np.asarray(text).astype(np.int32)

        return text, mel, embed.astype(np.float32), index
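
A short usage sketch, assuming `dataset` is an already-constructed instance of the PyTorch Dataset class defining this __getitem__ (a hypothetical setup). With batch_size > 1 a custom collate_fn would be needed, since text and mel lengths differ between items:

    from torch.utils.data import DataLoader

    # Hypothetical: `dataset` is an instance of the Dataset class shown above.
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    text, mel, embed, index = next(iter(loader))
    # text: (1, L) int32 token ids; mel: (1, n_mels, T); embed: (1, embed_dim)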
Example 4
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames,
                   embed_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]

        assert len(texts) % self._hparams.tacotron_num_gpus == 0
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU's max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            #Pad targets according to each GPU's max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                #Not used at synthesis time, but recorded for potential future development
                split_infos[i][1] = max_target_len

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        feed_dict[self.speaker_embeddings] = [
            np.load(f) for f in embed_filenames
        ]

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Flatten the per-GPU outputs into one batch-wise list
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                #Natural batch synthesis
                #Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            #Remove the batch-wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Flatten the per-GPU outputs into one batch-wise list
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            #Natural batch synthesis
            #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

            #Remove the batch-wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            raise NotImplementedError()

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: output mel files and target mel files share names, only the folders differ
            mel_filename = os.path.join(out_dir,
                                        "mel-{}.npy".format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                #save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   "wavs/wav-{}-mel.wav".format(basenames[i])),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        "plots/alignment-{}.png".format(
                                            basenames[i])),
                                    title="{}".format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 "plots/mel-{}.png".format(basenames[i])),
                    title="{}".format(texts[i]),
                    split_title=True)

                if hparams.predict_linear:
                    #save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       "wavs/wav-{}-linear.wav".format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              "plots/linear-{}.png".format(
                                                  basenames[i])),
                                          title="{}".format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths
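
The axis=1 concatenation in the padding loops above deserves a note: each GPU's share of the batch is padded to its own maximum length, the per-device blocks are laid side by side along the time axis, and split_infos records where to cut them apart again inside the model. A self-contained numpy sketch, with _prepare_inputs replaced by an assumed zero-padding helper:

    import numpy as np

    def prepare_inputs(seqs):
        # Assumed behavior of self._prepare_inputs: right-pad each sequence with
        # zeros to the longest length within this device's share of the batch.
        max_len = max(len(s) for s in seqs)
        return np.stack([np.pad(s, (0, max_len - len(s))) for s in seqs]), max_len

    # Two GPUs, two sequences each; per-device max lengths differ (3 vs 5).
    seqs = [np.arange(3), np.arange(2), np.arange(5), np.arange(4)]
    input_seqs, split_infos = None, []
    for i in range(2):
        device_input, max_seq_len = prepare_inputs(seqs[2 * i:2 * (i + 1)])
        input_seqs = (np.concatenate((input_seqs, device_input), axis=1)
                      if input_seqs is not None else device_input)
        split_infos.append([max_seq_len, 0, 0, 0])

    print(input_seqs.shape)  # (2, 8): device blocks side by side along axis 1
    print(split_infos)       # [[3, 0, 0, 0], [5, 0, 0, 0]]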
Example 5
    def synthesize_spectrograms(self,
                                texts: List[str],
                                embeddings: Union[np.ndarray,
                                                  List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is
        the sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

            # Print some info about the model when it is loaded
            tts_k = self._model.get_step() // 1000

            simple_table([("Tacotron", str(tts_k) + "k"),
                          ("r", self._model.r)])

        # Preprocess text inputs
        inputs = [
            text_to_sequence(text.strip(), hparams.tts_cleaner_names)
            for text in texts
        ]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i:i + hparams.synthesis_batch_size]
            for i in range(0, len(inputs), hparams.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i:i + hparams.synthesis_batch_size]
            for i in range(0, len(embeddings), hparams.synthesis_batch_size)
        ]

        specs = []
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(
                self.device)

            # Inference
            _, mels, alignments = self._model.generate(chars,
                                                       speaker_embeddings)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram
                while np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        # Note: `alignments` holds only the alignments from the last generated batch
        return (specs, alignments) if return_alignments else specs
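
Finally, a minimal usage sketch for synthesize_spectrograms, assuming `synthesizer` is an already-constructed instance of the wrapper class above and that speaker embeddings are 256-dimensional, matching the docstring; the values used are placeholders:

    import numpy as np

    # Hypothetical: `synthesizer` is an instance of the class defining the method above.
    texts = ["Hello world.", "A second prompt."]
    embeds = [np.random.rand(256).astype(np.float32) for _ in texts]  # placeholders
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    print(specs[0].shape)  # (80, M0): mel bins x frames after end-of-utterance trimming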