Beispiel #1
0
    def predict(self, inputs):
        """
        Change Mel to Waveform.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
            Frame items are replaced by their `.array` attribute.

        Returns
        -------
        result: List
            Index 0 of the last axis of the model output (`r[:, :, 0]`).
            In the eager branch the output is sliced as returned by the
            eager graph, without a `.numpy()` conversion.
        """
        arrays = []
        for item in inputs:
            arrays.append(item.array if isinstance(item, Frame) else item)

        # Only the padded batch is fed to the model; the lengths returned
        # alongside it are not used here.
        padded, _ = sequence_1d(arrays, return_len=True)

        if not self._output_nodes:
            # No session output nodes available: call the eager graph with
            # keyword arguments built by `to_tf`.
            eager_inputs = to_tf(self._eager_g.inputs, [padded])
            return self._eager_g(**eager_inputs)[0][:, :, 0]

        logits = self._sess.run(
            self._output_nodes['logits'],
            feed_dict={self._input_nodes['Placeholder']: padded},
        )
        return logits[:, :, 0]
Beispiel #2
0
    def predict(self, string, **kwargs):
        """
        Change string to Mel.

        Parameters
        ----------
        string: str
        **kwargs:
            Forwarded unchanged to `self._normalizer.normalize`.

        Returns
        -------
        result: Dict[string, ids, decoder-output, postnet-output, alignment]
            Returned when session output nodes are set. Otherwise the raw
            result of calling the eager graph is returned as-is.
        """
        normalized, token_ids = self._normalizer.normalize(string, **kwargs)
        length = len(token_ids)

        if not self._output_nodes:
            # Eager path: a batch of one sequence plus its length.
            eager_inputs = to_tf(
                self._eager_g.inputs,
                [np.array([token_ids]), np.array([length])],
            )
            return self._eager_g(**eager_inputs)

        feed = {
            self._input_nodes['Placeholder']: [token_ids],
            self._input_nodes['Placeholder_1']: [length],
        }
        outputs = self._sess.run(self._output_nodes, feed_dict=feed)

        # Index 0 drops the batch axis, since a single sequence was fed.
        return {
            'string': normalized,
            'ids': token_ids,
            'decoder-output': outputs['decoder_output'][0],
            'postnet-output': outputs['post_mel_outputs'][0],
            'alignment': outputs['alignment_histories'][0],
        }
Beispiel #3
0
    def predict(self, input):
        """
        Enhance inputs, will return waveform.

        Parameters
        ----------
        input: np.array
            np.array or malaya_speech.model.frame.Frame. A Frame is
            replaced by its `.array` attribute before inference.

        Returns
        -------
        result: np.array
            The 'logits' output of the session, or the first output of the
            eager graph converted with `.numpy()`.
        """
        waveform = input.array if isinstance(input, Frame) else input

        if not self._output_nodes:
            # No session output nodes available: use the eager graph.
            eager_inputs = to_tf(self._eager_g.inputs, [waveform])
            return self._eager_g(**eager_inputs)[0].numpy()

        return self._sess.run(
            self._output_nodes['logits'],
            feed_dict={self._input_nodes['Placeholder']: waveform},
        )
Beispiel #4
0
    def vectorize(self, inputs):
        """
        Vectorize inputs.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
            Frame items are replaced by their `.array` attribute.

        Returns
        -------
        result: np.array
            returned [B, D].
        """
        arrays = [
            item.array if isinstance(item, Frame) else item
            for item in inputs
        ]
        features = [self._vectorizer(array) for array in arrays]

        # Pad the per-item features into one batch and keep the lengths,
        # which the model takes as a second input.
        batch, lengths = padding_sequence_nd(features, dim=0, return_len=True)

        if not self._output_nodes:
            eager_inputs = to_tf(self._eager_g.inputs, [batch, lengths])
            return self._eager_g(**eager_inputs)[0].numpy()

        feed = {
            self._input_nodes['Placeholder']: batch,
            self._input_nodes['Placeholder_1']: lengths,
        }
        return self._sess.run(self._output_nodes['logits'], feed_dict=feed)
Beispiel #5
0
    def predict(self, input):
        """
        Enhance inputs, will return waveform.

        Parameters
        ----------
        input: np.array
            np.array or malaya_speech.model.frame.Frame. A Frame is
            replaced by its `.array` attribute before inference.

        Returns
        -------
        result: Dict
            Maps each name in `self._instruments` to the model output
            stored under the key `logits_<position of the name>`.
        """
        waveform = input.array if isinstance(input, Frame) else input

        if self._output_nodes:
            outputs = self._sess.run(
                self._output_nodes,
                feed_dict={self._input_nodes['Placeholder']: waveform},
            )
        else:
            eager_inputs = to_tf(self._eager_g.inputs, [waveform])
            # Key the eager outputs the same way the session outputs are
            # looked up below, so both paths share the final mapping step.
            outputs = {
                f'logits_{position}': tensor.numpy()
                for position, tensor in enumerate(
                    self._eager_g(**eager_inputs)
                )
            }

        return {
            name: outputs[f'logits_{position}']
            for position, name in enumerate(self._instruments)
        }
Beispiel #6
0
    def predict(self, inputs):
        """
        Enhance inputs, will return melspectrogram.

        Parameters
        ----------
        inputs: List[np.array]
            Items may also be Frame objects, which are replaced by their
            `.array` attribute.

        Returns
        -------
        result: List
            One entry per batch row: `featurization.unscale_mel` applied to
            the sum of the scaled input and the model output, both cut to
            the row's unpadded length.
        """
        arrays = []
        for item in inputs:
            arrays.append(item.array if isinstance(item, Frame) else item)

        scaled = [featurization.scale_mel(array).T for array in arrays]
        batch, lengths = padding_sequence_nd(
            scaled, maxlen=256, dim=0, return_len=True
        )

        if self._output_nodes:
            logits = self._sess.run(
                self._output_nodes['logits'],
                feed_dict={self._input_nodes['Placeholder']: batch},
            )
        else:
            eager_inputs = to_tf(self._eager_g.inputs, [batch])
            logits = self._eager_g(**eager_inputs)[0].numpy()

        # The model output is added onto the scaled input, then the result
        # is passed through `unscale_mel`.
        return [
            featurization.unscale_mel(
                batch[row, :lengths[row]].T
                + logits[row, :lengths[row], :, 0].T
            )
            for row in range(len(batch))
        ]
Beispiel #7
0
    def predict(self, original_audio, target_audio):
        """
        Change original voice audio to follow targeted voice.

        Parameters
        ----------
        original_audio: np.array or malaya_speech.model.frame.Frame
        target_audio: np.array or malaya_speech.model.frame.Frame
            For both arguments a Frame is replaced by its `.array`
            attribute before any further processing.

        Returns
        -------
        result: Dict[decoder-output, postnet-output]
        """
        # Unwrap each Frame from its own argument. The previous code read
        # `input.array`, which refers to the builtin `input` function and
        # raised AttributeError whenever a Frame was passed.
        if isinstance(original_audio, Frame):
            original_audio = original_audio.array
        if isinstance(target_audio, Frame):
            target_audio = target_audio.array

        original_mel = self._waveform_to_mel(original_audio)
        # NOTE(review): target_mel is computed but not consumed below --
        # confirm whether this call can be dropped.
        target_mel = self._waveform_to_mel(target_audio)

        # Speaker vectors are scaled and shifted before being fed to the
        # model. NOTE(review): the origin of the constants 30 and 3.5 is
        # not visible here -- confirm against the training pipeline.
        original_v = self._speaker_vector([original_audio])[0] * 30 - 3.5
        target_v = self._speaker_vector([target_audio])[0] * 30 - 3.5

        if self._output_nodes:
            r = self._sess.run(
                self._output_nodes,
                feed_dict={
                    self._input_nodes['mel']: [original_mel],
                    self._input_nodes['ori_vector']: [original_v],
                    self._input_nodes['target_vector']: [target_v],
                    self._input_nodes['mel_lengths']: [len(original_mel)],
                },
            )
            # Index 0 drops the batch axis, since a single item was fed.
            r = {
                'decoder-output': r['mel_before'][0],
                'postnet-output': r['mel_after'][0],
            }
        else:
            inputs = to_tf(
                self._eager_g.inputs,
                [
                    np.array([original_mel]),
                    np.array([original_v]),
                    np.array([target_v]),
                    np.array([len(original_mel)]),
                ],
            )
            r = self._eager_g(**inputs)
            r = {
                'decoder-output': r[0][0].numpy(),
                'postnet-output': r[1][0].numpy(),
            }

        return r
Beispiel #8
0
    def predict_proba(self, inputs):
        """
        Predict inputs, will return probability.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
            Frame items are replaced by their `.array` attribute.

        Returns
        -------
        result: np.array
            returned [B, D].
        """
        arrays = [
            item.array if isinstance(item, Frame) else item
            for item in inputs
        ]
        features = [
            self._vectorizer(array, **self._extra) for array in arrays
        ]

        # The 'deep-speaker' model pads along dim 0; every other model
        # pads along dim 1.
        pad_dim = 0 if self.__model__ == 'deep-speaker' else 1
        batch = padding_sequence_nd(features, dim=pad_dim)
        # Append a trailing axis of size 1 before feeding the model.
        batch = np.expand_dims(batch, -1)

        if not self._output_nodes:
            eager_inputs = to_tf(self._eager_g.inputs, [batch])
            return self._eager_g(**eager_inputs)[0].numpy()

        return self._sess.run(
            self._output_nodes['logits'],
            feed_dict={self._input_nodes['Placeholder']: batch},
        )