Beispiel #1
0
    def predict(self, inputs):
        """
        Change Mel to Waveform.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
            Frame items are replaced by their `.array` attribute before the
            batch is handed to `sequence_1d`.

        Returns
        -------
        result:
            The `[:, :, 0]` slice of the model's 'logits' output.
        """
        arrays = []
        for item in inputs:
            arrays.append(item.array if isinstance(item, Frame) else item)

        # `sequence_1d` also returns the lengths; they are not used here.
        padded, _ = sequence_1d(arrays, return_len=True)

        if not self._output_nodes:
            # No session output nodes available: call the eager graph with
            # the keyword arguments built by `to_tf`.
            eager_kwargs = to_tf(self._eager_g.inputs, [padded])
            return self._eager_g(**eager_kwargs)[0][:, :, 0]

        logits = self._sess.run(
            self._output_nodes['logits'],
            feed_dict={self._input_nodes['Placeholder']: padded},
        )
        return logits[:, :, 0]
Beispiel #2
0
    def _get_inputs(self, inputs):
        """
        Unwrap Frame items and run the batch through `sequence_1d`.

        Parameters
        ----------
        inputs: List
            Items that are `Frame` instances are replaced by their `.array`
            attribute; every other item is passed through unchanged.

        Returns
        -------
        result: Tuple
            The two values returned by `sequence_1d(..., return_len=True)`.
        """
        def _unwrap(item):
            # Only Frame objects carry their data in `.array`.
            if isinstance(item, Frame):
                return item.array
            return item

        unwrapped = [_unwrap(item) for item in inputs]
        batch, lengths = sequence_1d(unwrapped, return_len=True)
        return batch, lengths
Beispiel #3
0
    def predict_lm(self, inputs, lm, beam_size: int = 100, **kwargs):
        """
        Transcribe inputs using Beam Search + LM, will return list of strings.
        This method is not able to utilise batch decoding; it loops and
        decodes each element separately.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
        lm: ctc_decoders.Scorer
            Returned from `malaya_speech.stt.language_model()`.
        beam_size: int, optional (default=100)
            beam size for beam decoder.
        **kwargs:
            Forwarded unchanged to `ctc_decoders.ctc_beam_search_decoder`.

        Returns
        -------
        result: List[str]

        Raises
        ------
        ModuleNotFoundError
            If `ctc_decoders` cannot be imported.
        """
        # Optional dependency: import lazily so the rest of the class works
        # without it. Only import failures are translated; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate untouched.
        try:
            from ctc_decoders import ctc_beam_search_decoder
        except ImportError as e:
            raise ModuleNotFoundError(
                'ctc_decoders not installed. Please install it by `pip install ctc-decoders` and try again.'
            ) from e

        arrays = [
            item.array if isinstance(item, Frame) else item
            for item in inputs
        ]

        padded, lens = sequence_1d(arrays, return_len=True)
        logits, seq_lens = self._sess.run(
            [self._softmax, self._seq_lens],
            feed_dict={
                self._X: padded,
                self._X_len: lens
            },
        )
        # Swap the first two axes so that iterating yields one sample at a
        # time (presumably time-major -> batch-major; confirm against graph).
        logits = np.transpose(logits, axes=(1, 0, 2))

        results = []
        for sample_logits, sample_len in zip(logits, seq_lens):
            beams = ctc_beam_search_decoder(
                sample_logits[:sample_len],
                self._vocab,
                beam_size,
                ext_scoring_func=lm,
                **kwargs,
            )
            # Keep element [1] of the first returned beam.
            results.append(beams[0][1])
        return results
Beispiel #4
0
    def predict(self, inputs):
        """
        Change Mel to Waveform.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
            Frame items are replaced by their `.array` attribute before the
            batch is handed to `sequence_1d`.

        Returns
        -------
        result:
            The `[:, :, 0]` slice of the 'logits' entry returned by
            `self._execute`.
        """
        arrays = []
        for item in inputs:
            arrays.append(item.array if isinstance(item, Frame) else item)

        # `sequence_1d` also returns the lengths; they are not used here.
        padded, _ = sequence_1d(arrays, return_len=True)

        outputs = self._execute(
            inputs=[padded],
            input_labels=['Placeholder'],
            output_labels=['logits'],
        )
        logits = outputs['logits']
        return logits[:, :, 0]
Beispiel #5
0
    def predict(self,
                inputs,
                decoder: str = 'beam',
                beam_size: int = 5,
                **kwargs):
        """
        Transcribe inputs, will return list of strings.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
        decoder: str, optional (default='beam')
            decoder mode, allowed values:

            * ``'greedy'`` - greedy decoder.
            * ``'beam'`` - beam decoder.
        beam_size: int, optional (default=5)
            beam size for beam decoder.
        **kwargs:
            Forwarded unchanged to `transducer_beam` for every sample.

        Returns
        -------
        result: List[str]
        """
        decoder = self._check_decoder(decoder, beam_size)
        # 'greedy' is run through the same beam routine with a width of 1.
        width = 1 if decoder == 'greedy' else beam_size

        arrays = [
            item.array if isinstance(item, Frame) else item
            for item in inputs
        ]
        padded, lens = sequence_1d(arrays, return_len=True)

        encoded, encoded_lens = self._sess.run(
            [self._encoded, self._padded_lens],
            feed_dict={
                self._X_placeholder: padded,
                self._X_len_placeholder: lens,
            },
        )
        # Scale the lengths down by the model's time reduction factor.
        encoded_lens = encoded_lens // self._time_reduction_factor
        initial_states = self._sess.run(self._initial_states)

        def _decode_sample(index):
            # Decode one encoded sample and map the result through the vocab.
            tokens = transducer_beam(
                enc=encoded[index],
                total=encoded_lens[index],
                initial_states=initial_states,
                encoded_placeholder=self._encoded_placeholder,
                predicted_placeholder=self._predicted_placeholder,
                states_placeholder=self._states_placeholder,
                ytu=self._ytu,
                new_states=self._new_states,
                sess=self._sess,
                beam_width=width,
                **kwargs,
            )
            return subword_decode(self._vocab, tokens)

        return [_decode_sample(index) for index in range(len(encoded))]
Beispiel #6
0
    def predict(self,
                inputs,
                decoder: str = 'beam',
                beam_size: int = 100,
                **kwargs):
        """
        Transcribe inputs, will return list of strings.

        Parameters
        ----------
        inputs: List[np.array]
            List[np.array] or List[malaya_speech.model.frame.Frame].
        decoder: str, optional (default='beam')
            decoder mode, allowed values:

            * ``'greedy'`` - greedy decoder.
            * ``'beam'`` - beam decoder.
        beam_size: int, optional (default=100)
            beam size for beam decoder.
        **kwargs:
            Forwarded to `tf.nn.ctc_beam_search_decoder`, but only on calls
            where the decoder op is rebuilt (see note in the body).

        Returns
        -------
        result: List[str]
        """

        decoder = self._check_decoder(decoder, beam_size)

        arrays = [
            item.array if isinstance(item, Frame) else item
            for item in inputs
        ]

        padded, lens = sequence_1d(arrays, return_len=True)

        # 'greedy' is handled as a beam search of width 1.
        if decoder == 'greedy':
            beam_size = 1

        # The decoder op is cached on the instance and rebuilt only when the
        # requested beam width differs from the cached one.
        # NOTE(review): `kwargs` are therefore ignored whenever the cached op
        # is reused -- confirm this is intended.
        if beam_size != self._beam_size:
            self._beam_size = beam_size
            self._decoded = tf.nn.ctc_beam_search_decoder(
                self._logits,
                self._seq_lens,
                beam_width=self._beam_size,
                top_paths=1,
                merge_repeated=True,
                **kwargs,
            )[0][0]

        r = self._sess.run(self._decoded,
                           feed_dict={
                               self._X: padded,
                               self._X_len: lens
                           })

        # Densify the sparse result (indices / values / dense_shape) with a
        # single vectorised assignment; unset positions stay 0.
        decoded = np.zeros(r.dense_shape, dtype=np.int32)
        decoded[r.indices[:, 0], r.indices[:, 1]] = r.values

        return [
            char_decode(row, lookup=self._vocab).replace('<PAD>', '')
            for row in decoded
        ]