def predict(self, inputs):
    """
    Change Mel to Waveform.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].

    Returns
    -------
    result: List
    """
    # Unwrap Frame objects to raw arrays, then pad into one batch.
    arrays = [item.array if isinstance(item, Frame) else item for item in inputs]
    padded, lens = sequence_1d(arrays, return_len=True)
    if self._output_nodes:
        # Graph-mode path: feed the padded batch through the frozen session.
        r = self._sess.run(
            self._output_nodes['logits'],
            feed_dict={self._input_nodes['Placeholder']: padded},
        )
    else:
        # Eager path: adapt the batch to the callable's expected inputs.
        feeds = to_tf(self._eager_g.inputs, [padded])
        r = self._eager_g(**feeds)[0]
    # Drop the trailing channel axis to return waveforms.
    return r[:, :, 0]
def _get_inputs(self, inputs):
    """
    Unwrap Frame objects to raw arrays and pad them into a single batch.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].

    Returns
    -------
    result: Tuple
        (padded batch, original lengths) as produced by `sequence_1d`.
    """
    arrays = []
    for item in inputs:
        arrays.append(item.array if isinstance(item, Frame) else item)
    # sequence_1d already returns (padded, lens) when return_len=True.
    return sequence_1d(arrays, return_len=True)
def predict_lm(self, inputs, lm, beam_size: int = 100, **kwargs):
    """
    Transcribe inputs using Beam Search + LM, will return list of strings.
    This method will not able to utilise batch decoding, instead will do loop to decode for each elements.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].
    lm: ctc_decoders.Scorer
        Returned from `malaya_speech.stt.language_model()`.
    beam_size: int, optional (default=100)
        beam size for beam decoder.

    Returns
    -------
    result: List[str]

    Raises
    ------
    ModuleNotFoundError
        if the optional `ctc_decoders` package is not installed.
    """
    try:
        from ctc_decoders import ctc_beam_search_decoder
    except ImportError as e:
        # Catch only import failures (was a bare `except:`, which masked
        # unrelated errors behind a misleading install message) and chain
        # the original cause for debuggability.
        raise ModuleNotFoundError(
            'ctc_decoders not installed. Please install it by `pip install ctc-decoders` and try again.'
        ) from e
    # Unwrap Frame objects to raw arrays, then pad into one batch.
    arrays = [item.array if isinstance(item, Frame) else item for item in inputs]
    padded, lens = sequence_1d(arrays, return_len=True)
    logits, seq_lens = self._sess.run(
        [self._softmax, self._seq_lens],
        feed_dict={
            self._X: padded,
            self._X_len: lens,
        },
    )
    # Session output is time-major; transpose to (batch, time, vocab)
    # so each element can be decoded independently below.
    logits = np.transpose(logits, axes=(1, 0, 2))
    results = []
    for i in range(len(logits)):
        # Trim padding frames before decoding each element.
        d = ctc_beam_search_decoder(
            logits[i][:seq_lens[i]],
            self._vocab,
            beam_size,
            ext_scoring_func=lm,
            **kwargs,
        )
        # d is ranked (score, transcript) pairs; keep the best transcript.
        results.append(d[0][1])
    return results
def predict(self, inputs):
    """
    Change Mel to Waveform.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].

    Returns
    -------
    result: List
    """
    # Unwrap Frame objects to raw arrays before batching.
    arrays = []
    for item in inputs:
        arrays.append(item.array if isinstance(item, Frame) else item)
    padded, _ = sequence_1d(arrays, return_len=True)
    outputs = self._execute(
        inputs=[padded],
        input_labels=['Placeholder'],
        output_labels=['logits'],
    )
    # Drop the trailing channel axis to return waveforms.
    return outputs['logits'][:, :, 0]
def predict(self, inputs, decoder: str = 'beam', beam_size: int = 5, **kwargs):
    """
    Transcribe inputs, will return list of strings.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].
    decoder: str, optional (default='beam')
        decoder mode, allowed values:

        * ``'greedy'`` - greedy decoder.
        * ``'beam'`` - beam decoder.
    beam_size: int, optional (default=5)
        beam size for beam decoder.

    Returns
    -------
    result: List[str]
    """
    decoder = self._check_decoder(decoder, beam_size)
    # Unwrap Frame objects to raw arrays, then pad into one batch.
    arrays = [item.array if isinstance(item, Frame) else item for item in inputs]
    padded, lens = sequence_1d(arrays, return_len=True)
    if decoder == 'greedy':
        # Greedy decoding is beam search with a width of 1.
        beam_size = 1
    encoded, encoded_lens = self._sess.run(
        [self._encoded, self._padded_lens],
        feed_dict={
            self._X_placeholder: padded,
            self._X_len_placeholder: lens,
        },
    )
    # The encoder downsamples time; shrink lengths to match its output.
    encoded_lens = encoded_lens // self._time_reduction_factor
    states = self._sess.run(self._initial_states)
    results = []
    # Transducer beam search cannot batch; decode one element at a time.
    for enc, total in zip(encoded, encoded_lens):
        hyp = transducer_beam(
            enc=enc,
            total=total,
            initial_states=states,
            encoded_placeholder=self._encoded_placeholder,
            predicted_placeholder=self._predicted_placeholder,
            states_placeholder=self._states_placeholder,
            ytu=self._ytu,
            new_states=self._new_states,
            sess=self._sess,
            beam_width=beam_size,
            **kwargs,
        )
        results.append(subword_decode(self._vocab, hyp))
    return results
def predict(self, inputs, decoder: str = 'beam', beam_size: int = 100, **kwargs):
    """
    Transcribe inputs, will return list of strings.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].
    decoder: str, optional (default='beam')
        decoder mode, allowed values:

        * ``'greedy'`` - greedy decoder.
        * ``'beam'`` - beam decoder.
    beam_size: int, optional (default=100)
        beam size for beam decoder.

    Returns
    -------
    result: List[str]
    """
    decoder = self._check_decoder(decoder, beam_size)
    # Unwrap Frame objects to raw arrays, then pad into one batch.
    arrays = [item.array if isinstance(item, Frame) else item for item in inputs]
    padded, lens = sequence_1d(arrays, return_len=True)
    if decoder == 'greedy':
        # Greedy decoding is beam search with a width of 1.
        beam_size = 1
    if beam_size != self._beam_size:
        # Rebuild (and cache) the decode op only when the beam width changes,
        # since adding graph ops per call would leak memory.
        self._beam_size = beam_size
        self._decoded = tf.nn.ctc_beam_search_decoder(
            self._logits,
            self._seq_lens,
            beam_width=self._beam_size,
            top_paths=1,
            merge_repeated=True,
            **kwargs,
        )[0][0]
    sparse = self._sess.run(
        self._decoded,
        feed_dict={
            self._X: padded,
            self._X_len: lens,
        },
    )
    # The session returns a SparseTensorValue; densify it with zeros
    # (zero acting as padding) before per-row character decoding.
    dense = np.zeros(sparse.dense_shape, dtype=np.int32)
    for i in range(sparse.values.shape[0]):
        dense[sparse.indices[i][0], sparse.indices[i][1]] = sparse.values[i]
    results = []
    for row in dense:
        results.append(char_decode(row, lookup=self._vocab).replace('<PAD>', ''))
    return results