def test_beam_decoders(self):
    '''
    Test on random data that the custom decoder outputs the same transcript
    as the standard TF beam search decoder.

    The score comparison between the two decoders is only performed on
    TF >= 1.11 (releases with the CTC decoder fix); the sign check on the
    custom decoder's score and the transcript comparison always run.
    '''
    seq = np.random.uniform(size=self.seq.shape).astype(np.float32)
    logits = tf.constant(seq)
    seq_len = tf.constant([self.seq.shape[0]])

    beam_search_decoded = tf.nn.ctc_beam_search_decoder(
        logits,
        seq_len,
        beam_width=self.beam_width,
        top_paths=1,
        merge_repeated=False,
    )

    with tf.Session() as sess:
        res_beam = sess.run(beam_search_decoded)
    decoded_beam, prob_beam = res_beam
    prob1 = prob_beam[0][0]
    decoded_text1 = ''.join(self.vocab[c] for c in decoded_beam[0].values)

    # The custom decoder is fed softmax-normalised scores and the vocabulary
    # without its last entry (presumably the CTC blank -- TODO confirm).
    res = ctc_beam_search_decoder(
        softmax(seq.squeeze()),
        self.vocab[:-1],
        beam_size=self.beam_width,
    )
    prob2, decoded_text2 = res[0]

    # Compare (major, minor) numerically. A plain string comparison is
    # lexicographic, so e.g. '1.9' >= '1.11' would wrongly evaluate to True.
    tf_version = tuple(int(part) for part in tf.__version__.split('.')[:2])
    if tf_version >= (1, 11):
        # works for newer versions only (with CTC decoder fix)
        self.assertLess(abs(prob1 - prob2), self.tol)
    self.assertLess(prob2, 0)
    self.assertEqual(decoded_text1, decoded_text2)
def perform_beam_search(self, probs: np.ndarray, lm: bool = False):
    """Run CTC beam search over `probs` and wrap the result in a string tensor.

    Args:
        probs: Score matrix handed to `ctc_beam_search_decoder` as `probs_seq`.
        lm: When True, the text featurizer's `scorer` is passed as the
            external scoring function; otherwise no external scorer is used.

    Returns:
        A `tf.string` tensor built from the last element of the first entry
        returned by the decoder (presumably the transcript of the best beam
        -- confirm against the ctc_decoders API).
    """
    vocabulary = self.text_featurizer.vocab_array
    beam_width = self.text_featurizer.decoder_config["beam_width"]
    if lm:
        scoring_func = self.text_featurizer.scorer
    else:
        scoring_func = None

    beams = ctc_beam_search_decoder(
        probs_seq=probs,
        vocabulary=vocabulary,
        beam_size=beam_width,
        ext_scoring_func=scoring_func,
    )
    first_beam = beams[0]
    return tf.convert_to_tensor(first_beam[-1], dtype=tf.string)
def predict_lm(self, inputs, lm, beam_size: int = 100, **kwargs):
    """
    Transcribe inputs using Beam Search + LM, will return list of strings.
    This method is not able to utilise batch decoding; instead it loops
    and decodes each element separately.

    Parameters
    ----------
    inputs: List[np.array]
        List[np.array] or List[malaya_speech.model.frame.Frame].
    lm: ctc_decoders.Scorer
        Returned from `malaya_speech.stt.language_model()`.
    beam_size: int, optional (default=100)
        beam size for beam decoder.
    **kwargs:
        Extra keyword arguments forwarded unchanged to
        `ctc_decoders.ctc_beam_search_decoder`.

    Returns
    -------
    result: List[str]

    Raises
    ------
    ModuleNotFoundError
        If `ctc_decoders` cannot be imported.
    """
    # Catch only import failures: a bare `except` would also relabel
    # unrelated errors (even KeyboardInterrupt) as "not installed".
    try:
        from ctc_decoders import ctc_beam_search_decoder
    except ImportError as e:
        raise ModuleNotFoundError(
            'ctc_decoders not installed. Please install it by `pip install ctc-decoders` and try again.'
        ) from e

    # Unwrap Frame objects to their underlying arrays; pass anything else through.
    arrays = [
        item.array if isinstance(item, Frame) else item for item in inputs
    ]

    padded, lens = sequence_1d(arrays, return_len=True)
    logits, seq_lens = self._sess.run(
        [self._softmax, self._seq_lens],
        feed_dict={self._X: padded, self._X_len: lens},
    )

    # Swap the first two axes so that iterating yields one matrix per input
    # (assumes the session output is time-major -- TODO confirm).
    logits = np.transpose(logits, axes=(1, 0, 2))

    results = []
    for i, sample_probs in enumerate(logits):
        # Trim each sample to its own length before decoding.
        beams = ctc_beam_search_decoder(
            sample_probs[:seq_lens[i]],
            self._vocab,
            beam_size,
            ext_scoring_func=lm,
            **kwargs,
        )
        # Keep element 1 of the first returned beam entry.
        results.append(beams[0][1])
    return results
def _perform_beam_search(
    self,
    probs: np.ndarray,
    lm: bool = False,
):
    """Run CTC beam search over `probs` and wrap the result in a string tensor.

    Args:
        probs: Score matrix handed to `ctc_beam_search_decoder` as `probs_seq`.
        lm: When True, the text featurizer's `scorer` is passed as the
            external scoring function; otherwise no external scorer is used.

    Returns:
        A `tf.string` tensor built from the last element of the first entry
        returned by the decoder (presumably the transcript of the best beam
        -- confirm against the ctc_decoders API).
    """
    # Imported lazily, so the dependency is only needed when this runs.
    from ctc_decoders import ctc_beam_search_decoder

    vocabulary = self.text_featurizer.non_blank_tokens
    beam_width = self.text_featurizer.decoder_config.beam_width
    if lm:
        scoring_func = self.text_featurizer.scorer
    else:
        scoring_func = None

    beams = ctc_beam_search_decoder(
        probs_seq=probs,
        vocabulary=vocabulary,
        beam_size=beam_width,
        ext_scoring_func=scoring_func,
    )
    first_beam = beams[0]
    return tf.convert_to_tensor(first_beam[-1], dtype=tf.string)
def test_decoders(self):
    '''
    Test all CTC decoders on a sample transcript ('ten seconds').
    Standard TF decoders should output 'then seconds'.
    Custom CTC decoder with LM rescoring should yield 'ten seconds'.

    The TF beam search score check is only performed on TF >= 1.11
    (releases with the CTC decoder fix); all other checks always run.
    '''
    logits = tf.constant(self.seq)
    seq_len = tf.constant([self.seq.shape[0]])

    greedy_decoded = tf.nn.ctc_greedy_decoder(
        logits, seq_len, merge_repeated=True)
    beam_search_decoded = tf.nn.ctc_beam_search_decoder(
        logits,
        seq_len,
        beam_width=self.beam_width,
        top_paths=1,
        merge_repeated=False,
    )

    with tf.Session() as sess:
        res_greedy, res_beam = sess.run([greedy_decoded, beam_search_decoded])

    # --- TF greedy decoder ---
    decoded_greedy, prob_greedy = res_greedy
    decoded_text = ''.join(self.vocab[c] for c in decoded_greedy[0].values)
    self.assertLess(abs(7079.117 + prob_greedy[0][0]), self.tol)
    self.assertEqual(decoded_text, 'then seconds')

    # --- TF beam search decoder ---
    decoded_beam, prob_beam = res_beam
    decoded_text = ''.join(self.vocab[c] for c in decoded_beam[0].values)
    # Compare (major, minor) numerically. A plain string comparison is
    # lexicographic, so e.g. '1.9' >= '1.11' would wrongly evaluate to True.
    tf_version = tuple(int(part) for part in tf.__version__.split('.')[:2])
    if tf_version >= (1, 11):
        # works for newer versions only (with CTC decoder fix)
        self.assertLess(abs(1.1842 + prob_beam[0][0]), self.tol)
    self.assertEqual(decoded_text, 'then seconds')

    # --- custom beam search decoder with LM rescoring ---
    # The vocabulary is passed without its last entry (presumably the CTC
    # blank -- TODO confirm).
    scorer = Scorer(
        alpha=2.0,
        beta=0.5,
        model_path='ctc_decoder_with_lm/ctc-test-lm.binary',
        vocabulary=self.vocab[:-1],
    )
    res = ctc_beam_search_decoder(
        softmax(self.seq.squeeze()),
        self.vocab[:-1],
        beam_size=self.beam_width,
        ext_scoring_func=scorer,
    )
    res_prob, decoded_text = res[0]
    self.assertLess(abs(4.0845 + res_prob), self.tol)
    self.assertEqual(decoded_text, self.label)
0.04139363, ], [ 0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306, 0.05294827, 0.22298418, ], ] greedy_result = ["ac'bdc", "b'da"] beam_search_result = ['acdc', "b'a"] ctc_greedy_decoder(np.array(probs_seq1), vocab_list) == greedy_result[0] ctc_greedy_decoder(np.array(probs_seq2), vocab_list) == greedy_result[1] ctc_beam_search_decoder( probs_seq=np.array(probs_seq1), beam_size=beam_size, vocabulary=vocab_list, ) ctc_beam_search_decoder( probs_seq=np.array(probs_seq2), beam_size=beam_size, vocabulary=vocab_list, )