def merge_repeated(
    yseqs,
    blank=0,
):
    """Collapse runs of consecutive equal values in a 1-D sequence.

    The first element is always kept; each later element is appended only
    when it differs from the last element kept so far. The collapsed
    sequence is then left-padded with ``blank`` so that the output has the
    same length as ``yseqs``.

    Args:
        yseqs: rank-1 tensor (the loop shape invariant is ``[None]``). It
            must hold at least one element because ``yseqs[0]`` is read.
        blank: value used for the left padding. Defaults to 0.

    Returns:
        Rank-1 tensor with the same length as ``yseqs``.
    """
    total = shape_util.shape_list(yseqs)[0]
    merged = tf.reshape(yseqs[0], [1])
    start = tf.constant(1, dtype=tf.int32)

    def _has_more(idx, kept, seq, length):
        return tf.less(idx, length)

    def _step(idx, kept, seq, length):
        current = seq[idx]
        # Only grow the output when the value changes.
        if current != kept[-1]:
            kept = tf.concat([kept, [current]], axis=-1)
        return idx + 1, kept, seq, length

    # `kept` grows between iterations, hence the unknown leading dimension.
    invariants = (
        tf.TensorShape([]),
        tf.TensorShape([None]),
        tf.TensorShape([None]),
        tf.TensorShape([]),
    )
    _, merged, _, _ = tf.while_loop(
        _has_more,
        _step,
        loop_vars=[start, merged, yseqs, total],
        shape_invariants=invariants,
    )

    # Pad at the front so the result keeps the input length.
    front_pad = total - shape_util.shape_list(merged)[0]
    return tf.pad(merged, [[front_pad, 0]], constant_values=blank)
def augment(self, spectrogram: tf.Tensor):
    """
    Masking the time channel (shape[0])

    For each of ``self.num_masks`` masks, a width ``t`` is drawn from
    ``[0, self.mask_factor)`` and capped at ``T * self.p_upperbound``; a start
    ``t0`` is then drawn and the ``t`` frames starting at ``t0`` are zeroed.

    Args:
        spectrogram: shape (T, num_feature_bins, V)
    Returns:
        time masked spectrogram
    """
    T, F, V = shape_util.shape_list(spectrogram, out_type=tf.int32)
    # Loop-invariant cap on the mask width: a fraction of the input length.
    max_width = tf.cast(tf.cast(T, dtype=tf.float32) * self.p_upperbound, dtype=tf.int32)
    for _ in range(self.num_masks):
        t = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
        t = tf.minimum(t, max_width)
        # Integer uniform sampling needs minval < maxval. When the mask spans
        # the whole axis (t == T) the range would be empty, so keep maxval at
        # least 1, which forces t0 == 0 in that case.
        t0 = tf.random.uniform([], minval=0, maxval=tf.maximum(T - t, 1), dtype=tf.int32)
        mask = tf.concat(
            [
                tf.ones([t0, F, V], dtype=spectrogram.dtype),
                tf.zeros([t, F, V], dtype=spectrogram.dtype),
                tf.ones([T - t0 - t, F, V], dtype=spectrogram.dtype),
            ],
            axis=0,
        )
        spectrogram = spectrogram * mask
    return spectrogram
def augment(self, spectrogram: tf.Tensor):
    """
    Masking the frequency channels (shape[1])

    For each of ``self.num_masks`` masks, a width ``f`` is drawn from
    ``[0, self.mask_factor)`` and capped at ``F``; a start ``f0`` is then
    drawn and the ``f`` bins starting at ``f0`` are zeroed.

    Args:
        spectrogram: shape (T, num_feature_bins, V)
    Returns:
        frequency masked spectrogram
    """
    T, F, V = shape_util.shape_list(spectrogram, out_type=tf.int32)
    for _ in range(self.num_masks):
        f = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
        f = tf.minimum(f, F)
        # Integer uniform sampling needs minval < maxval. When the mask spans
        # the whole axis (f == F) the range would be empty, so keep maxval at
        # least 1, which forces f0 == 0 in that case.
        f0 = tf.random.uniform([], minval=0, maxval=tf.maximum(F - f, 1), dtype=tf.int32)
        mask = tf.concat(
            [
                tf.ones([T, f0, V], dtype=spectrogram.dtype),
                tf.zeros([T, f, V], dtype=spectrogram.dtype),
                tf.ones([T, F - f0 - f, V], dtype=spectrogram.dtype),
            ],
            axis=1,
        )
        spectrogram = spectrogram * mask
    return spectrogram
def initialize_beam(dynamic=False):
    """Build an empty ``BeamHypothesis`` whose fields are ``tf.TensorArray`` objects.

    Args:
        dynamic: when truthy, every array starts with size 0 and is created
            with ``dynamic_size=True``; otherwise every array has
            ``beam_width`` slots.

    Returns:
        BeamHypothesis with ``score``, ``indices``, ``prediction`` and
        ``states`` arrays, all created with ``clear_after_read=False``.
    """
    capacity = 0 if dynamic else beam_width

    def _new_array(dtype, element_shape):
        # All four fields share sizing and read semantics; only dtype and
        # element shape vary.
        return tf.TensorArray(
            dtype=dtype,
            size=capacity,
            dynamic_size=dynamic,
            element_shape=element_shape,
            clear_after_read=False,
        )

    return BeamHypothesis(
        score=_new_array(tf.float32, tf.TensorShape([])),
        indices=_new_array(tf.int32, tf.TensorShape([])),
        # Predictions have no fixed element shape.
        prediction=_new_array(tf.int32, None),
        # Each state element has the shape of the prediction network's initial state.
        states=_new_array(
            tf.float32,
            tf.TensorShape(shape_util.shape_list(self.predict_net.get_initial_state())),
        ),
    )
def recognize_beam_tflite(
    self,
    signal,
):
    """
    Function to convert to tflite using beam search decoding

    Extracts features from one signal, runs encoder and decoder, applies
    softmax and decodes with ``tf.keras.backend.ctc_decode`` (non-greedy),
    using the beam width from the text featurizer's decoder config.

    Args:
        signal: tf.Tensor with shape [None] indicating a single audio signal

    Return:
        transcript: tf.Tensor of Unicode Code Points with shape [None] and dtype tf.int32
    """
    # Add a leading batch axis of size 1 to the extracted features.
    feats = tf.expand_dims(self.speech_featurizer.tf_extract(signal), axis=0)

    num_frames = shape_util.shape_list(feats)[1]
    reduced_frames = math_util.get_reduced_length(num_frames, self.time_reduction_factor)
    lengths = tf.expand_dims(reduced_frames, axis=0)

    encoded = self.encoder(feats, training=False)
    probs = tf.nn.softmax(self.decoder(encoded, training=False))

    hypotheses = tf.keras.backend.ctc_decode(
        y_pred=probs,
        input_length=lengths,
        greedy=False,
        beam_width=self.text_featurizer.decoder_config.beam_width,
    )
    # Take entry [0][0][0] of the decode result as the label sequence.
    best = tf.cast(hypotheses[0][0][0], dtype=tf.int32)
    return self.text_featurizer.indices2upoints(best)
def call(
    self,
    inputs,
    **kwargs,
):
    """Return the encoding produced by ``self.encode`` for the given inputs.

    Only the shape and dtype of ``inputs`` are used: the second dimension,
    scaled by ``self.alpha`` and shifted by ``self.beta``, and the last
    dimension are passed to ``self.encode``. The result is cast to the dtype
    of ``inputs``.

    Args:
        inputs: rank-3 tensor, shape [B, T, V]
        **kwargs: accepted and ignored.

    Returns:
        Output of ``self.encode`` cast to ``inputs.dtype``.
    """
    # inputs shape [B, T, V]
    input_dims = shape_list(inputs)
    _, max_len, dmodel = input_dims
    encoding_length = max_len * self.alpha + self.beta
    encoding = self.encode(encoding_length, dmodel)
    return tf.cast(encoding, dtype=inputs.dtype)
def call(
    self,
    inputs,
    **kwargs,
):
    """Pad the second axis, then fold it into the last axis.

    The second axis is padded at the end by ``self.padding(length)``. The
    padded tensor is then reshaped so the last axis becomes
    ``self.time_reduction_factor`` times wider, with the second axis inferred.

    Args:
        inputs: rank-3 tensor (the paddings list has three entries).
        **kwargs: accepted and ignored.

    Returns:
        Tensor of shape ``[batch, -1, last_dim * self.time_reduction_factor]``.
    """
    dims = shape_util.shape_list(inputs)
    batch, length, width = dims[0], dims[1], dims[-1]
    # Pad only at the end of the second axis.
    paddings = [[0, 0], [0, self.padding(length)], [0, 0]]
    padded = tf.pad(inputs, paddings)
    return tf.reshape(padded, [batch, -1, width * self.time_reduction_factor])
def recognize(
    self,
    inputs: Dict[str, tf.Tensor],
):
    """
    RNN Transducer Greedy decoding

    Args:
        inputs (Dict[str, tf.Tensor]): must provide "inputs" (a rank-4 batch
            of features) and "inputs_length"

    Returns:
        tf.Tensor: a batch of decoded transcripts
    """
    features = inputs["inputs"]
    # Unpacking into four names requires the features to be rank 4.
    batch_size, _, _, _ = shape_util.shape_list(features)
    initial_state = self.encoder.get_initial_state(batch_size)
    encoded, _ = self.encoder.recognize(features, initial_state)
    # Map input lengths to encoder-output lengths.
    encoded_length = math_util.get_reduced_length(
        inputs["inputs_length"],
        self.time_reduction_factor,
    )
    return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)
def recognize_beam(
    self,
    inputs: Dict[str, tf.Tensor],
    lm: bool = False,
):
    """
    RNN Transducer Beam Search

    Args:
        inputs (Dict[str, tf.Tensor]): must provide "inputs" (a rank-4 batch
            of features) and "inputs_length"
        lm (bool, optional): whether to use language model. Defaults to False.

    Returns:
        tf.Tensor: a batch of decoded transcripts
    """
    features = inputs["inputs"]
    # Unpacking into four names requires the features to be rank 4.
    batch_size, _, _, _ = shape_util.shape_list(features)
    initial_state = self.encoder.get_initial_state(batch_size)
    encoded, _ = self.encoder.recognize(features, initial_state)
    # Map input lengths to encoder-output lengths.
    encoded_length = math_util.get_reduced_length(
        inputs["inputs_length"],
        self.time_reduction_factor,
    )
    return self._perform_beam_search_batch(
        encoded=encoded,
        encoded_length=encoded_length,
        lm=lm,
    )
def call(
    self,
    inputs,
    training=False,
    **kwargs,
):
    """Run the sub-layers in sequence and add the result to ``inputs``.

    Order applied: ``ln`` -> reshape to 4-D -> ``pw_conv_1`` -> ``glu`` ->
    ``dw_conv`` -> ``bn`` -> ``swish`` -> ``pw_conv_2`` -> reshape back to
    3-D -> ``do`` -> ``res_add`` with the original ``inputs``.

    Args:
        inputs: rank-3 tensor (its normalized form is unpacked as B, T, E).
        training: forwarded to every sub-layer that takes it.
        **kwargs: accepted and ignored.

    Returns:
        Output of ``self.res_add([inputs, processed])``.
    """
    hidden = self.ln(inputs, training=training)
    batch, steps, channels = shape_util.shape_list(hidden)

    # Insert a singleton axis so the convolution layers see a 4-D tensor.
    hidden = tf.reshape(hidden, [batch, steps, 1, channels])
    hidden = self.pw_conv_1(hidden, training=training)
    hidden = self.glu(hidden)
    hidden = self.dw_conv(hidden, training=training)
    hidden = self.bn(hidden, training=training)
    hidden = self.swish(hidden)
    hidden = self.pw_conv_2(hidden, training=training)

    # Drop the singleton axis again before dropout and the residual add.
    hidden = tf.reshape(hidden, [batch, steps, channels])
    hidden = self.do(hidden, training=training)
    return self.res_add([inputs, hidden])
def fft_weights(
    nfft,
    fs,
    nfilts,
    width,
    fmin,
    fmax,
    maxlen,
):
    """
    :param nfft: the source FFT size
    :param fs: sampling rate (Hz)
    :param nfilts: the number of output bands required
    :param width: the constant width of each band in Bark
    :param fmin: lower limit of frequencies (Hz)
    :param fmax: upper limit of frequencies (Hz)
    :param maxlen: number of bins to truncate the rows to
    :return: the weight matrix, truncated to ``maxlen`` columns and then
        transposed (the gain is folded into the weights, not returned)

    Generate a matrix of weights to combine FFT bins into Gammatone bins.

    Note about `maxlen` parameter: While wts has nfft columns, the second
    half are all zero. Hence, aud spectrum is::

        fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft))

    `maxlen` truncates the rows to this many bins.

    | (c) 2004-2009 Dan Ellis, based on rastamat/audspec.m
    | (c) 2012 Jason Heeris (Python implementation)
    """
    # Points on the unit circle for the first nfft / 2 + 1 bins, with a
    # leading axis added for broadcasting against the per-filter terms.
    bin_index = tf.cast(tf.range(0, nfft / 2 + 1), tf.complex64)
    ucirc = tf.exp(1j * 2 * pi * bin_index / nfft)[None, ...]

    # Common ERB filter code factored out
    cf_array = erb_space(fmin, fmax, nfilts)[::-1]
    erb_filters = make_erb_filters(fs, cf_array, width)

    # Entries 1-4 are the A11..A14 terms; each gets a trailing axis so it
    # broadcasts against `ucirc`.
    A11, A12, A13, A14 = (erb_filters[k][..., None] for k in (1, 2, 3, 4))
    B2 = erb_filters[8]
    gain = erb_filters[9]

    radius = tf.cast(tf.sqrt(B2), tf.complex64)
    theta = 2 * pi * cf_array / fs
    pole = (radius * tf.exp(1j * theta))[..., None]

    filter_order = 4

    # The product is evaluated in the same left-to-right order as a single
    # chained expression: four numerator terms, the pole term, then the gain.
    numerator = (
        tf.abs(ucirc + A11 * fs)
        * tf.abs(ucirc + A12 * fs)
        * tf.abs(ucirc + A13 * fs)
        * tf.abs(ucirc + A14 * fs)
    )
    pole_term = tf.abs(fs * (pole - ucirc) * (tf.math.conj(pole) - ucirc)) ** (-filter_order)
    weights = numerator * pole_term / tf.cast(gain[..., None], tf.float32)

    # Zero-pad the last axis out to nfft columns, then keep the first `maxlen`.
    missing_cols = nfft - shape_list(weights)[-1]
    weights = tf.pad(weights, [[0, 0], [0, missing_cols]])
    weights = weights[:, 0 : int(maxlen)]

    return tf.transpose(weights, perm=[1, 0])
def merge_two_last_dims(x):
    """Flatten the last two axes of a rank-4 tensor into one.

    Args:
        x: rank-4 tensor (its shape is unpacked into four values).

    Returns:
        Tensor reshaped to ``[dim0, -1, dim2 * dim3]``; the second axis is
        inferred by the reshape.
    """
    batch, _, feat, chan = shape_util.shape_list(x)
    merged_shape = [batch, -1, feat * chan]
    return tf.reshape(x, shape=merged_shape)