コード例 #1
0
def merge_repeated(
    yseqs,
    blank=0,
):
    """Collapse runs of consecutive equal labels in a 1-D sequence.

    The first element is always kept; each later element is appended only
    when it differs from the last element kept so far. The collapsed
    sequence is then padded at the front with ``blank`` so that the output
    has as many entries as ``yseqs``.

    Args:
        yseqs: 1-D tensor of labels. It must hold at least one element,
            because ``yseqs[0]`` seeds the output.
        blank: value used for the front padding.

    Returns:
        1-D tensor with the same number of entries as ``yseqs``.
    """
    merged = tf.reshape(yseqs[0], [1])

    total = shape_util.shape_list(yseqs)[0]
    start = tf.constant(1, dtype=tf.int32)

    def _keep_going(idx, merged, labels, total):
        return tf.less(idx, total)

    def _step(idx, merged, labels, total):
        current = labels[idx]
        # Only a change of label extends the output.
        if current != merged[-1]:
            merged = tf.concat([merged, [current]], axis=-1)
        return idx + 1, merged, labels, total

    _, merged, _, _ = tf.while_loop(
        _keep_going,
        _step,
        loop_vars=[start, merged, yseqs, total],
        # `merged` grows between iterations, so its length is left open.
        shape_invariants=(
            tf.TensorShape([]),
            tf.TensorShape([None]),
            tf.TensorShape([None]),
            tf.TensorShape([]),
        ),
    )

    front_pad = total - shape_util.shape_list(merged)[0]
    return tf.pad(merged, [[front_pad, 0]], constant_values=blank)
コード例 #2
0
 def augment(self, spectrogram: tf.Tensor):
     """
     Zero out random spans along the time axis (shape[0]).

     ``self.num_masks`` spans are applied one after another. Each span
     length is drawn below ``self.mask_factor`` and then capped at
     ``self.p_upperbound`` times the number of time steps.

     Args:
         spectrogram: shape (T, num_feature_bins, V)
     Returns:
         time masked spectrogram
     """
     num_frames, num_bins, channels = shape_util.shape_list(spectrogram, out_type=tf.int32)
     for _ in range(self.num_masks):
         span = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
         span_cap = tf.cast(
             tf.cast(num_frames, dtype=tf.float32) * self.p_upperbound,
             dtype=tf.int32,
         )
         span = tf.minimum(span, span_cap)
         # The start is drawn so that the span ends inside the time axis.
         start = tf.random.uniform([], minval=0, maxval=(num_frames - span), dtype=tf.int32)

         keep_before = tf.ones([start, num_bins, channels], dtype=spectrogram.dtype)
         dropped = tf.zeros([span, num_bins, channels], dtype=spectrogram.dtype)
         keep_after = tf.ones([num_frames - start - span, num_bins, channels], dtype=spectrogram.dtype)

         mask = tf.concat([keep_before, dropped, keep_after], axis=0)
         spectrogram = spectrogram * mask
     return spectrogram
コード例 #3
0
 def augment(self, spectrogram: tf.Tensor):
     """
     Zero out random bands along the frequency axis (shape[1]).

     ``self.num_masks`` bands are applied one after another. Each band
     width is drawn below ``self.mask_factor`` and then capped at the number
     of feature bins.

     Args:
         spectrogram: shape (T, num_feature_bins, V)
     Returns:
         frequency masked spectrogram
     """
     num_frames, num_bins, channels = shape_util.shape_list(spectrogram, out_type=tf.int32)
     for _ in range(self.num_masks):
         band = tf.random.uniform([], minval=0, maxval=self.mask_factor, dtype=tf.int32)
         band = tf.minimum(band, num_bins)
         # The first masked bin is drawn so that the band ends inside the axis.
         first_bin = tf.random.uniform([], minval=0, maxval=(num_bins - band), dtype=tf.int32)

         keep_below = tf.ones([num_frames, first_bin, channels], dtype=spectrogram.dtype)
         dropped = tf.zeros([num_frames, band, channels], dtype=spectrogram.dtype)
         keep_above = tf.ones([num_frames, num_bins - first_bin - band, channels], dtype=spectrogram.dtype)

         mask = tf.concat([keep_below, dropped, keep_above], axis=1)
         spectrogram = spectrogram * mask
     return spectrogram
コード例 #4
0
 def initialize_beam(dynamic=False):
     """Create a BeamHypothesis whose four fields are fresh TensorArrays.

     Args:
         dynamic: when true the arrays start empty and may grow; otherwise
             each array is allocated with ``beam_width`` slots.

     Returns:
         BeamHypothesis with ``score``, ``indices``, ``prediction`` and
         ``states`` arrays, all created with ``clear_after_read=False``.
     """
     initial_size = 0 if dynamic else beam_width

     def _new_array(dtype, element_shape):
         # All four fields share every setting except dtype and element shape.
         return tf.TensorArray(
             dtype=dtype,
             size=initial_size,
             dynamic_size=dynamic,
             element_shape=element_shape,
             clear_after_read=False,
         )

     return BeamHypothesis(
         score=_new_array(tf.float32, tf.TensorShape([])),
         indices=_new_array(tf.int32, tf.TensorShape([])),
         # No element shape is fixed for predictions.
         prediction=_new_array(tf.int32, None),
         states=_new_array(
             tf.float32,
             tf.TensorShape(shape_util.shape_list(self.predict_net.get_initial_state())),
         ),
     )
コード例 #5
0
    def recognize_beam_tflite(
        self,
        signal,
    ):
        """
        Beam-search CTC decoding of a single signal, for TFLite conversion.

        Features are extracted from the signal, given a leading batch axis of
        size one, passed through the encoder and decoder, and the softmax
        output is decoded with ``tf.keras.backend.ctc_decode`` using the
        beam width from the text featurizer's decoder config.

        Args:
            signal: tf.Tensor with shape [None] indicating a single audio signal

        Return:
            transcript: tf.Tensor of Unicode Code Points with shape [None] and dtype tf.int32
        """
        extracted = self.speech_featurizer.tf_extract(signal)
        batched = tf.expand_dims(extracted, axis=0)

        num_frames = shape_util.shape_list(batched)[1]
        reduced_frames = math_util.get_reduced_length(num_frames, self.time_reduction_factor)
        input_length = tf.expand_dims(reduced_frames, axis=0)

        encoded = self.encoder(batched, training=False)
        logits = self.decoder(encoded, training=False)
        probs = tf.nn.softmax(logits)

        decoded = tf.keras.backend.ctc_decode(
            y_pred=probs,
            input_length=input_length,
            greedy=False,
            beam_width=self.text_featurizer.decoder_config.beam_width,
        )
        # First returned path, first (and only) batch entry.
        first_path = decoded[0][0]
        indices = tf.cast(first_path[0], dtype=tf.int32)
        return self.text_featurizer.indices2upoints(indices)
コード例 #6
0
 def call(
     self,
     inputs,
     **kwargs,
 ):
     """Return the encoding for ``inputs``, cast to the dtype of ``inputs``.

     The length handed to ``self.encode`` is the second dimension of
     ``inputs`` scaled by ``self.alpha`` and shifted by ``self.beta``; the
     third dimension is passed through unchanged.
     """
     # inputs shape [B, T, V]
     _, seq_len, depth = shape_list(inputs)
     scaled_len = seq_len * self.alpha + self.beta
     encoding = self.encode(scaled_len, depth)
     return tf.cast(encoding, dtype=inputs.dtype)
コード例 #7
0
 def call(
     self,
     inputs,
     **kwargs,
 ):
     """Pad axis 1, then fold groups of steps into the last axis.

     ``inputs`` is padded at the end of axis 1 by ``self.padding(length)``
     and reshaped so that the last dimension becomes
     ``last_dim * self.time_reduction_factor``; the middle dimension is
     inferred by the reshape.
     """
     dims = shape_util.shape_list(inputs)
     batch, length, last_dim = dims[0], dims[1], dims[-1]
     tail_pad = self.padding(length)
     padded = tf.pad(inputs, [[0, 0], [0, tail_pad], [0, 0]])
     return tf.reshape(padded, [batch, -1, last_dim * self.time_reduction_factor])
コード例 #8
0
    def recognize(
        self,
        inputs: Dict[str, tf.Tensor],
    ):
        """
        RNN Transducer Greedy decoding
        Args:
            inputs (Dict[str, tf.Tensor]): must provide "inputs" (a 4-D batch
                of features) and "inputs_length"

        Returns:
            tf.Tensor: result of the batched greedy decoding
        """
        features = inputs["inputs"]
        batch_size, _, _, _ = shape_util.shape_list(features)
        initial_state = self.encoder.get_initial_state(batch_size)
        encoded, _ = self.encoder.recognize(features, initial_state)
        # Lengths are shrunk by the same factor as the time axis.
        encoded_length = math_util.get_reduced_length(
            inputs["inputs_length"],
            self.time_reduction_factor,
        )
        return self._perform_greedy_batch(encoded=encoded, encoded_length=encoded_length)
コード例 #9
0
    def recognize_beam(
        self,
        inputs: Dict[str, tf.Tensor],
        lm: bool = False,
    ):
        """
        RNN Transducer Beam Search
        Args:
            inputs (Dict[str, tf.Tensor]): must provide "inputs" (a 4-D batch
                of features) and "inputs_length"
            lm (bool, optional): whether to use language model. Defaults to False.

        Returns:
            tf.Tensor: result of the batched beam search
        """
        features = inputs["inputs"]
        batch_size, _, _, _ = shape_util.shape_list(features)
        initial_state = self.encoder.get_initial_state(batch_size)
        encoded, _ = self.encoder.recognize(features, initial_state)
        # Lengths are shrunk by the same factor as the time axis.
        encoded_length = math_util.get_reduced_length(
            inputs["inputs_length"],
            self.time_reduction_factor,
        )
        return self._perform_beam_search_batch(
            encoded=encoded,
            encoded_length=encoded_length,
            lm=lm,
        )
コード例 #10
0
 def call(
     self,
     inputs,
     training=False,
     **kwargs,
 ):
     """Run the convolution stack and add the result back onto ``inputs``.

     The normalised input gets a singleton third axis for the convolution
     layers and is reshaped back to its three original dimensions before
     dropout and the residual addition.
     """
     x = self.ln(inputs, training=training)
     batch, steps, dim = shape_util.shape_list(x)

     # Insert a singleton axis for the convolution layers.
     x = tf.reshape(x, [batch, steps, 1, dim])
     x = self.pw_conv_1(x, training=training)
     x = self.glu(x)
     x = self.dw_conv(x, training=training)
     x = self.bn(x, training=training)
     x = self.swish(x)
     x = self.pw_conv_2(x, training=training)

     # Drop the singleton axis again.
     x = tf.reshape(x, [batch, steps, dim])
     x = self.do(x, training=training)
     return self.res_add([inputs, x])
コード例 #11
0
def fft_weights(
    nfft,
    fs,
    nfilts,
    width,
    fmin,
    fmax,
    maxlen,
):
    """
    Generate a matrix of weights to combine FFT bins into Gammatone bins.

    :param nfft: the source FFT size
    :param fs: sampling rate (Hz)
    :param nfilts: the number of output bands required
    :param width: the constant width of each band in Bark
    :param fmin: lower limit of frequencies (Hz)
    :param fmax: upper limit of frequencies (Hz)
    :param maxlen: number of bins to truncate the rows to

    :return: the weight matrix, truncated to `maxlen` bins and then
             transposed so that the bin axis comes first

    Note about `maxlen` parameter: While wts has nfft columns, the second half
    are all zero. Hence, aud spectrum is::

        fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft))

    `maxlen` truncates the rows to this many bins.

    | (c) 2004-2009 Dan Ellis [email protected]  based on rastamat/audspec.m
    | (c) 2012 Jason Heeris (Python implementation)
    """
    half_bins = tf.cast(tf.range(0, nfft / 2 + 1), tf.complex64)
    ucirc = tf.exp(1j * 2 * pi * half_bins / nfft)[None, ...]

    # Common ERB filter code factored out
    centre_freqs = erb_space(fmin, fmax, nfilts)[::-1]
    erb_filters = make_erb_filters(fs, centre_freqs, width)

    # Entries 1-4 are the A1x coefficients, 8 is B2 and 9 is the gain.
    # The coefficients get a trailing axis so they combine with `ucirc`.
    A11 = erb_filters[1][..., None]
    A12 = erb_filters[2][..., None]
    A13 = erb_filters[3][..., None]
    A14 = erb_filters[4][..., None]
    B2 = erb_filters[8]
    gain = erb_filters[9]

    radius = tf.cast(tf.sqrt(B2), tf.complex64)
    theta = 2 * pi * centre_freqs / fs
    pole = (radius * tf.exp(1j * theta))[..., None]

    order = 4

    zero_term = (
        tf.abs(ucirc + A11 * fs)
        * tf.abs(ucirc + A12 * fs)
        * tf.abs(ucirc + A13 * fs)
        * tf.abs(ucirc + A14 * fs)
    )
    pole_term = tf.abs(fs * (pole - ucirc) * (tf.math.conj(pole) - ucirc)) ** (-order)
    weights = zero_term * pole_term / tf.cast(gain[..., None], tf.float32)

    # Zero-fill the last axis up to nfft entries before truncating.
    missing = nfft - shape_list(weights)[-1]
    weights = tf.pad(weights, [[0, 0], [0, missing]])

    weights = weights[:, 0 : int(maxlen)]

    return tf.transpose(weights, perm=[1, 0])
コード例 #12
0
def merge_two_last_dims(x):
    """Flatten the last two axes of a 4-D tensor into one.

    The first dimension is kept, the last dimension becomes the product of
    the original last two, and the middle dimension is inferred by the
    reshape.
    """
    batch, _, feat, chan = shape_util.shape_list(x)
    merged_dim = feat * chan
    return tf.reshape(x, shape=[batch, -1, merged_dim])