Example #1
def update_state(self, numerator_denominator):
    # Accumulate the running numerator and denominator of the metric.
    numerator = numerator_denominator[0]
    denominator = numerator_denominator[1]
    self.numerator.assign_add(
        tf.reduce_sum(tf.cast(numerator, dtype=tf.float32)))
    self.denominator.assign_add(
        tf.reduce_sum(tf.cast(denominator, dtype=tf.float32)))
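A minimal usage sketch, not taken from the original source: it assumes update_state above belongs to a tf.keras.metrics.Metric subclass that owns numerator and denominator weights; the class name RatioMetric and the result method below are hypothetical.

import tensorflow as tf

class RatioMetric(tf.keras.metrics.Metric):
    # Hypothetical wrapper; only update_state is taken from the example above.
    def __init__(self, name="ratio_metric", **kwargs):
        super().__init__(name=name, **kwargs)
        self.numerator = self.add_weight(name="numerator", initializer="zeros")
        self.denominator = self.add_weight(name="denominator",
                                           initializer="zeros")

    def update_state(self, numerator_denominator):
        numerator = numerator_denominator[0]
        denominator = numerator_denominator[1]
        self.numerator.assign_add(
            tf.reduce_sum(tf.cast(numerator, dtype=tf.float32)))
        self.denominator.assign_add(
            tf.reduce_sum(tf.cast(denominator, dtype=tf.float32)))

    def result(self):
        # Guard against division by zero; an assumption, not from the source.
        return self.numerator / tf.maximum(self.denominator, 1.0)

metric = RatioMetric()
metric.update_state((tf.constant([1, 2]), tf.constant([2, 4])))
print(float(metric.result()))  # 0.5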
Example #2
def get_logits_size(features, features_size, logits):
    # Ratio of input frames to output frames, i.e. the model's time reduction.
    time_reduction = tf.cast(tf.shape(features)[1],
                             dtype=tf.float32) / tf.cast(tf.shape(logits)[1],
                                                         dtype=tf.float32)
    # Scale each example's true feature length by the same reduction factor.
    logits_size = tf.cast(tf.cast(features_size, dtype=tf.float32) /
                          time_reduction,
                          dtype=features_size.dtype)

    return logits_size
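A quick, self-contained usage sketch (the tensor shapes are illustrative): a batch whose features have 100 frames is reduced to 25 logits frames, a 4x time reduction, so true feature lengths of 100 and 80 map to logits lengths of 25 and 20.

import tensorflow as tf

features = tf.zeros([2, 100, 80])              # [batch, frames, feature_dim]
logits = tf.zeros([2, 25, 30])                 # [batch, frames / 4, classes]
features_size = tf.constant([100, 80], dtype=tf.int32)

print(get_logits_size(features, features_size, logits).numpy())  # [25 20]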
Example #3
def get_normalized_ctc_loss_without_reduce(*, logits_transposed, logits_size,
                                           encodeds, encodeds_size):
    ctc_loss_without_reduce = tf.nn.ctc_loss(
        labels=encodeds,
        logits=logits_transposed,
        label_length=encodeds_size,
        logit_length=logits_size,
        logits_time_major=True,
        blank_index=0,
    )

    # tf.nn.ctc_loss returns a tensor of shape [batch_size] with negative log
    # probabilities, but each probability may have been computed over a
    # sequence of a different length (the log probabilities turn into sums,
    # each with a different number of summands under independence). For this
    # reason we divide each negative log probability by its logits_size,
    # replacing "logits_size" with "logits_size + 1" to avoid division by zero
    ctc_loss_without_reduce /= tf.cast(logits_size + 1,
                                       ctc_loss_without_reduce.dtype)

    ctc_loss_without_reduce = tf.debugging.check_numerics(
        tensor=ctc_loss_without_reduce,
        message="nan or inf in ctc_loss",
        name="ctc_loss_without_reduce",
    )

    return ctc_loss_without_reduce
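A hedged usage sketch with random logits and toy label sequences; the shapes and values are illustrative only. Logits are time-major ([max_time, batch, vocab_size]) and label values stay in [1, vocab_size) because index 0 is the blank.

import tensorflow as tf

max_time, batch_size, vocab_size = 10, 2, 5

logits_transposed = tf.random.normal([max_time, batch_size, vocab_size])
logits_size = tf.constant([10, 8], dtype=tf.int32)

# Dense labels padded with 0; encodeds_size holds the true label lengths.
encodeds = tf.constant([[1, 2, 3, 0], [2, 4, 0, 0]], dtype=tf.int32)
encodeds_size = tf.constant([3, 2], dtype=tf.int32)

loss = get_normalized_ctc_loss_without_reduce(
    logits_transposed=logits_transposed,
    logits_size=logits_size,
    encodeds=encodeds,
    encodeds_size=encodeds_size,
)
print(loss.shape)  # (2,)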
Example #4
    def fn(x):
        # "table" is a lookup table from the enclosing scope (not shown in
        # this snippet), mapping the elements of x to integer ids.
        y = table.lookup(x)
        # Keep the positive lookup results and pad with zeros at the end so
        # the output has the same number of elements as the input.
        z = tf.concat(
            [
                tf.boolean_mask(y, y > 0),
                tf.zeros(tf.reduce_sum(tf.cast(y <= 0, dtype=tf.int32)),
                         dtype=y.dtype),
            ],
            axis=0,
        )

        return z
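A usage sketch for the snippet above; the original does not show how "table" is built, so the tf.lookup.StaticHashTable below (with -1 as the miss value) is an assumption.

import tensorflow as tf

keys = tf.constant(["a", "b", "c"])
values = tf.constant([1, 2, 3], dtype=tf.int32)
# Hypothetical table: missing keys return -1, which fn then pads away as 0.
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-1)

def fn(x):
    y = table.lookup(x)
    z = tf.concat(
        [
            tf.boolean_mask(y, y > 0),
            tf.zeros(tf.reduce_sum(tf.cast(y <= 0, dtype=tf.int32)),
                     dtype=y.dtype),
        ],
        axis=0,
    )
    return z

print(fn(tf.constant(["a", "x", "c"])).numpy())  # [1 3 0]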
Example #5
def get_logits_encodeds(
    *,
    logits_transposed,
    logits_size,
    greedy_decoder,
    beam_width,
):
    # Unlike tf.nn.ctc_loss, the functions tf.nn.ctc_greedy_decoder and
    # tf.nn.ctc_beam_search_decoder don't have a parameter to signal which
    # index is the blank index. In fact, the tf.nn.ctc_greedy_decoder
    # documentation states that the blank index is (num_classes - 1).

    # The text encoder
    # https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TextEncoder
    # encodes to the range [1, vocab_size), and we took advantage of that by
    # setting blank_index=0 in get_normalized_ctc_loss. We therefore roll
    # logits_transposed with shift=-1, axis=-1, so that the blank index moves
    # from the 0-th position to the last one.
    logits_transposed = tf.roll(logits_transposed, shift=-1, axis=-1)

    if greedy_decoder:
        logits_encodeds, _ = tf.nn.ctc_greedy_decoder(
            inputs=logits_transposed,
            sequence_length=logits_size,
            merge_repeated=True,
        )
    else:
        logits_encodeds, _ = tf.nn.ctc_beam_search_decoder(
            inputs=logits_transposed,
            sequence_length=logits_size,
            beam_width=beam_width,
            top_paths=1,
        )
    logits_encodeds = logits_encodeds[0]

    # Given that the text encoder
    # https://www.tensorflow.org/datasets/api_docs/python/tfds/features/text/TextEncoder
    # encodes to and decodes from the range [1, vocab_size), we shift the
    # output of the CTC decoder, which is in the range [0, vocab_size - 1),
    # back to the correct range [1, vocab_size) by adding one to each value.
    logits_encodeds = tf.sparse.SparseTensor(
        indices=logits_encodeds.indices,
        values=logits_encodeds.values + 1,
        dense_shape=logits_encodeds.dense_shape,
    )

    logits_encodeds = tf.sparse.to_dense(logits_encodeds)
    logits_encodeds = tf.cast(logits_encodeds, tf.int32)

    return logits_encodeds
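A short usage sketch with random time-major logits (shapes are illustrative); it assumes the roll above is tf.roll with shift=-1, axis=-1, as the comment describes.

import tensorflow as tf

max_time, batch_size, vocab_size = 6, 1, 4

logits_transposed = tf.random.normal([max_time, batch_size, vocab_size])
logits_size = tf.constant([6], dtype=tf.int32)

decoded = get_logits_encodeds(
    logits_transposed=logits_transposed,
    logits_size=logits_size,
    greedy_decoder=True,
    beam_width=1,
)
print(decoded.numpy())  # dense int32 ids in [1, vocab_size), shape [1, n]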
def spec2wav(magnitude_spectrogram,
             phase,
             sample_rate,
             nfft=None,
             ref_level_db=20,
             min_level_db=-100,
             window_len_in_sec=0.025,
             step_len_in_sec=0.010,
             exponent=2.0):
    """
    Computes the audio pcm from magnitude spectrogram and phase

    Parameters:
    -----------
    magnitude_spectrogram:  Magnitude spectogram of audio pcm
    phase:                  Phase obtained from stfts of audio pcm
    sample_rate:            Samling frequency of the recorded audio
    ref_level_db:           Ref db level required [defaul 20]
    min_level_db:           Minimum db level required [default -100]
    window_len_in_sec:      float, in seconds
    step_len_in_sec:        float, in seconds
    exponent:               Int, 1 for energy and 2 for power [default 2]
    """

    magnitude_spectrogram = tf.clip_by_value(magnitude_spectrogram,
                                             clip_value_min=0.0,
                                             clip_value_max=1.0)
    magnitude_spectrogram = (magnitude_spectrogram - 1.0) * -min_level_db
    magnitude_spectrogram += ref_level_db
    magnitude_spectrogram = tf.math.pow(
        tf.constant(10.0), magnitude_spectrogram / (exponent * 10))
    magnitude_spectrogram = tf.cast(magnitude_spectrogram, dtype=tf.complex64)

    phase = tf.complex(tf.zeros(tf.shape(phase)), phase)
    phase = tf.math.exp(phase)
    stfts = magnitude_spectrogram * phase

    # Estimating parameters for STFT
    frame_length_in_sample = int(window_len_in_sec * sample_rate)
    frame_step_in_sample = int(step_len_in_sec * sample_rate)
    if nfft is None:
        nfft = frame_length_in_sample

    W = tf.signal.inverse_stft(stfts=stfts,
                               frame_length=frame_length_in_sample,
                               frame_step=frame_step_in_sample,
                               fft_length=nfft,
                               window_fn=tf.signal.inverse_stft_window_fn(
                                   frame_step=frame_step_in_sample,
                                   forward_window_fn=tf.signal.hann_window))
    return W
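A rough round-trip sketch, not from the source: spec2wav appears to undo a db-scaled magnitude normalized to [0, 1], so the forward normalization below (mirroring ref_level_db=20, min_level_db=-100, exponent=2) is an assumption, and the phase comes from a plain tf.signal.stft of a test tone.

import numpy as np
import tensorflow as tf

sample_rate = 16000
t = np.arange(sample_rate, dtype=np.float32) / sample_rate
audio = tf.constant(0.5 * np.sin(2 * np.pi * 440.0 * t))  # 1 s, 440 Hz tone

frame_length = int(0.025 * sample_rate)
frame_step = int(0.010 * sample_rate)
stfts = tf.signal.stft(audio, frame_length=frame_length,
                       frame_step=frame_step, fft_length=frame_length)
phase = tf.math.angle(stfts)

# Assumed forward transform that spec2wav inverts: power in db, shifted by
# ref_level_db and normalized by min_level_db into [0, 1].
ref_level_db, min_level_db, exponent = 20.0, -100.0, 2.0
power_db = exponent * 10.0 * tf.math.log(tf.abs(stfts) + 1e-10) / tf.math.log(10.0)
magnitude = tf.clip_by_value(
    (power_db - ref_level_db - min_level_db) / -min_level_db, 0.0, 1.0)

reconstructed = spec2wav(magnitude, phase, sample_rate)
print(reconstructed.shape)  # roughly the original number of samples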
def get_stft(audio_pcm,
             normalize=False,
             fft_length=512,
             window_len=None,
             step_len=None,
             center=True,
             verbose=0):
    """
    Performs short time fourier transformation of a time domain audio signal

    Parameters
    ----------
    audio_pcm :     A 1D tensor (float32) holding the input audio
    fft_length :    (int in samples) length of the windowed signal after padding,
                    which will be used to extract FFT
    window_len :    (int > 0 and <= fft_length) length of each audio frame in samples [default: fft_length]
    step_len :      (int > 0) length of hop / stride in samples [default: window_length // 4]
    center :        (Bool) Type of padding to be used to match librosa
    verbose :       Verbosity level, 0 = no ouput, > 0 debug prints

    This function returns a complex-valued matrix stfts
    """

    # Checking the input type and perform casting if necessary
    if audio_pcm.dtype != 'float32':
        audio_pcm = tf.cast(audio_pcm, tf.float32)

    # Performing audio normalization
    if normalize:
        audio_pcm = normalize_audio_full_scale(audio_pcm)

    if window_len is None:
        window_len = fft_length

    if step_len is None:
        step_len = int(window_len // 4)

    # Perform padding of the original signal
    if center:
        pad_amount = int(window_len // 2)  # As used by Librosa

        if verbose > 0:
            print(
                f'[INFO] (audio_feature.get_stft)] pad_amount = {pad_amount}')

        audio_pcm = tf.pad(audio_pcm, [[pad_amount, pad_amount]], 'REFLECT')

    # Extracting frames from the audio signal
    frames = tf.signal.frame(audio_pcm, window_len, step_len, pad_end=False)

    if verbose > 0:
        print(
            f'[INFO] (audio_feature.get_stft)] frames.shape = {frames.shape}')

    # Generating the Hann window
    fft_window = tf.signal.hann_window(window_len, periodic=True)

    # Computing the spectrogram; the output is an array of complex numbers
    stfts = tf.signal.rfft(frames * fft_window, fft_length=[fft_length])

    return stfts
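A small usage sketch on a synthetic tone; normalize=False because normalize_audio_full_scale is not shown in this snippet, and the window/hop sizes are illustrative.

import numpy as np
import tensorflow as tf

sample_rate = 16000
t = np.arange(sample_rate, dtype=np.float32) / sample_rate
audio = tf.constant(0.5 * np.sin(2 * np.pi * 440.0 * t))  # 1 s, 440 Hz tone

stfts = get_stft(audio, normalize=False, fft_length=512,
                 window_len=400, step_len=160, center=True)

magnitude = tf.abs(stfts)       # magnitude spectrogram
phase = tf.math.angle(stfts)    # phase, e.g. for spec2wav above
print(stfts.shape)              # (num_frames, fft_length // 2 + 1)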