Example #1
0
def _MakeLogMelFromTensorflowBuiltin(tf_wav_bytes):
    """Decodes WAV bytes and feeds the audio to `audio_lib.AudioToMfcc`.

    The feature computation is gated on a graph assertion that the decoded
    sample rate equals 16000.

    Args:
      tf_wav_bytes: WAV data passed straight to `audio_lib.DecodeWav`.

    Returns:
      Whatever `audio_lib.AudioToMfcc` produces for the decoded audio.
    """
    expected_rate = 16000
    decoded_rate, samples = audio_lib.DecodeWav(tf_wav_bytes)
    rate_check = tf.assert_equal(decoded_rate, expected_rate)
    with tf.control_dependencies([rate_check]):
        # NOTE(review): the trailing 25, 25, 40 are presumably the window
        # size, window stride and coefficient count -- confirm against
        # audio_lib.AudioToMfcc.
        return audio_lib.AudioToMfcc(expected_rate, samples, 25, 25, 40)
Example #2
0
  def FinalizeImage(self):
    """Builds the op that renders all registered subplots into one image.

    Returns:
      The `tf.uint8` result of the `tf.py_func` op named
      'RenderMatplotlibFigures'. The op only runs after an assertion that
      every subplot tensor has the same leading dimension.
    """
    # Without an explicit grid, use a grid of shape (number of subplots, 1).
    grid_shape = self._subplot_grid_shape
    if grid_shape is None:
      grid_shape = (len(self._subplots), 1)

    # py_func only takes a flat list of tensors, so remember which
    # [begin, end) range of that flat list belongs to each subplot.
    slice_bounds = []
    all_tensors = []
    for subplot in self._subplots:
      begin = len(all_tensors)
      end = begin + len(subplot.tensor_list)
      slice_bounds.append((begin, end))
      all_tensors.extend(subplot.tensor_list)

    def PlotFunc(fig, *numpy_data_list):
      # Hands each subplot its own slice of the flat data, on its own axes.
      grid = gridspec.GridSpec(*grid_shape, **self._gridspec_kwargs)
      for index, subplot in enumerate(self._subplots):
        begin, end = slice_bounds[index]
        axes = fig.add_subplot(grid[index])
        subplot.plot_func(fig, axes, *numpy_data_list[begin:end])

    render = functools.partial(_RenderMatplotlibFigures, self._figsize,
                               self._max_outputs, PlotFunc)
    leading_dims = [tf.shape(tensor)[0] for tensor in all_tensors]
    tensor_count = len(all_tensors)
    # Every tensor's leading dimension must match the first tensor's.
    same_leading_dim = tf.assert_equal(
        leading_dims, [leading_dims[0]] * tensor_count, summarize=tensor_count)
    with tf.control_dependencies([same_leading_dim]):
      return tf.py_func(
          render, all_tensors, tf.uint8, name='RenderMatplotlibFigures')
    def _check_paddings(self, paddings):
        """Returns `paddings` with two validity assertions attached.

        The assertions require that at least one element of `paddings` equals
        0.0, and that each slice produced by unstacking `paddings` along its
        first axis is non-decreasing.

        Args:
          paddings: Tensor to validate.

        Returns:
          The output of `py_utils.with_dependencies` for `paddings` and the
          two assertions.
        """
        with tf.name_scope('check_paddings'):
            # One boolean per unstacked slice, gathered into a single tensor.
            slices_non_decreasing = tf.stack(
                [tf.is_non_decreasing(row) for row in tf.unstack(paddings)])
            has_zero = tf.reduce_any(tf.equal(paddings, 0.0))
            checks = [
                tf.assert_equal(
                    has_zero,
                    True,
                    message='must have at least one zero value.'),
                tf.assert_equal(
                    slices_non_decreasing,
                    True,
                    message='must be non-decreasing'),
            ]
            return py_utils.with_dependencies(checks, paddings)
Example #4
0
    def _ReshapeToMono2D(self, pcm_audio_data, paddings):
        """Collapses 2D, 3D or 4D audio input into a 2D mono tensor.

        Rank 2 input is returned untouched. Rank 3 input
        ([batch, time, channel]) has its last axis squeezed away. Rank 4 input
        ([batch, time, packet, channel]) is reshaped to
        [batch, time * packet], and its paddings are repeated once per sample
        of each packet so they line up with the new time axis. The rank 3 and
        rank 4 paths are gated on an assertion that the channel size is 1.

        Args:
          pcm_audio_data: 2D, 3D or 4D audio input whose rank must be known,
            since the branch is picked from the length of its shape.
          paddings: Original paddings shaped to the first two dims of
            pcm_audio_data.

        Returns:
          Tuple of 2D [batch_size, timestep] mono audio data, new paddings.

        Raises:
          ValueError: If the rank of pcm_audio_data is not 2, 3 or 4.
        """
        dims = py_utils.GetShape(pcm_audio_data)
        num_dims = len(dims)

        if num_dims == 2:
            return pcm_audio_data, paddings

        if num_dims == 3:
            # [batch, time, channel]
            mono_check = tf.assert_equal(dims[2], 1)
            with tf.control_dependencies([mono_check]):
                return tf.squeeze(pcm_audio_data, axis=2), paddings

        if num_dims != 4:
            raise ValueError('Illegal pcm_audio_data shape')

        # [batch, time, packet, channel]
        batch, num_packets, packet_len, num_channels = dims
        flat_time = num_packets * packet_len
        mono_check = tf.assert_equal(num_channels, 1)
        with tf.control_dependencies([mono_check]):
            flat_audio = tf.reshape(pcm_audio_data, (batch, flat_time))
            # Move paddings from one value per packet to one value per time
            # step by repeating each packet's padding packet_len times.
            per_packet = tf.expand_dims(paddings, axis=2)
            per_sample = tf.tile(per_packet, [1, 1, packet_len])
            flat_paddings = tf.reshape(per_sample, (batch, flat_time))
            return flat_audio, flat_paddings
Example #5
0
def ExtractLogMelFeatures(wav_bytes_t):
    """Runs raw WAV bytes through a `MelAsrFrontend` and returns its output.

    The decoded audio is multiplied by 32768, has its channel axis squeezed
    away, and gets a leading batch axis of size one. It is then passed, with
    all-zero paddings, to a frontend built from the parameters set below. The
    frontend call is gated on an assertion that the decoded sample rate is
    16000.

    Args:
      wav_bytes_t: Tensor representing raw wav file as a string of bytes,
        passed to `DecodeWav`.

    Returns:
      The `src_inputs` field of the frontend output.
      NOTE(review): earlier documentation described this as three stacked
      log-Mel filterbank energies sub-sampled every three frames. No stacking
      or sub-sampling parameter is set in this function, so confirm that
      against the `MelAsrFrontend` defaults.
    """
    expected_rate = 16000
    decoded_rate, waveform = DecodeWav(wav_bytes_t)
    # NOTE(review): presumably rescales normalized samples to the int16
    # range -- confirm against DecodeWav's output range.
    waveform *= 32768
    # Drop the channel axis (a single channel is expected), then add a batch
    # axis of size one.
    # TODO(drpng): make batches.
    waveform = tf.expand_dims(tf.squeeze(waveform, axis=1), axis=0)

    # These frontend parameters are meant to be used exactly as written.
    frontend_params = asr_frontend.MelAsrFrontend.Params()
    frontend_params.sample_rate = 16000.
    frontend_params.frame_size_ms = 25.
    frontend_params.frame_step_ms = 10.
    frontend_params.num_bins = 80
    frontend_params.lower_edge_hertz = 125.
    frontend_params.upper_edge_hertz = 7600.
    frontend_params.preemph = 0.97
    frontend_params.noise_scale = 0.
    frontend_params.pad_end = False
    frontend = frontend_params.Instantiate()

    rate_check = tf.assert_equal(decoded_rate, expected_rate)
    with tf.control_dependencies([rate_check]):
        frontend_inputs = py_utils.NestedMap(
            src_inputs=waveform, paddings=tf.zeros_like(waveform))
        return frontend.FPropDefaultTheta(frontend_inputs).src_inputs