Example #1
    def test_basic_mono(self):
        signal = np.arange(6)
        frame_length = 3
        frame_step = 2

        with self.session(use_gpu=True):
            for rank in range(5):
                nd_signal = np.reshape(signal, (1, ) * rank + signal.shape)

                # With padding, we pad the last frame with pad_value.
                result = shape_ops.frame(nd_signal,
                                         frame_length,
                                         frame_step,
                                         pad_end=True,
                                         pad_value=99).eval()
                expected_inner_frames = np.array([[0, 1, 2], [2, 3, 4],
                                                  [4, 5, 99]])
                expected = np.reshape(expected_inner_frames, (1, ) * rank +
                                      expected_inner_frames.shape)
                self.assertAllEqual(expected, result)

                # Without padding, we drop the last frame.
                expected_inner_frames = np.array([[0, 1, 2], [2, 3, 4]])
                expected = np.reshape(expected_inner_frames, (1, ) * rank +
                                      expected_inner_frames.shape)
                result = shape_ops.frame(nd_signal,
                                         frame_length,
                                         frame_step,
                                         pad_end=False).eval()
                self.assertAllEqual(expected, result)
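
The expected values in this test follow from the framing rule: frame `i` covers
samples `[i * frame_step, i * frame_step + frame_length)`, and with
`pad_end=True` the signal is padded with `pad_value` so the last frame is
complete. A minimal NumPy sketch of that rule (an illustration, not part of the
test suite):

import numpy as np

def frame_1d(signal, frame_length, frame_step, pad_value=99):
    # Reference framing of a 1-D signal with pad_end=True semantics.
    num_frames = -(-len(signal) // frame_step)  # ceil(samples / frame_step)
    pad_len = (num_frames - 1) * frame_step + frame_length - len(signal)
    padded = np.concatenate([signal, np.full(max(pad_len, 0), pad_value)])
    return np.stack([padded[i * frame_step:i * frame_step + frame_length]
                     for i in range(num_frames)])

print(frame_1d(np.arange(6), 3, 2))  # [[0 1 2] [2 3 4] [4 5 99]]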
Example #2
    def test_axis(self):
        signal = np.reshape(np.arange(16), (2, 4, 2))
        result = shape_ops.frame(signal,
                                 frame_length=2,
                                 frame_step=2,
                                 pad_end=True,
                                 axis=1)
        expected = np.reshape(np.arange(16), (2, 2, 2, 2))
        self.assertAllEqual(expected, self.evaluate(result))

        result = shape_ops.frame(signal,
                                 frame_length=2,
                                 frame_step=1,
                                 pad_end=True,
                                 axis=1)
        expected = [[[[0, 1], [2, 3]], [[2, 3], [4, 5]], [[4, 5], [6, 7]],
                     [[6, 7], [0, 0]]],
                    [[[8, 9], [10, 11]], [[10, 11], [12, 13]],
                     [[12, 13], [14, 15]], [[14, 15], [0, 0]]]]
        self.assertAllEqual(expected, self.evaluate(result))

        result = shape_ops.frame(signal,
                                 frame_length=3,
                                 frame_step=1,
                                 pad_end=True,
                                 axis=1)
        expected = [[[[0, 1], [2, 3], [4, 5]], [[2, 3], [4, 5], [6, 7]],
                     [[4, 5], [6, 7], [0, 0]], [[6, 7], [0, 0], [0, 0]]],
                    [[[8, 9], [10, 11], [12, 13]],
                     [[10, 11], [12, 13], [14, 15]],
                     [[12, 13], [14, 15], [0, 0]], [[14, 15], [0, 0], [0, 0]]]]
        self.assertAllEqual(expected, self.evaluate(result))
Example #3
  def test_axis(self):
    signal = np.reshape(np.arange(16), (2, 4, 2))
    with self.session(use_gpu=True):
      result = shape_ops.frame(signal, frame_length=2, frame_step=2,
                               pad_end=True, axis=1)
      expected = np.reshape(np.arange(16), (2, 2, 2, 2))
      self.assertAllEqual(expected, self.evaluate(result))

      result = shape_ops.frame(signal, frame_length=2, frame_step=1,
                               pad_end=True, axis=1)
      expected = [[[[0, 1], [2, 3]],
                   [[2, 3], [4, 5]],
                   [[4, 5], [6, 7]],
                   [[6, 7], [0, 0]]],
                  [[[8, 9], [10, 11]],
                   [[10, 11], [12, 13]],
                   [[12, 13], [14, 15]],
                   [[14, 15], [0, 0]]]]
      self.assertAllEqual(expected, self.evaluate(result))

      result = shape_ops.frame(signal, frame_length=3, frame_step=1,
                               pad_end=True, axis=1)
      expected = [[[[0, 1], [2, 3], [4, 5]],
                   [[2, 3], [4, 5], [6, 7]],
                   [[4, 5], [6, 7], [0, 0]],
                   [[6, 7], [0, 0], [0, 0]]],
                  [[[8, 9], [10, 11], [12, 13]],
                   [[10, 11], [12, 13], [14, 15]],
                   [[12, 13], [14, 15], [0, 0]],
                   [[14, 15], [0, 0], [0, 0]]]]
      self.assertAllEqual(expected, self.evaluate(result))
Example #4
  def test_complex_shape(self):
    signal = np.vstack([np.arange(6),
                        np.arange(6) + 10,
                        np.arange(6) + 20,
                        np.arange(6) + 30,
                        np.arange(6) + 40,
                        np.arange(6) + 50])
    signal = np.reshape(signal, (2, 1, 3, 1, 6))
    frame_length = 3
    frame_step = 2

    # With padding, we pad the last frame with pad_value.
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99)
    # Resulting shape is (2, 1, 3, 1, 3, 3).
    expected = [[[[[[0, 1, 2], [2, 3, 4], [4, 5, 99]]],
                  [[[10, 11, 12], [12, 13, 14], [14, 15, 99]]],
                  [[[20, 21, 22], [22, 23, 24], [24, 25, 99]]]]],
                [[[[[30, 31, 32], [32, 33, 34], [34, 35, 99]]],
                  [[[40, 41, 42], [42, 43, 44], [44, 45, 99]]],
                  [[[50, 51, 52], [52, 53, 54], [54, 55, 99]]]]]]
    self.assertAllEqual(expected, result)

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False)
    # Resulting shape is (2, 1, 3, 1, 3, 2).
    expected = [[[[[[0, 1, 2], [2, 3, 4]]],
                  [[[10, 11, 12], [12, 13, 14]]],
                  [[[20, 21, 22], [22, 23, 24]]]]],
                [[[[[30, 31, 32], [32, 33, 34]]],
                  [[[40, 41, 42], [42, 43, 44]]],
                  [[[50, 51, 52], [52, 53, 54]]]]]]
    self.assertAllEqual(expected, result)
Example #5
  def test_basic_stereo(self):
    signal = np.vstack([np.arange(6),
                        np.arange(6) + 10])
    frame_length = 3
    frame_step = 2

    with self.session(use_gpu=True):
      for rank in range(5):
        nd_signal = np.reshape(signal, (1,) * rank + signal.shape)

        # With padding, we pad the last frame with pad_value.
        result = shape_ops.frame(nd_signal, frame_length, frame_step,
                                 pad_end=True, pad_value=99).eval()
        expected_inner_frames = np.array([
            [[0, 1, 2], [2, 3, 4], [4, 5, 99]],
            [[10, 11, 12], [12, 13, 14], [14, 15, 99]]])
        expected = np.reshape(
            expected_inner_frames, (1,) * rank + expected_inner_frames.shape)
        self.assertAllEqual(expected, result)

        # Without padding, we drop the last frame.
        expected_inner_frames = np.array([[[0, 1, 2], [2, 3, 4]],
                                          [[10, 11, 12], [12, 13, 14]]])
        expected = np.reshape(
            expected_inner_frames, (1,) * rank + expected_inner_frames.shape)
        result = shape_ops.frame(nd_signal, frame_length, frame_step,
                                 pad_end=False).eval()
        self.assertAllEqual(expected, result)
Example #6
  def test_complex_shape(self):
    signal = np.vstack([np.arange(6),
                        np.arange(6) + 10,
                        np.arange(6) + 20,
                        np.arange(6) + 30,
                        np.arange(6) + 40,
                        np.arange(6) + 50])
    signal = np.reshape(signal, (2, 1, 3, 1, 6))
    frame_length = 3
    frame_step = 2

    with self.session(use_gpu=True):
      # With padding, we pad the last frame with pad_value.
      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=True, pad_value=99).eval()
      # Resulting shape is (2, 1, 3, 1, 3, 3).
      expected = [[[[[[0, 1, 2], [2, 3, 4], [4, 5, 99]]],
                    [[[10, 11, 12], [12, 13, 14], [14, 15, 99]]],
                    [[[20, 21, 22], [22, 23, 24], [24, 25, 99]]]]],
                  [[[[[30, 31, 32], [32, 33, 34], [34, 35, 99]]],
                    [[[40, 41, 42], [42, 43, 44], [44, 45, 99]]],
                    [[[50, 51, 52], [52, 53, 54], [54, 55, 99]]]]]]
      self.assertAllEqual(expected, result)

      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False).eval()
      # Resulting shape is (2, 1, 3, 1, 3, 2).
      expected = [[[[[[0, 1, 2], [2, 3, 4]]],
                    [[[10, 11, 12], [12, 13, 14]]],
                    [[[20, 21, 22], [22, 23, 24]]]]],
                  [[[[[30, 31, 32], [32, 33, 34]]],
                    [[[40, 41, 42], [42, 43, 44]]],
                    [[[50, 51, 52], [52, 53, 54]]]]]]
      self.assertAllEqual(expected, result)
Example #7
    def test_basic_stereo(self):
        signal = np.vstack([np.arange(6), np.arange(6) + 10])
        frame_length = 3
        frame_step = 2

        for rank in range(5):
            nd_signal = np.reshape(signal, (1, ) * rank + signal.shape)

            # With padding, we pad the last frame with pad_value.
            result = shape_ops.frame(nd_signal,
                                     frame_length,
                                     frame_step,
                                     pad_end=True,
                                     pad_value=99)
            expected_inner_frames = np.array([[[0, 1, 2], [2, 3, 4],
                                               [4, 5, 99]],
                                              [[10, 11, 12], [12, 13, 14],
                                               [14, 15, 99]]])
            expected = np.reshape(expected_inner_frames,
                                  (1, ) * rank + expected_inner_frames.shape)
            self.assertAllEqual(expected, result)

            # Without padding, we drop the last frame.
            expected_inner_frames = np.array([[[0, 1, 2], [2, 3, 4]],
                                              [[10, 11, 12], [12, 13, 14]]])
            expected = np.reshape(expected_inner_frames,
                                  (1, ) * rank + expected_inner_frames.shape)
            result = shape_ops.frame(nd_signal,
                                     frame_length,
                                     frame_step,
                                     pad_end=False)
            self.assertAllEqual(expected, result)
Example #8
    def test_window_larger_than_signal(self):
        signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
        frame_length = 4
        frame_step = 1

        result = shape_ops.frame(signal,
                                 frame_length,
                                 frame_step,
                                 pad_end=True,
                                 pad_value=99)
        self.assertAllClose([[[1, 2, 99, 99], [2, 99, 99, 99]],
                             [[11, 12, 99, 99], [12, 99, 99, 99]]], result)

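        # frame_length (4) exceeds the 2 samples along the last axis, so with
        # pad_end=False no complete frame fits and the frames axis has size 0.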
        result = shape_ops.frame(signal,
                                 frame_length,
                                 frame_step,
                                 pad_end=False)
        self.assertEqual((2, 0, 4), result.shape)

        frame_step = 2
        result = shape_ops.frame(signal,
                                 frame_length,
                                 frame_step,
                                 pad_end=True,
                                 pad_value=99)
        self.assertAllClose([[[1, 2, 99, 99]], [[11, 12, 99, 99]]], result)

        result = shape_ops.frame(signal,
                                 frame_length,
                                 frame_step,
                                 pad_end=False)
        self.assertEqual((2, 0, 4), result.shape)
Example #9
  def test_length_zero(self):
    signal = constant_op.constant([], dtype=dtypes.float32)
    frame_length = 2
    frame_step = 1

    result = self.evaluate(shape_ops.frame(
        signal, frame_length, frame_step, pad_end=True, pad_value=99))
    self.assertEqual((0, 2), result.shape)

    result = self.evaluate(
        shape_ops.frame(signal, frame_length, frame_step, pad_end=False))
    self.assertEqual((0, 2), result.shape)
Example #10
  def test_length_zero(self):
    signal = constant_op.constant([], dtype=dtypes.float32)
    frame_length = 2
    frame_step = 1

    with self.session(use_gpu=True):
      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=True, pad_value=99).eval()
      self.assertEqual((0, 2), result.shape)

      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False).eval()
      self.assertEqual((0, 2), result.shape)
Example #12
    def test_preserves_type(self):
        signal = math_ops.range(10, dtype=dtypes.float64)
        frame_length = 2
        frame_step = 3

        result = shape_ops.frame(signal, frame_length, frame_step)
        self.assertEqual(result.dtype, signal.dtype)
Example #13
def stdct(signals, frame_length, frame_step, fft_length=None,
          window_fn=window_ops.hann_window,
          pad_end=False, name=None):
  """Short-time discrete cosine transform of `signals`.

  Argument/s:
    signals: A `[..., samples]` `Tensor` of real-valued signals.
    frame_length: An integer scalar `Tensor`. The window length in samples.
    frame_step: An integer scalar `Tensor`. The number of samples to step.
    fft_length: An integer scalar `Tensor`. The DCT length. If not provided,
      uses the smallest power of 2 enclosing `frame_length`.
    window_fn: A callable that takes a window length and a `dtype` keyword
      argument and returns a `[window_length]` `Tensor` of samples. If set to
      `None`, no windowing is used.
    pad_end: Whether to pad the end of `signals` with zeros when the provided
      frame length and step produce a frame that lies partially past its end.
    name: An optional name for the operation.

  Returns:
    A `[..., frames, fft_length]` `Tensor` holding the DCT of each windowed
    frame.
  """
  with ops.name_scope(name, 'stdct', [signals, frame_length,
                                     frame_step]):
    signals = ops.convert_to_tensor(signals, name='signals')
    signals.shape.with_rank_at_least(1)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)
    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
    frame_step.shape.assert_has_rank(0)

    if fft_length is None:
      fft_length = _enclosing_power_of_two(frame_length)
    else:
      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')

    framed_signals = shape_ops.frame(
        signals, frame_length, frame_step, pad_end=pad_end)

    # Optionally window the framed signals.
    if window_fn is not None:
      window = window_fn(frame_length, dtype=framed_signals.dtype)
      framed_signals *= window

    return dct_ops.dct(framed_signals, n=fft_length)
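
For context, `stdct` composes the same frame-and-window pipeline as `stft`
below, but applies a DCT instead of an RFFT. A rough equivalent using only the
public `tf.signal` API (a sketch assuming TF 2.x; the hop and window choices
here are illustrative):

import tensorflow as tf

waveform = tf.random.normal([16000])                      # [samples]
frames = tf.signal.frame(waveform, 512, 256)              # [frames, 512]
frames *= tf.signal.hann_window(512, dtype=frames.dtype)  # optional windowing
coeffs = tf.signal.dct(frames, type=2, n=512)             # [frames, 512]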
Example #14
  def test_preserves_type(self):
    signal = math_ops.range(10, dtype=dtypes.float64)
    frame_length = 2
    frame_step = 3

    with self.session(use_gpu=True):
      result = shape_ops.frame(signal, frame_length, frame_step)
      self.assertEqual(result.dtype, signal.dtype)
Example #15
def stft(signals, frame_length, frame_step, fft_length=None,
         window_fn=window_ops.hann_window,
         pad_end=False, name=None):
  """Computes the [Short-time Fourier Transform][stft] of `signals`.

  Implemented with TPU/GPU-compatible ops and supports gradients.

  Args:
    signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
      signals.
    frame_length: An integer scalar `Tensor`. The window length in samples.
    frame_step: An integer scalar `Tensor`. The number of samples to step.
    fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
      If not provided, uses the smallest power of 2 enclosing `frame_length`.
    window_fn: A callable that takes a window length and a `dtype` keyword
      argument and returns a `[window_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, no windowing is used.
    pad_end: Whether to pad the end of `signals` with zeros when the provided
      frame length and step produce a frame that lies partially past its end.
    name: An optional name for the operation.

  Returns:
    A `[..., frames, fft_unique_bins]` `Tensor` of `complex64`/`complex128`
    STFT values where `fft_unique_bins` is `fft_length // 2 + 1` (the unique
    components of the FFT).

  Raises:
    ValueError: If `signals` is not at least rank 1, `frame_length` is
      not scalar, or `frame_step` is not scalar.

  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
  """
  with ops.name_scope(name, 'stft', [signals, frame_length,
                                     frame_step]):
    signals = ops.convert_to_tensor(signals, name='signals')
    signals.shape.with_rank_at_least(1)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)
    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
    frame_step.shape.assert_has_rank(0)

    if fft_length is None:
      fft_length = _enclosing_power_of_two(frame_length)
    else:
      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')

    framed_signals = shape_ops.frame(
        signals, frame_length, frame_step, pad_end=pad_end)

    # Optionally window the framed signals.
    if window_fn is not None:
      window = window_fn(frame_length, dtype=framed_signals.dtype)
      framed_signals *= window

    # fft_ops.rfft produces the (fft_length/2 + 1) unique components of the
    # FFT of the real windowed signals in framed_signals.
    return fft_ops.rfft(framed_signals, [fft_length])
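
The function above mirrors the public `tf.signal.stft`, so a quick shape check
against that API illustrates the documented output (a sketch assuming TF 2.x
eager execution and random input):

import tensorflow as tf

signals = tf.random.normal([2, 16000])  # [batch, samples]
spec = tf.signal.stft(signals, frame_length=400, frame_step=160)
# fft_length defaults to 512 (the smallest power of two >= 400), so
# spec.shape == [2, 98, 257] and spec.dtype == tf.complex64.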
Example #16
def stft(signals, frame_length, frame_step, fft_length=None,
         window_fn=window_ops.hann_window,
         pad_end=False, name=None):
  """Computes the [Short-time Fourier Transform][stft] of `signals`.

  Implemented with GPU-compatible ops and supports gradients.

  Args:
    signals: A `[..., samples]` `float32` `Tensor` of real-valued signals.
    frame_length: An integer scalar `Tensor`. The window length in samples.
    frame_step: An integer scalar `Tensor`. The number of samples to step.
    fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
      If not provided, uses the smallest power of 2 enclosing `frame_length`.
    window_fn: A callable that takes a window length and a `dtype` keyword
      argument and returns a `[window_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, no windowing is used.
    pad_end: Whether to pad the end of `signals` with zeros when the provided
      frame length and step produce a frame that lies partially past its end.
    name: An optional name for the operation.

  Returns:
    A `[..., frames, fft_unique_bins]` `Tensor` of `complex64` STFT values where
    `fft_unique_bins` is `fft_length // 2 + 1` (the unique components of the
    FFT).

  Raises:
    ValueError: If `signals` is not at least rank 1, `frame_length` is
      not scalar, or `frame_step` is not scalar.

  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
  """
  with ops.name_scope(name, 'stft', [signals, frame_length,
                                     frame_step]):
    signals = ops.convert_to_tensor(signals, name='signals')
    signals.shape.with_rank_at_least(1)
    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
    frame_length.shape.assert_has_rank(0)
    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
    frame_step.shape.assert_has_rank(0)

    if fft_length is None:
      fft_length = _enclosing_power_of_two(frame_length)
    else:
      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')

    framed_signals = shape_ops.frame(
        signals, frame_length, frame_step, pad_end=pad_end)

    # Optionally window the framed signals.
    if window_fn is not None:
      window = window_fn(frame_length, dtype=framed_signals.dtype)
      framed_signals *= window

    # fft_ops.rfft produces the (fft_length/2 + 1) unique components of the
    # FFT of the real windowed signals in framed_signals.
    return fft_ops.rfft(framed_signals, [fft_length])
Example #17
    def test_invalid_inputs(self):
        # Rank 0 input signal.
        with self.assertRaises(ValueError):
            shape_ops.frame(1, 1, 1)

        # If the rank is unknown, do not raise an exception.
        shape_ops.frame(array_ops.placeholder(dtypes.float32), 1, 1)

        # Non-scalar frame_length.
        with self.assertRaises(ValueError):
            shape_ops.frame([1], [1], 1)

        # Non-scalar frame_step.
        with self.assertRaises(ValueError):
            shape_ops.frame([1], 1, [1])

        # Non-scalar pad_value.
        with self.assertRaises(ValueError):
            shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
Example #18
 def test_gradient_numerical(self):
     with self.session(use_gpu=True):
         signal_shape = (2, 128)
         signal = array_ops.ones(signal_shape)
         frame_length = 33
         frame_step = 9
         frames = shape_ops.frame(signal, frame_length, frame_step)
         error = test.compute_gradient_error(signal, signal_shape, frames,
                                             frames.shape.as_list())
         self.assertLess(error, 2e-5)
Example #19
 def test_gradient_numerical(self):
   with self.session(use_gpu=True):
     signal_shape = (2, 128)
     signal = array_ops.ones(signal_shape)
     frame_length = 33
     frame_step = 9
     frames = shape_ops.frame(signal, frame_length, frame_step)
     error = test.compute_gradient_error(
         signal, signal_shape, frames, frames.shape.as_list())
     self.assertLess(error, 2e-5)
Example #20
  def test_invalid_inputs(self):
    # Rank 0 input signal.
    with self.assertRaises(ValueError):
      shape_ops.frame(1, 1, 1)

    # If the rank is unknown, do not raise an exception.
    shape_ops.frame(array_ops.placeholder(dtypes.float32), 1, 1)

    # Non-scalar frame_length.
    with self.assertRaises(ValueError):
      shape_ops.frame([1], [1], 1)

    # Non-scalar frame_step.
    with self.assertRaises(ValueError):
      shape_ops.frame([1], 1, [1])

    # Non-scalar pad_value.
    with self.assertRaises(ValueError):
      shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
Example #21
  def test_shape_inference(self):
    if context.executing_eagerly():
      return
    signal = array_ops.zeros((1, 1), dtype=dtypes.int32)
    frame_length = 2
    frame_step = 1
    # Shape inference is able to detect the rank and inner-most dimension
    # if frame_length is known at graph definition time.
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99)
    self.assertEqual([1, 1, 2], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False)
    self.assertEqual([1, 0, 2], result.shape.as_list())

    # If frame_length is not known, rank and (known) outer and inner dimensions
    # are inferred.
    signal = array_ops.zeros([1, 2, 3, 4], dtype=dtypes.int32)
    frame_length = array_ops.placeholder_with_default(
        ops.convert_to_tensor(0, dtypes.int32), shape=[])
    frame_step = 1
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=1)
    self.assertEqual([1, 2, None, 3, 4], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=1)
    self.assertEqual([1, None, None, 3, 4], result.shape.as_list())

    # If frame_length and the inner-most dimension are known, the rank, inner
    # dimensions, and known outer dimensions are inferred.
    signal = array_ops.placeholder_with_default(
        array_ops.zeros((0, 5, 0, 20, 5, 3), dtype=dtypes.int32),
        shape=[None, 5, None, 20, 5, 3])
    frame_length = 4
    frame_step = 3
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=3)
    self.assertEqual([None, 5, None, 7, 4, 5, 3], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=3)
    self.assertEqual([None, 5, None, 6, 4, 5, 3], result.shape.as_list())

    # Test that shape inference is consistent with actual returned shapes for
    # small values of signal_length, frame_length, frame_step, and pad_end in
    # [True, False].
    frame_step = 1
    for signal_length in range(2):
      signal = [0] * signal_length
      for frame_length in range(2):
        for pad_end in [False, True]:
          op = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=pad_end, pad_value=99)
          result = self.evaluate(op)
          self.assertEqual(op.shape.as_list(), list(result.shape))
Example #22
 def test_gradient_numerical(self):
     if context.executing_eagerly():
         return
     with self.session():
         signal_shape = (2, 128)
         signal = array_ops.ones(signal_shape)
         frame_length = 33
         frame_step = 9
         frames = shape_ops.frame(signal, frame_length, frame_step)
         error = test.compute_gradient_error(signal, signal_shape, frames,
                                             frames.shape.as_list())
         self.assertLess(error, 2e-5)
Example #23
 def test_constant_folding(self):
   """frame should be constant foldable for constant inputs."""
   for pad_end in [True, False]:
     g = ops.Graph()
     with g.as_default():
       frame_length, frame_step = 32, 16
       signal_shape = (2, 128)
       signal = array_ops.ones(signal_shape)
       frames = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=pad_end)
       rewritten_graph = test_util.grappler_optimize(g, [frames])
       self.assertEqual(1, len(rewritten_graph.node))
Example #24
  def test_invalid_inputs(self):
    # Rank 0 input signal.
    with self.assertRaises(ValueError):
      shape_ops.frame(1, 1, 1)

    if not context.executing_eagerly():
      # If the rank is unknown, do not raise an exception.
      shape_ops.frame(array_ops.placeholder_with_default(
          1, shape=tensor_shape.TensorShape(None)), 1, 1)

    # Non-scalar frame_length.
    with self.assertRaises(ValueError):
      shape_ops.frame([1], [1], 1)

    # Non-scalar frame_step.
    with self.assertRaises(ValueError):
      shape_ops.frame([1], 1, [1])

    # Non-scalar pad_value.
    with self.assertRaises(ValueError):
      shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
Example #25
    def test_mapping_of_indices_without_padding(self):
        tensor = constant_op.constant(np.arange(9152), dtypes.int32)
        tensor = array_ops.expand_dims(tensor, 0)

        result = shape_ops.frame(tensor, 512, 180, pad_end=False)

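        # With pad_end=False there are 1 + (9152 - 512) // 180 = 49 frames.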
        expected = np.tile(np.arange(512), (49, 1))
        expected += np.tile(np.arange(49) * 180, (512, 1)).T

        expected = np.expand_dims(expected, axis=0)
        expected = np.array(expected, dtype=np.int32)
        self.assertAllEqual(expected, result)
Example #26
 def test_constant_folding(self):
   """frame should be constant foldable for constant inputs."""
   for pad_end in [True, False]:
     g = ops.Graph()
     with g.as_default():
       frame_length, frame_step = 32, 16
       signal_shape = (2, 128)
       signal = array_ops.ones(signal_shape)
       frames = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=pad_end)
       rewritten_graph = test_util.grappler_optimize(g, [frames])
       self.assertEqual(1, len(rewritten_graph.node))
Example #27
  def test_shape_inference(self):
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
    frame_length = 2
    frame_step = 1
    # Shape inference is able to detect the rank and inner-most dimension
    # if frame_length is known at graph definition time.
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99)
    self.assertEqual([1, 1, 2], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False)
    self.assertEqual([1, 0, 2], result.shape.as_list())

    # If frame_length is not known, rank and (known) outer and inner dimensions
    # are inferred.
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 2, 3, 4])
    frame_length = array_ops.placeholder(dtypes.int32, shape=[])
    frame_step = 1
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=1)
    self.assertEqual([1, 2, None, 3, 4], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=1)
    self.assertEqual([1, None, None, 3, 4], result.shape.as_list())

    # If frame_length and the inner-most dimension are known, the rank, inner
    # dimensions, and known outer dimensions are inferred.
    signal = array_ops.placeholder(dtypes.int32,
                                   shape=[None, 5, None, 20, 5, 3])
    frame_length = 4
    frame_step = 3
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=3)
    self.assertEqual([None, 5, None, 7, 4, 5, 3], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=3)
    self.assertEqual([None, 5, None, 6, 4, 5, 3], result.shape.as_list())

    # Test that shape inference is consistent with actual returned shapes for
    # small values of signal_length, frame_length, frame_step, and pad_end in
    # [True, False].
    frame_step = 1
    for signal_length in range(2):
      signal = [0] * signal_length
      for frame_length in range(2):
        for pad_end in [False, True]:
          op = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=pad_end, pad_value=99)
          with self.cached_session(use_gpu=True):
            result = self.evaluate(op)
          self.assertEqual(op.shape.as_list(), list(result.shape))
Example #28
  def test_shape_inference(self):
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
    frame_length = 2
    frame_step = 1
    # Shape inference is able to detect the rank and inner-most dimension
    # if frame_length is known at graph definition time.
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99)
    self.assertEqual([1, 1, 2], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False)
    self.assertEqual([1, 0, 2], result.shape.as_list())

    # If frame_length is not known, rank and (known) outer and inner dimensions
    # are inferred.
    signal = array_ops.placeholder(dtypes.int32, shape=[1, 2, 3, 4])
    frame_length = array_ops.placeholder(dtypes.int32, shape=[])
    frame_step = 1
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=1)
    self.assertEqual([1, 2, None, 3, 4], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=1)
    self.assertEqual([1, None, None, 3, 4], result.shape.as_list())

    # If frame_length and the inner-most dimension are known, the rank, inner
    # dimensions, and known outer dimensions are inferred.
    signal = array_ops.placeholder(dtypes.int32,
                                   shape=[None, 5, None, 20, 5, 3])
    frame_length = 4
    frame_step = 3
    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=True, pad_value=99, axis=3)
    self.assertEqual([None, 5, None, 7, 4, 5, 3], result.shape.as_list())

    result = shape_ops.frame(signal, frame_length, frame_step,
                             pad_end=False, axis=3)
    self.assertEqual([None, 5, None, 6, 4, 5, 3], result.shape.as_list())

    # Test that shape inference is consistent with actual returned shapes for
    # small values of signal_length, frame_length, frame_step, and pad_end in
    # [True, False].
    frame_step = 1
    for signal_length in range(2):
      signal = [0] * signal_length
      for frame_length in range(2):
        for pad_end in [False, True]:
          op = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=pad_end, pad_value=99)
          with self.cached_session(use_gpu=True):
            result = op.eval()
          self.assertEqual(op.shape.as_list(), list(result.shape))
Example #29
  def test_window_larger_than_signal(self):
    signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
    frame_length = 4
    frame_step = 1

    with self.session(use_gpu=True):
      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=True, pad_value=99).eval()
      self.assertAllClose([[[1, 2, 99, 99], [2, 99, 99, 99]],
                           [[11, 12, 99, 99], [12, 99, 99, 99]]], result)

      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False).eval()
      self.assertEqual((2, 0, 4), result.shape)

      frame_step = 2
      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=True, pad_value=99).eval()
      self.assertAllClose([[[1, 2, 99, 99]], [[11, 12, 99, 99]]], result)

      result = shape_ops.frame(signal, frame_length, frame_step,
                               pad_end=False).eval()
      self.assertEqual((2, 0, 4), result.shape)
Example #30
  def test_mapping_of_indices_without_padding(self):
    with self.session(use_gpu=True):
      tensor = constant_op.constant(np.arange(9152), dtypes.int32)
      tensor = array_ops.expand_dims(tensor, 0)

      result = shape_ops.frame(tensor, 512, 180, pad_end=False).eval()

      expected = np.tile(np.arange(512), (49, 1))
      expected += np.tile(np.arange(49) * 180, (512, 1)).T

      expected = np.expand_dims(expected, axis=0)
      expected = np.array(expected, dtype=np.int32)

      self.assertAllEqual(expected, result)
Example #31
    def test_mapping_of_indices_with_padding(self):
        tensor = constant_op.constant(np.arange(10000), dtypes.int32)
        tensor = array_ops.expand_dims(tensor, 0)

        result = shape_ops.frame(tensor, 512, 192, pad_end=True)

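        # With pad_end=True there are ceil(10000 / 192) = 53 frames; indices
        # past the signal end are filled with the default pad_value of 0,
        # which the expected array mimics below.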
        expected = np.tile(np.arange(512), (53, 1))
        expected += np.tile(np.arange(53) * 192, (512, 1)).T

        expected[expected >= 10000] = 0

        expected = np.expand_dims(expected, axis=0)
        expected = np.array(expected, dtype=np.int32)

        self.assertAllEqual(expected, result)
Example #33
  def test_dynamic_tensor(self):
    # Show that frame works even when the dimensions of its input are
    # not known at graph creation time.
    input_signal = np.vstack([np.arange(4), np.arange(4) + 10,
                              np.arange(4) + 20])
    frame_length = 2
    frame_step = 2

    with self.session(use_gpu=True) as sess:
      signal_placeholder = array_ops.placeholder(shape=(None, None),
                                                 dtype=dtypes.float32)
      result = sess.run(shape_ops.frame(
          signal_placeholder, frame_length, frame_step),
                        feed_dict={signal_placeholder: input_signal})
      self.assertAllEqual([[[0, 1], [2, 3]],
                           [[10, 11], [12, 13]],
                           [[20, 21], [22, 23]]], result)
Example #34
  def test_dynamic_tensor(self):
    if context.executing_eagerly():
      return
    # Show that frame works even when the dimensions of its input are
    # not known at graph creation time.
    input_signal = np.vstack([np.arange(4), np.arange(4) + 10,
                              np.arange(4) + 20])
    frame_length = 2
    frame_step = 2

    signal_placeholder = array_ops.placeholder_with_default(
        input_signal, shape=(None, None))
    result = self.evaluate(
        shape_ops.frame(signal_placeholder, frame_length, frame_step))
    self.assertAllEqual([[[0, 1], [2, 3]],
                         [[10, 11], [12, 13]],
                         [[20, 21], [22, 23]]], result)
Example #35
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=8000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0,
                                dtype=dtypes.float32,
                                name=None):
    """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel].

  Returns a weight matrix that can be used to re-weight a `Tensor` containing
  `num_spectrogram_bins` linearly sampled frequency information from
  `[0, sample_rate / 2]` into `num_mel_bins` frequency information from
  `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel].

  For example, the returned matrix `A` can be used to right-multiply a
  spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear
  scale spectrum values (e.g. STFT magnitudes) to generate a "mel spectrogram"
  `M` of shape `[frames, num_mel_bins]`.

      # `S` has shape [frames, num_spectrogram_bins]
      # `M` has shape [frames, num_mel_bins]
      M = tf.matmul(S, A)

  The matrix can be used with `tf.tensordot` to convert an arbitrary rank
  `Tensor` of linear-scale spectral bins into the mel scale.

      # S has shape [..., num_spectrogram_bins].
      # M has shape [..., num_mel_bins].
      M = tf.tensordot(S, A, 1)
      # tf.tensordot does not support shape inference for this case yet.
      M.set_shape(S.shape[:-1].concatenate(A.shape[-1:]))

  Args:
    num_mel_bins: Python int. How many bands in the resulting mel spectrum.
    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
      i.e. the spectrogram only contains the nonredundant FFT bins.
    sample_rate: Python float. Samples per second of the input signal used to
      create the spectrogram. We need this to figure out the actual frequencies
      for each spectrogram bin, which dictates how they are mapped into the mel
      scale.
    lower_edge_hertz: Python float. Lower bound on the frequencies to be
      included in the mel spectrum. This corresponds to the lower edge of the
      lowest triangular band.
    upper_edge_hertz: Python float. The desired top edge of the highest
      frequency band.
    dtype: The `DType` of the result matrix. Must be a floating point type.
    name: An optional name for the operation.

  Returns:
    A `Tensor` of shape `[num_spectrogram_bins, num_mel_bins]`.

  Raises:
    ValueError: If `num_mel_bins`/`num_spectrogram_bins`/`sample_rate` are not
      positive, `lower_edge_hertz` is negative, frequency edges are incorrectly
      ordered, `upper_edge_hertz` is larger than the Nyquist frequency, or
      `sample_rate` is neither a Python float nor a constant Tensor.

  [mel]: https://en.wikipedia.org/wiki/Mel_scale
  """
    with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
        # Convert Tensor `sample_rate` to float, if possible.
        if isinstance(sample_rate, ops.Tensor):
            maybe_const_val = tensor_util.constant_value(sample_rate)
            if maybe_const_val is not None:
                sample_rate = maybe_const_val
            else:
                raise ValueError(
                    '`sample_rate` was a non-constant Tensor. Must be a '
                    'Python float or a constant Tensor.')

        # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
        # and the validation is already done in linspace (both in shape function
        # and in kernel), there is no need to validate num_spectrogram_bins here.
        _validate_arguments(num_mel_bins, sample_rate, lower_edge_hertz,
                            upper_edge_hertz, dtype)

        # This function can be constant folded by graph optimization since there are
        # no Tensor inputs.
        sample_rate = ops.convert_to_tensor(sample_rate,
                                            dtype,
                                            name='sample_rate')
        lower_edge_hertz = ops.convert_to_tensor(lower_edge_hertz,
                                                 dtype,
                                                 name='lower_edge_hertz')
        upper_edge_hertz = ops.convert_to_tensor(upper_edge_hertz,
                                                 dtype,
                                                 name='upper_edge_hertz')
        zero = ops.convert_to_tensor(0.0, dtype)

        # HTK excludes the spectrogram DC bin.
        bands_to_zero = 1
        nyquist_hertz = sample_rate / 2.0
        linear_frequencies = math_ops.linspace(
            zero, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
        spectrogram_bins_mel = array_ops.expand_dims(
            _hertz_to_mel(linear_frequencies), 1)

        # Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
        # center of each band is the lower and upper edge of the adjacent bands.
        # Accordingly, we divide [lower_edge_hertz, upper_edge_hertz] into
        # num_mel_bins + 2 pieces.
        band_edges_mel = shape_ops.frame(math_ops.linspace(
            _hertz_to_mel(lower_edge_hertz), _hertz_to_mel(upper_edge_hertz),
            num_mel_bins + 2),
                                         frame_length=3,
                                         frame_step=1)

        # Split the triples up and reshape them into [1, num_mel_bins] tensors.
        lower_edge_mel, center_mel, upper_edge_mel = tuple(
            array_ops.reshape(t, [1, num_mel_bins])
            for t in array_ops.split(band_edges_mel, 3, axis=1))

        # Calculate lower and upper slopes for every spectrogram bin.
        # Line segments are linear in the mel domain, not Hertz.
        lower_slopes = (spectrogram_bins_mel -
                        lower_edge_mel) / (center_mel - lower_edge_mel)
        upper_slopes = (upper_edge_mel -
                        spectrogram_bins_mel) / (upper_edge_mel - center_mel)

        # Intersect the line segments with each other and zero.
        mel_weights_matrix = math_ops.maximum(
            zero, math_ops.minimum(lower_slopes, upper_slopes))

        # Re-add the zeroed lower bins we sliced out above.
        return array_ops.pad(mel_weights_matrix, [[bands_to_zero, 0], [0, 0]],
                             name=name)
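
As the docstring explains, the returned matrix is applied over the last
spectrogram axis with a matmul or tensordot. A short sketch using the public
`tf.signal` mirror (assuming TF 2.x; the bin count 257 follows from a 512-point
FFT):

import tensorflow as tf

stfts = tf.signal.stft(tf.random.normal([1, 16000]),
                       frame_length=400, frame_step=160)  # [1, 98, 257]
magnitudes = tf.abs(stfts)
mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=80, num_spectrogram_bins=257, sample_rate=16000,
    lower_edge_hertz=80.0, upper_edge_hertz=7600.0)
mel_spectrogram = tf.tensordot(magnitudes, mel_matrix, 1)  # [1, 98, 80]
# In graph mode, follow this with the docstring's `set_shape` call to restore
# the static shape that `tensordot` cannot infer.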
Example #36
 def fn(signal):
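     # `length_step` and `pad_end` are captured from the enclosing test scope.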
     return shape_ops.frame(signal,
                            length_step[0],
                            length_step[1],
                            pad_end=pad_end)
Example #37
def overlap_and_add(signal, frame_step, name=None):
    """Reconstructs a signal from a framed representation.

  Adds potentially overlapping frames of a signal with shape
  `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
  The resulting tensor has shape `[..., output_size]` where

      output_size = (frames - 1) * frame_step + frame_length

  Args:
    signal: A [..., frames, frame_length] `Tensor`. All dimensions may be
      unknown, and rank must be at least 2.
    frame_step: An integer or scalar `Tensor` denoting overlap offsets. Must be
      less than or equal to `frame_length`.
    name: An optional name for the operation.

  Returns:
    A `Tensor` with shape `[..., output_size]` containing the overlap-added
    frames of `signal`'s inner-most two dimensions.

  Raises:
    ValueError: If `signal`'s rank is less than 2, `frame_step` is not a scalar
      integer or `frame_step` is greater than `frame_length`.
  """
    with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
        signal = ops.convert_to_tensor(signal, name="signal")
        signal.shape.with_rank_at_least(2)
        frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
        frame_step.shape.assert_has_rank(0)
        if not frame_step.dtype.is_integer:
            raise ValueError("frame_step must be an integer. Got %s" %
                             frame_step.dtype)

        signal_shape = array_ops.shape(signal)

        # All dimensions that are not part of the overlap-and-add. Can be empty for
        # rank 2 inputs.
        outer_dimensions = signal_shape[:-2]

        # If frame_length and frame_step are known at graph construction time, check
        # frame_step is less than or equal to frame_length.
        frame_step_static = tensor_util.constant_value(frame_step)
        if (frame_step_static is not None and signal.shape.ndims is not None
                and signal.shape.dims[-1].value is not None):
            if frame_step_static > signal.shape.dims[-1].value:
                raise ValueError(
                    "frame_step (%d) must be less than or equal to "
                    "frame_length (%d)" %
                    (frame_step_static, signal.shape.dims[-1].value))
            # If frame_length is equal to frame_step, there's no overlap so just
            # reshape the tensor.
            if frame_step_static == signal.shape.dims[-1].value:
                return array_ops.reshape(
                    signal, array_ops.concat([outer_dimensions, [-1]], 0))

        signal_rank = array_ops.rank(signal)
        frames = signal_shape[-2]
        frame_length = signal_shape[-1]

        subframe_length = util_ops.gcd(frame_length, frame_step)
        subframe_step = frame_step // subframe_length
        subframes_per_frame = frame_length // subframe_length
        output_size = frame_step * (frames - 1) + frame_length
        output_subframes = output_size // subframe_length

        # To avoid overlap-adding sample-by-sample, we overlap-add at the "subframe"
        # level, where a subframe is gcd(frame_length, frame_step). Reshape signal
        # from [..., frames, frame_length] into [..., subframes, subframe_length].
        subframe_shape = array_ops.concat(
            [outer_dimensions, [-1, subframe_length]], 0)
        subframe_signal = array_ops.reshape(signal, subframe_shape)

        # Now we shuffle the last [subframes, subframe_length] dimensions to the
        # front.
        # TODO(rjryan): Add an axis argument to unsorted_segment_sum so we can
        # avoid this pair of transposes.
        subframe_signal = _shuffle_to_front(subframe_signal, 2)

        # Use unsorted_segment_sum to add overlapping subframes together.
        segment_ids = array_ops.reshape(
            shape_ops.frame(math_ops.range(output_subframes),
                            subframes_per_frame,
                            subframe_step,
                            pad_end=False), [-1])
        result = math_ops.unsorted_segment_sum(subframe_signal,
                                               segment_ids,
                                               num_segments=output_subframes)

        # result is a [subframes, subframe_length, ...outer_dimensions] tensor. We
        # return a [...outer_dimensions, output_size] tensor with a transpose and
        # reshape.
        result_shape = array_ops.concat([outer_dimensions, [output_size]], 0)
        return array_ops.reshape(_shuffle_to_front(result, signal_rank - 2),
                                 result_shape)
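
A small round-trip sketch of `frame` followed by `overlap_and_add` via the
public `tf.signal` mirror (assuming TF 2.x). With `frame_step == frame_length`
the frames do not overlap, so overlap-adding them reproduces the input exactly
and `output_size = (frames - 1) * frame_step + frame_length`:

import tensorflow as tf

x = tf.range(12, dtype=tf.float32)
frames = tf.signal.frame(x, frame_length=4, frame_step=4)  # shape [3, 4]
y = tf.signal.overlap_and_add(frames, frame_step=4)        # shape [12]
# output_size = (3 - 1) * 4 + 4 = 12 and y equals x elementwise.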
Example #38
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=8000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0,
                                dtype=dtypes.float32,
                                name=None):
  """Returns a matrix to warp linear scale spectrograms to the [mel scale][mel].

  Returns a weight matrix that can be used to re-weight a `Tensor` containing
  `num_spectrogram_bins` linearly sampled frequency information from
  `[0, sample_rate / 2]` into `num_mel_bins` frequency information from
  `[lower_edge_hertz, upper_edge_hertz]` on the [mel scale][mel].

  For example, the returned matrix `A` can be used to right-multiply a
  spectrogram `S` of shape `[frames, num_spectrogram_bins]` of linear
  scale spectrum values (e.g. STFT magnitudes) to generate a "mel spectrogram"
  `M` of shape `[frames, num_mel_bins]`.

      # `S` has shape [frames, num_spectrogram_bins]
      # `M` has shape [frames, num_mel_bins]
      M = tf.matmul(S, A)

  The matrix can be used with `tf.tensordot` to convert an arbitrary rank
  `Tensor` of linear-scale spectral bins into the mel scale.

      # S has shape [..., num_spectrogram_bins].
      # M has shape [..., num_mel_bins].
      M = tf.tensordot(S, A, 1)
      # tf.tensordot does not support shape inference for this case yet.
      M.set_shape(S.shape[:-1].concatenate(A.shape[-1:]))

  Args:
    num_mel_bins: Python int. How many bands in the resulting mel spectrum.
    num_spectrogram_bins: An integer `Tensor`. How many bins there are in the
      source spectrogram data, which is understood to be `fft_size // 2 + 1`,
      i.e. the spectrogram only contains the nonredundant FFT bins.
    sample_rate: Python float. Samples per second of the input signal used to
      create the spectrogram. We need this to figure out the actual frequencies
      for each spectrogram bin, which dictates how they are mapped into the mel
      scale.
    lower_edge_hertz: Python float. Lower bound on the frequencies to be
      included in the mel spectrum. This corresponds to the lower edge of the
      lowest triangular band.
    upper_edge_hertz: Python float. The desired top edge of the highest
      frequency band.
    dtype: The `DType` of the result matrix. Must be a floating point type.
    name: An optional name for the operation.

  Returns:
    A `Tensor` of shape `[num_spectrogram_bins, num_mel_bins]`.

  Raises:
    ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not
      positive, lower_edge_hertz is negative, frequency edges are incorrectly
      ordered, or upper_edge_hertz is larger than the Nyquist frequency.

  [mel]: https://en.wikipedia.org/wiki/Mel_scale
  """
  with ops.name_scope(name, 'linear_to_mel_weight_matrix') as name:
    # Note: As num_spectrogram_bins is passed to `math_ops.linspace`
    # and the validation is already done in linspace (both in shape function
    # and in kernel), there is no need to validate num_spectrogram_bins here.
    _validate_arguments(num_mel_bins, sample_rate,
                        lower_edge_hertz, upper_edge_hertz, dtype)

    # This function can be constant folded by graph optimization since there are
    # no Tensor inputs.
    sample_rate = ops.convert_to_tensor(
        sample_rate, dtype, name='sample_rate')
    lower_edge_hertz = ops.convert_to_tensor(
        lower_edge_hertz, dtype, name='lower_edge_hertz')
    upper_edge_hertz = ops.convert_to_tensor(
        upper_edge_hertz, dtype, name='upper_edge_hertz')
    zero = ops.convert_to_tensor(0.0, dtype)

    # HTK excludes the spectrogram DC bin.
    bands_to_zero = 1
    nyquist_hertz = sample_rate / 2.0
    linear_frequencies = math_ops.linspace(
        zero, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
    spectrogram_bins_mel = array_ops.expand_dims(
        _hertz_to_mel(linear_frequencies), 1)

    # Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
    # center of each band is the lower and upper edge of the adjacent bands.
    # Accordingly, we divide [lower_edge_hertz, upper_edge_hertz] into
    # num_mel_bins + 2 pieces.
    band_edges_mel = shape_ops.frame(
        math_ops.linspace(_hertz_to_mel(lower_edge_hertz),
                          _hertz_to_mel(upper_edge_hertz),
                          num_mel_bins + 2), frame_length=3, frame_step=1)

    # Split the triples up and reshape them into [1, num_mel_bins] tensors.
    lower_edge_mel, center_mel, upper_edge_mel = tuple(array_ops.reshape(
        t, [1, num_mel_bins]) for t in array_ops.split(
            band_edges_mel, 3, axis=1))

    # Calculate lower and upper slopes for every spectrogram bin.
    # Line segments are linear in the mel domain, not Hertz.
    lower_slopes = (spectrogram_bins_mel - lower_edge_mel) / (
        center_mel - lower_edge_mel)
    upper_slopes = (upper_edge_mel - spectrogram_bins_mel) / (
        upper_edge_mel - center_mel)

    # Intersect the line segments with each other and zero.
    mel_weights_matrix = math_ops.maximum(
        zero, math_ops.minimum(lower_slopes, upper_slopes))

    # Re-add the zeroed lower bins we sliced out above.
    return array_ops.pad(
        mel_weights_matrix, [[bands_to_zero, 0], [0, 0]], name=name)
Example #39
def mdct(signals,
         frame_length,
         window_fn=window_ops.vorbis_window,
         pad_end=False,
         norm=None,
         name=None):
    """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`.

  Implemented with TPU/GPU-compatible ops and supports gradients.

  Args:
    signals: A `[..., samples]` `float32`/`float64` `Tensor` of real-valued
      signals.
    frame_length: An integer scalar `Tensor`. The window length in samples
      which must be divisible by 4.
    window_fn: A callable that takes a window length and a `dtype` keyword
      argument and returns a `[window_length]` `Tensor` of samples in the
      provided datatype. If set to `None`, no windowing is used.
    pad_end: Whether to pad the end of `signals` with zeros when the provided
      frame length and step produce a frame that lies partially past its end.
    norm: If `None`, unnormalized dct4 is used; if `"ortho"`, orthonormal dct4
      is used.
    name: An optional name for the operation.

  Returns:
    A `[..., frames, frame_length // 2]` `Tensor` of `float32`/`float64`
    MDCT values where `frames` is roughly `samples // (frame_length // 2)`
    when `pad_end=False`.

  Raises:
    ValueError: If `signals` is not at least rank 1, `frame_length` is
      not scalar, or `frame_length` is not a multiple of `4`.

  [mdct]: https://en.wikipedia.org/wiki/Modified_discrete_cosine_transform
  """
    with ops.name_scope(name, 'mdct', [signals, frame_length]):
        signals = ops.convert_to_tensor(signals, name='signals')
        signals.shape.with_rank_at_least(1)
        frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
        frame_length.shape.assert_has_rank(0)
        # Assert that frame_length is divisible by 4.
        frame_length_static = tensor_util.constant_value(frame_length)
        if frame_length_static is not None:
            if frame_length_static % 4 != 0:
                raise ValueError('The frame length must be a multiple of 4.')
            frame_step = ops.convert_to_tensor(frame_length_static // 2,
                                               dtype=frame_length.dtype)
        else:
            frame_step = frame_length // 2

        framed_signals = shape_ops.frame(signals,
                                         frame_length,
                                         frame_step,
                                         pad_end=pad_end)

        # Optionally window the framed signals.
        if window_fn is not None:
            window = window_fn(frame_length, dtype=framed_signals.dtype)
            framed_signals *= window
        else:
            framed_signals *= 1.0 / np.sqrt(2)

        split_frames = array_ops.split(framed_signals, 4, axis=-1)
        frame_firsthalf = -array_ops.reverse(split_frames[2],
                                             [-1]) - split_frames[3]
        frame_secondhalf = split_frames[0] - array_ops.reverse(
            split_frames[1], [-1])
        frames_rearranged = array_ops.concat(
            (frame_firsthalf, frame_secondhalf), axis=-1)
        # Below call produces the (frame_length // 2) unique components of the
        # type 4 orthonormal DCT of the real windowed signals in frames_rearranged.
        return dct_ops.dct(frames_rearranged, type=4, norm=norm)
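
A brief usage sketch via the public `tf.signal.mdct` mirror (assuming TF >= 2.1,
where `mdct` and `inverse_mdct` are available). `frame_length` must be a
multiple of 4, and the hop is implicitly `frame_length // 2`:

import tensorflow as tf

waveform = tf.random.normal([16000])
coeffs = tf.signal.mdct(waveform, frame_length=512)  # [frames, 256]
# With the default vorbis window, inverse_mdct recovers the signal up to edge
# effects of the lapped transform.
recovered = tf.signal.inverse_mdct(coeffs)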