Example #1
import tensorflow as tf


def random_stretch_squeeze(inputs,
                           resample_offset,
                           seed=None):
  """Stretches and squeezes audio data in time dim.

  It can be useful for augmenting training data
  with random stretches/squeezes in the time dimension,
  which makes the model more robust to variations in the input audio
  sampling frequency and in human speech frequency.

  Args:
    inputs: input tensor [batch_size, time]
    resample_offset: defines the stretch/squeeze range:
      1 - resample_offset ... 1 + resample_offset
    seed: random seed
  Returns:
    resampled audio tensor [batch_size, time]
  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  image = tf.expand_dims(inputs, 2)  # feature
  image = tf.expand_dims(image, 3)  # channels

  resample = 1.0  # a value of 1.0 means no stretching or squeezing
  time_stretch_squeeze = tf.random.uniform(
      shape=[batch_size],
      minval=resample - resample_offset,
      maxval=resample + resample_offset,
      dtype=tf.float32,
      seed=seed)
  shape = tf.shape(inputs)
  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    image_resized = tf.image.resize(
        images=image[i],
        size=(tf.cast((tf.cast(shape[1], tf.float32) * time_stretch_squeeze[i]),
                      tf.int32), 1),
        preserve_aspect_ratio=False)
    image_resized_cropped = tf.image.resize_with_crop_or_pad(
        image_resized,
        target_height=sequence_length,
        target_width=1,
    )

    outputs = outputs.write(i, image_resized_cropped)

  outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
  outputs.set_shape(inputs_shape)
  return outputs
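
# A minimal usage sketch of random_stretch_squeeze (assumptions: TensorFlow 2.x
# eager execution; the batch size, clip length, and resample_offset below are
# hypothetical, chosen only for illustration).
audio = tf.random.normal([2, 16000])  # 2 clips of 1 s at a 16 kHz sample rate

# Each clip is resampled in time by a random factor drawn from [0.9, 1.1],
# then center-cropped or zero-padded back to the original length.
augmented = random_stretch_squeeze(audio, resample_offset=0.1, seed=42)
assert augmented.shape == audio.shape  # output keeps [batch_size, time]
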
  def _non_streaming(self, inputs):
    # Note: if a non-rectangular window_fn is used, the first and last
    # reconstructed frames will be numerically different from the
    # original audio frames.
    output = tf.signal.inverse_stft(inputs,
                                    self.frame_size,
                                    self.frame_step,
                                    self.fft_size,
                                    window_fn=self.window_fn)
    return tf.cast(output, tf.float32)
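
# For reference, a minimal non-streaming round trip through the tf.signal API
# that _non_streaming wraps. The frame parameters below are hypothetical
# stand-ins for self.frame_size / self.frame_step / self.fft_size; per the
# comment above, a non-rectangular analysis window needs the matching synthesis
# window from tf.signal.inverse_stft_window_fn for good reconstruction.
frame_size, frame_step, fft_size = 400, 160, 512
signal = tf.random.normal([1, 16000])
stft_frames = tf.signal.stft(signal, frame_size, frame_step, fft_size,
                             window_fn=tf.signal.hann_window)
reconstructed = tf.signal.inverse_stft(
    stft_frames, frame_size, frame_step, fft_size,
    window_fn=tf.signal.inverse_stft_window_fn(frame_step))
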
  def _streaming_external_state(self, inputs, state):
    state = [] if state is None else state

    # Compute the inverse STFT of any number of input frames.
    inversed_frame = tf.signal.inverse_stft(inputs,
                                            self.frame_size,
                                            self.frame_step,
                                            self.fft_size,
                                            window_fn=self.window_fn)
    inversed_frame = tf.cast(inversed_frame, tf.float32)

    # If there is no overlap between frames, then there is no need
    # for streaming state processing.
    if self.frame_size - self.frame_step <= 0:
      return inversed_frame, state

    if self.use_one_step:  # streaming with input frame by frame
      # Update the frame state (overlap-add with the previous tail).
      new_frame_state = state + inversed_frame[:, 0:self.frame_size]

      # Get the output hop before frame shifting.
      inversed_frames = new_frame_state[:, 0:self.frame_step]

      # Shift frame samples by frame_step to the left: ring buffer.
      new_frame_state = tf.concat(
          [new_frame_state,
           tf.zeros([1, self.frame_step])], axis=1)
      new_frame_state = new_frame_state[:, -self.frame_size:]
    else:  # streaming with several input frames
      previous_state = state + inversed_frame[:, 0:self.frame_size]

      new_frame_state = tf.concat(
          [previous_state, inversed_frame[:, self.frame_size:]], axis=1)

      # Get the output hops before frame shifting.
      inversed_frames = new_frame_state[:, 0:self.frame_step *
                                        self.input_frames]

      # Shift frame samples by frame_step to the left: ring buffer.
      new_frame_state = tf.concat(
          [new_frame_state,
           tf.zeros([1, self.frame_step])], axis=1)
      new_frame_state = new_frame_state[:, -self.frame_size:]

    return inversed_frames, new_frame_state
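
# A hedged driver sketch of the external-state protocol implemented above: the
# caller owns the state tensor, threads it through successive calls, and
# collects one hop of reconstructed audio per step. `layer` is a hypothetical
# instance exposing frame_size, frame_step, use_one_step=True and the
# _streaming_external_state method shown above.
def stream_inverse_stft(layer, stft_frames):
  """stft_frames: complex tensor of shape [num_frames, fft_size // 2 + 1]."""
  state = tf.zeros([1, layer.frame_size], tf.float32)  # ring buffer, all zeros
  hops = []
  for i in range(stft_frames.shape[0]):
    frame = stft_frames[i][tf.newaxis, tf.newaxis, :]  # one frame per call
    hop, state = layer._streaming_external_state(frame, state)
    hops.append(hop)  # each hop has shape [1, frame_step]
  return tf.concat(hops, axis=1)  # reconstructed audio [1, num_frames * frame_step]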