Example #1
    def _non_streaming(self, inputs):
        # depthwise 1D convolution in non-streaming mode;
        # it is used for training or non-streaming inference.
        # Zero pad inputs from the left to make conv1d causal.
        # [batch_size, time_steps, feature_dim]
        if self.pad:
            inputs_pad = tf.keras.backend.temporal_padding(
                inputs, padding=(self.memory_size - 1, 0))
        else:
            inputs_pad = inputs

        # expand dimensionality for depthwise_conv2d
        # to [memory_size, 1, feature_dim, 1]
        time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1),
                                         -1)

        # run convolution
        depthwise_conv1d = tf.nn.depthwise_conv2d(
            tf.expand_dims(inputs_pad, -2),
            time_kernel_exp,
            strides=[1, 1, 1, 1],
            padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

        # [batch_size, time_steps, feature_dim]
        depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

        # [batch_size, time_steps, feature_dim]
        if self.use_bias:
            depthwise_conv1d = depthwise_conv1d + self.bias

        return depthwise_conv1d
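
The shape handling above is the core of the example: a causal depthwise 1D convolution is built from tf.nn.depthwise_conv2d by treating the feature dimension as channels. Below is a minimal standalone sketch (not part of the repo) that reproduces the same flow with illustrative values for memory_size and feature_dim, so the intermediate shapes can be checked directly.

import tensorflow as tf

# Illustrative values; in the layer above these come from self.memory_size
# and the feature dimension of the inputs.
memory_size, feature_dim = 4, 8
inputs = tf.random.normal([2, 16, feature_dim])       # [batch_size, time_steps, feature_dim]
time_kernel = tf.random.normal([memory_size, feature_dim])

# causal left padding, as in _non_streaming above
inputs_pad = tf.keras.backend.temporal_padding(
    inputs, padding=(memory_size - 1, 0))             # [2, 19, 8]

# [memory_size, 1, feature_dim, 1]
kernel = tf.expand_dims(tf.expand_dims(time_kernel, 1), -1)

out = tf.nn.depthwise_conv2d(
    tf.expand_dims(inputs_pad, -2),                   # [2, 19, 1, 8]
    kernel,
    strides=[1, 1, 1, 1],
    padding='VALID')                                  # [2, 16, 1, 8]
out = tf.squeeze(out, [2])
print(out.shape)                                      # (2, 16, 8): time length preserved
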
Example #2

    def _non_streaming(self, inputs):
        # depthwise 1D convolution in non-streaming mode;
        # it is used for training or non-streaming inference.

        # pad input data
        inputs_pad = temporal_padding.TemporalPadding(
            padding=self.pad, padding_size=self.memory_size - 1)(inputs)

        # expand dimensionality for depthwise_conv2d
        # to [memory_size, 1, feature_dim, 1]
        time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1),
                                         -1)

        # run convolution
        depthwise_conv1d = tf.nn.depthwise_conv2d(
            tf.expand_dims(inputs_pad, -2),
            time_kernel_exp,
            strides=[1, 1, 1, 1],
            padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

        # [batch_size, time_steps, feature_dim]
        depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

        # [batch_size, time_steps, feature_dim]
        if self.use_bias:
            depthwise_conv1d = depthwise_conv1d + self.bias

        return depthwise_conv1d
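
This variant delegates padding to a TemporalPadding layer from the same codebase instead of calling tf.keras.backend.temporal_padding directly. Assuming self.pad selects causal padding, its effect on the tensor in non-streaming mode should be equivalent to a plain left pad of the time axis, sketched below with illustrative shapes.

import tensorflow as tf

# Hedged sketch: left-pad the time axis by memory_size - 1 samples, which is
# what a causal TemporalPadding is expected to amount to here.
memory_size = 4
inputs = tf.random.normal([2, 16, 8])                 # [batch_size, time_steps, feature_dim]
inputs_pad = tf.pad(inputs, [[0, 0], [memory_size - 1, 0], [0, 0]])
print(inputs_pad.shape)                               # (2, 19, 8)
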
Example #3
def random_stretch_squeeze(inputs,
                           resample_offset,
                           seed=None):
  """Stretches and squeezes audio data in time dim.

  It can be useful for augmenting training data
  with random stretches/squeezes in the time dim,
  making the model more robust to variations in input audio
  sampling frequency and to the rate of human speech.

  Args:
    inputs: input tensor [batch_size, time]
    resample_offset: defines stretch squeeze range:
      1-resample_offset...1+resample_offset
    seed: random seed
  Returns:
    resampled tensor [batch_size, time]
  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  image = tf.expand_dims(inputs, 2)  # feature
  image = tf.expand_dims(image, 3)  # channels

  resample = 1.0  # when it is equal to 1 - no stretching or squeezing
  time_stretch_squeeze = tf.random.uniform(
      shape=[batch_size],
      minval=resample - resample_offset,
      maxval=resample + resample_offset,
      dtype=tf.float32,
      seed=seed)
  shape = tf.shape(inputs)
  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    image_resized = tf.image.resize(
        images=image[i],
        size=(tf.cast((tf.cast(shape[1], tf.float32) * time_stretch_squeeze[i]),
                      tf.int32), 1),
        preserve_aspect_ratio=False)
    image_resized_cropped = tf.image.resize_with_crop_or_pad(
        image_resized,
        target_height=sequence_length,
        target_width=1,
    )

    outputs = outputs.write(i, image_resized_cropped)

  outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
  outputs.set_shape(inputs_shape)
  return outputs
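
A short usage sketch for random_stretch_squeeze; the batch size, clip length, and resample_offset below are illustrative values, not values from the repo.

import tensorflow as tf

audio = tf.random.normal([4, 16000])                  # [batch_size, time], e.g. 1 s at 16 kHz
augmented = random_stretch_squeeze(audio, resample_offset=0.1, seed=1)
print(augmented.shape)                                # (4, 16000): crop/pad restores the length
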
Example #4
def random_shift(inputs, time_shift, seed=None):
    """Shifts input data randomly in time dim.

    It can be useful for augmenting training data with random shifts in the
    time dim, making the model more robust to shifts in the input audio.

    Args:
      inputs: input tensor [batch_size, time]
      time_shift: defines time shift range: -time_shift...time_shift;
        it is defined in samples
      seed: random seed
    Returns:
      shifted tensor [batch_size, time]
    Raises:
      ValueError: if inputs.shape.rank != 2
    """
    if inputs.shape.rank != 2:
        raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

    inputs_shape = inputs.shape.as_list()
    batch_size = inputs_shape[0]
    sequence_length = inputs_shape[1]

    # the ops below expect 3D tensors, so convert inputs to [batch, time, dummy]
    inputs = tf.expand_dims(inputs, 2)

    time_shift_amounts = tf.random.uniform(shape=[batch_size],
                                           minval=-time_shift,
                                           maxval=time_shift,
                                           dtype=tf.int32,
                                           seed=seed)

    outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
    for i in tf.range(batch_size):
        time_shift_amount = time_shift_amounts[i]

        # pylint: disable=cell-var-from-loop
        time_shift_padding = tf.cond(time_shift_amount > 0,
                                     lambda: [[time_shift_amount, 0], [0, 0]],
                                     lambda: [[0, -time_shift_amount], [0, 0]])
        time_shift_offset = tf.cond(time_shift_amount > 0, lambda: [0, 0],
                                    lambda: [-time_shift_amount, 0])
        # pylint: enable=cell-var-from-loop

        padded = tf.pad(tensor=inputs[i],
                        paddings=time_shift_padding,
                        mode='CONSTANT')
        padded_sliced = tf.slice(padded, time_shift_offset,
                                 [sequence_length, -1])

        outputs = outputs.write(i, padded_sliced)

    # convert it back to [batch, time]
    outputs = tf.squeeze(outputs.stack(), axis=[2])
    outputs.set_shape(inputs_shape)
    return outputs
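
A short usage sketch for random_shift; the shift range of 100 samples is an illustrative value.

import tensorflow as tf

audio = tf.random.normal([4, 16000])                  # [batch_size, time]
shifted = random_shift(audio, time_shift=100, seed=1)
print(shifted.shape)                                  # (4, 16000)
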
Example #5

    def call(self, inputs):
        # inputs [batch_size, time1, feature1, feature2]
        time_kernel_exp = tf.expand_dims(self.filters, -1)
        # it can be replaced by AveragePooling2D with temporal padding
        # and optimized for streaming mode
        # output will be [batch_size, time1, feature1, feature2]
        return tf.nn.depthwise_conv2d(
            inputs,
            time_kernel_exp,
            strides=self.strides,
            padding=self.padding.upper(),
            dilations=self.dilation_rate,
            name=self.name + '_averPool2D')
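
The comment in call notes that this depthwise convolution can stand in for AveragePooling2D. Below is a minimal check of that claim, assuming the kernel is a constant filled with 1 / (kt * kf); the repo may build self.filters differently, so the values here are purely illustrative.

import numpy as np
import tensorflow as tf

kt, kf, channels = 3, 3, 2
x = tf.random.normal([1, 8, 8, channels])

# constant averaging kernel: [kt, kf, channels, 1]
kernel = tf.fill([kt, kf, channels, 1], 1.0 / (kt * kf))

conv_out = tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1], padding='VALID')
pool_out = tf.keras.layers.AveragePooling2D(
    pool_size=(kt, kf), strides=1, padding='valid')(x)
print(np.allclose(conv_out.numpy(), pool_out.numpy(), atol=1e-5))  # True
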
Example #6
def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
  """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

  It is based on addons/tensorflow_addons/image/cutout_ops.py
  and is kept here for backward compatibility.

  Args:
    inputs: input tensor [batch_size, time, feature, channels]
    mask_size: mask size (time, feature)
    mask_value: mask will be filled with this value
    seed: random seed
    data_format: dimensions order ('channels_last' or 'channels_first')
  Returns:
    masked input tensor [batch_size, time, feature, channels]
  Raises:
    ValueError: if inputs.shape.rank != 4
  """

  if inputs.shape.rank != 4:
    raise ValueError('inputs.shape.rank:%d must be 4' % inputs.shape.rank)

  mask_size = tf.convert_to_tensor(mask_size)
  if tf.rank(mask_size) == 0:
    mask_size = tf.stack([mask_size, mask_size])

  if data_format == 'channels_last':
    time_size, feature_size = tf.shape(inputs)[1], tf.shape(inputs)[2]
  else:
    time_size, feature_size = tf.shape(inputs)[2], tf.shape(inputs)[3]

  batch_size = tf.shape(inputs)[0]

  cutout_center_time = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=time_size, dtype=tf.int32, seed=seed
  )
  cutout_center_feature = tf.random.uniform(
      shape=[batch_size],
      minval=0,
      maxval=feature_size,
      dtype=tf.int32,
      seed=seed)
  offset = tf.transpose([cutout_center_time, cutout_center_feature], [1, 0])
  origin_shape = inputs.shape
  offset = tf.convert_to_tensor(offset)
  mask_size = mask_size // 2
  cutout_center_time = offset[:, 0]
  cutout_center_feature = offset[:, 1]

  lower_pads = tf.maximum(0, cutout_center_time - mask_size[0])
  upper_pads = tf.maximum(0, time_size - cutout_center_time - mask_size[0])
  left_pads = tf.maximum(0, cutout_center_feature - mask_size[1])
  right_pads = tf.maximum(0,
                          feature_size - cutout_center_feature - mask_size[1])

  cutout_shape = tf.transpose(
      [
          time_size - (lower_pads + upper_pads),
          feature_size - (left_pads + right_pads),
      ],
      [1, 0],
  )
  masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(tf.shape(cutout_shape)[0]):
    padding_dims = [
        [lower_pads[i], upper_pads[i]],
        [left_pads[i], right_pads[i]],
    ]
    mask = tf.pad(
        tf.zeros(cutout_shape[i], dtype=inputs.dtype),
        padding_dims,
        constant_values=1,
    )
    masks = masks.write(i, mask)

  if data_format == 'channels_last':
    mask = tf.expand_dims(masks.stack(), -1)
    mask = tf.tile(mask, [1, 1, 1, tf.shape(inputs)[-1]])
  else:
    mask = tf.expand_dims(masks.stack(), 1)
    mask = tf.tile(mask, [1, tf.shape(inputs)[1], 1, 1])

  inputs = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(inputs, dtype=inputs.dtype) * mask_value,
      inputs,
  )
  inputs.set_shape(origin_shape)
  return inputs
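
A short usage sketch for random_cutout on a dummy spectrogram batch; the input shape and mask_size below are illustrative values.

import tensorflow as tf

spec = tf.random.normal([4, 98, 40, 1])               # [batch_size, time, feature, channels]
masked = random_cutout(spec, mask_size=(10, 5), mask_value=0.0, seed=1)
print(masked.shape)                                   # (4, 98, 40, 1)
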