Example #1
    def _mfcc_op(self, inputs):
        # MFCC implementation based on TF custom op (supported by TFLite)
        # It reduces model size in comparison to _mfcc_tf
        if (self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
                or self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE):
            outputs = self.data_frame(inputs)
            # in streaming mode there is only one frame for FFT calculation
            # dims will be [batch=1, time=1, frame],
            # but audio_spectrogram requires 2D input data, so we remove the time dim
            outputs = tf.squeeze(outputs, axis=1)
        else:
            outputs = inputs

        # outputs has dims [batch, time]
        # but audio_spectrogram expects [time, channels/batch] so transpose it
        outputs = tf.transpose(outputs, [1, 0])

        # outputs: [time, channels/batch]
        outputs = audio_ops.audio_spectrogram(
            outputs,
            window_size=self.frame_size,
            stride=self.frame_step,
            magnitude_squared=self.params['fft_magnitude_squared'])
        # outputs: [channels/batch, frames, fft_feature]

        outputs = audio_ops.mfcc(
            outputs,
            self.params['sample_rate'],
            upper_frequency_limit=self.params['mel_upper_edge_hertz'],
            lower_frequency_limit=self.params['mel_lower_edge_hertz'],
            filterbank_channel_count=self.params['mel_num_bins'],
            dct_coefficient_count=self.params['dct_num_features'])
        # outputs: [channels/batch, frames, dct_coefficient_count]
        outputs = self.spec_augment(outputs)
        return outputs
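This method lives inside a larger feature-extraction layer, so it does not run on its own. Below is a minimal standalone sketch of the same op-based path; audio_ops is assumed to be TensorFlow's gen_audio_ops module, and the concrete numbers (16 kHz audio, 25 ms window, 10 ms step, 40 mel bins, 20 DCT features) are illustrative stand-ins for self.params.

import tensorflow as tf
from tensorflow.python.ops import gen_audio_ops as audio_ops

# one second of fake 16 kHz audio; audio_spectrogram expects [time, channels]
waveform = tf.random.uniform([16000, 1], minval=-1.0, maxval=1.0)

# spectrogram: [channels, frames, fft_feature]
spectrogram = audio_ops.audio_spectrogram(
    waveform, window_size=400, stride=160, magnitude_squared=True)

# mfcc: [channels, frames, dct_coefficient_count]
mfcc = audio_ops.mfcc(
    spectrogram,
    sample_rate=16000,
    upper_frequency_limit=7600.0,
    lower_frequency_limit=60.0,
    filterbank_channel_count=40,
    dct_coefficient_count=20)
print(mfcc.shape)  # (1, 98, 20)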
Example #2
import tensorflow as tf


def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
  """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

  It is based on addons/tensorflow_addons/image/cutout_ops.py and is kept
  here for backward compatibility.

  Args:
    inputs: input tensor [batch_size, time, feature, channels]
    mask_size: mask size (time, feature)
    mask_value: mask will be filled with this value
    seed: random seed
    data_format: order of dimensions, 'channels_last' or 'channels_first'
  Returns:
    masked image
  Raises:
    ValueError: if inputs.shape.rank != 4
  """

  if inputs.shape.rank != 4:
    raise ValueError('inputs.shape.rank:%d must be 4' % inputs.shape.rank)

  mask_size = tf.convert_to_tensor(mask_size)
  if tf.rank(mask_size) == 0:
    mask_size = tf.stack([mask_size, mask_size])

  if data_format == 'channels_last':
    time_size, feature_size = tf.shape(inputs)[1], tf.shape(inputs)[2]
  else:
    time_size, feature_size = tf.shape(inputs)[2], tf.shape(inputs)[3]

  batch_size = tf.shape(inputs)[0]

  # sample a random cutout center per batch element
  cutout_center_time = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=time_size, dtype=tf.int32, seed=seed
  )
  cutout_center_feature = tf.random.uniform(
      shape=[batch_size],
      minval=0,
      maxval=feature_size,
      dtype=tf.int32,
      seed=seed)
  origin_shape = inputs.shape
  # keep half the mask size: the cutout extends this far on each side of center
  mask_size = mask_size // 2

  # padding on each side of the cutout; clipping at 0 truncates masks whose
  # center falls near a border
  lower_pads = tf.maximum(0, cutout_center_time - mask_size[0])
  upper_pads = tf.maximum(0, time_size - cutout_center_time - mask_size[0])
  left_pads = tf.maximum(0, cutout_center_feature - mask_size[1])
  right_pads = tf.maximum(0,
                          feature_size - cutout_center_feature - mask_size[1])

  # per-example extent of the cutout region after clipping to the input bounds
  cutout_shape = tf.transpose(
      [
          time_size - (lower_pads + upper_pads),
          feature_size - (left_pads + right_pads),
      ],
      [1, 0],
  )
  # build one binary mask per batch element: zeros inside the cutout region,
  # ones elsewhere
  masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(tf.shape(cutout_shape)[0]):
    padding_dims = [
        [lower_pads[i], upper_pads[i]],
        [left_pads[i], right_pads[i]],
    ]
    mask = tf.pad(
        tf.zeros(cutout_shape[i], dtype=inputs.dtype),
        padding_dims,
        constant_values=1,
    )
    masks = masks.write(i, mask)

  # broadcast the 2D masks across the channel dimension
  if data_format == 'channels_last':
    mask = tf.expand_dims(masks.stack(), -1)
    mask = tf.tile(mask, [1, 1, 1, tf.shape(inputs)[-1]])
  else:
    mask = tf.expand_dims(masks.stack(), 1)
    mask = tf.tile(mask, [1, tf.shape(inputs)[1], 1, 1])

  # fill the masked positions with mask_value
  inputs = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(inputs, dtype=inputs.dtype) * mask_value,
      inputs,
  )
  inputs.set_shape(origin_shape)
  return inputs
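A quick usage sketch of random_cutout, assuming a batch of four single-channel spectrograms in channels_last layout; the tensor shape and the 10x10 mask size are arbitrary illustrative values.

import tensorflow as tf

specs = tf.random.uniform([4, 100, 40, 1])  # [batch, time, feature, channels]
masked = random_cutout(specs, mask_size=10, mask_value=0.0, seed=7)
print(masked.shape)  # (4, 100, 40, 1)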