def _non_streaming(self, inputs):
    """Runs the depthwise 1D time convolution in non-streaming mode.

    Used for training and for non-streaming inference: the whole time
    dimension is padded and convolved in one shot.
    """
    # Pad the time dimension so that the VALID convolution below sees the
    # required left/right context.
    padded = temporal_padding.TemporalPadding(
        padding=self.pad, padding_size=self.memory_size - 1)(inputs)

    # Expand the time kernel to [memory_size, 1, feature_dim, 1] as
    # required by depthwise_conv2d.
    kernel_4d = tf.expand_dims(tf.expand_dims(self.time_kernel, 1), -1)

    # Convolve; result is [batch_size, time_steps, 1, feature_dim].
    outputs = tf.nn.depthwise_conv2d(
        tf.expand_dims(padded, -2),
        kernel_4d,
        strides=[1, 1, 1, 1],
        padding='VALID')

    # Drop the dummy spatial axis: [batch_size, time_steps, feature_dim].
    outputs = tf.squeeze(outputs, [2])

    if self.use_bias:
        outputs = outputs + self.bias

    return outputs
Exemple #2
0
def conv_model_no_stream_wrapper(flags, conv_cell, cnn_filters, cnn_kernel_size,
                                 cnn_act, cnn_dilation_rate, cnn_strides,
                                 cnn_use_bias):
  """Toy example of convolutional model.

  It has the same model topology as in conv_model() above, but the conv cell
  is not wrapped by a Stream layer, so all streaming parameters are set
  manually.
  Args:
      flags: model and data settings
      conv_cell: cell for streaming, for example: tf.keras.layers.Conv1D
      cnn_filters: list of filters in conv layer
      cnn_kernel_size: list of kernel_size in conv layer
      cnn_act: list of activation functions in conv layer
      cnn_dilation_rate: list of dilation_rate in conv layer
      cnn_strides: list of strides in conv layer
      cnn_use_bias: list of use_bias in conv layer
  Returns:
    Keras model

  Raises:
    ValueError: if the per-layer parameter lists have different lengths.
  """
  layer_params = [cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate,
                  cnn_strides, cnn_use_bias]
  if len({len(p) for p in layer_params}) > 1:
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  # add a feature dimension: [batch, samples] -> [batch, samples, 1]
  net = tf.keras.backend.expand_dims(input_audio)

  for (filters, kernel_size, activation, dilation_rate, strides,
       use_bias) in zip(*layer_params):
    buffer_size = dilation_rate * (kernel_size - 1)

    # ring buffer in streaming mode, identity during training
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=buffer_size,
        use_one_step=False,
        pad_time_dim=None)(net)

    # padding is applied explicitly, decoupled from the conv layer
    net = temporal_padding.TemporalPadding(
        padding='causal', padding_size=buffer_size)(net)

    net = conv_cell(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides,
        use_bias=use_bias,
        padding='valid')(net)  # padding has to be valid!

  return tf.keras.Model(input_audio, net)
Exemple #3
0
    def test_padding_and_cropping(self, padding, padding_size):
        """TemporalPadding matches the Keras padding/cropping references.

        A non-negative padding_size must behave like
        tf.keras.backend.temporal_padding; a negative one like Cropping1D.
        """
        batch_size, time_dim, feature_dim = 1, 10, 3
        inputs = tf.keras.layers.Input(shape=(time_dim, feature_dim),
                                       batch_size=batch_size)
        outputs = temporal_padding.TemporalPadding(
            padding=padding, padding_size=padding_size)(inputs)
        model = tf.keras.Model(inputs, outputs)

        np.random.seed(1)
        input_signal = np.random.rand(batch_size, time_dim, feature_dim)
        output_signal = model.predict(input_signal)

        if padding_size >= 0:
            # reference: pure padding on the left/right of the time dim
            pad_left_right = {
                'causal': (padding_size, 0),
                'same': (padding_size // 2, padding_size - padding_size // 2),
                'future': (0, padding_size),
            }[padding]
            output_reference = tf.keras.backend.temporal_padding(
                input_signal, padding=pad_left_right)
        else:
            # reference: cropping by the (positive) amount |padding_size|
            crop = -padding_size
            crop_left_right = {
                'causal': (crop, 0),
                'same': (crop // 2, crop - crop // 2),
                'future': (0, crop),
            }[padding]
            output_reference = tf.keras.layers.Cropping1D(crop_left_right)(
                input_signal)

        self.assertAllClose(output_signal, output_reference)
        self.assertAllEqual(output_signal.shape,
                            [batch_size, time_dim + padding_size, feature_dim])
Exemple #4
0
 def test_no_padding_or_cropping_in_streaming(self, padding, padding_size):
     """In streaming mode the layer must not change the time dimension."""
     batch_size = 1
     feature_dim = 3
     inputs = tf.keras.layers.Input(shape=(1, feature_dim),
                                    batch_size=batch_size)
     outputs = temporal_padding.TemporalPadding(
         padding=padding,
         padding_size=padding_size,
         mode=modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)(inputs)
     # output shape stays [batch, 1, features] regardless of padding_size
     self.assertAllEqual(tf.keras.backend.int_shape(outputs),
                         [batch_size, 1, feature_dim])
Exemple #5
0
    def __init__(self,
                 mode=modes.Modes.TRAINING,
                 inference_batch_size=1,
                 frame_size=400,
                 frame_step=160,
                 use_one_step=True,
                 padding=None,
                 **kwargs):
        """Initializes the framing layer.

        Args:
            mode: one of modes.Modes; selects training vs streaming inference
                and controls how the internal/external state is created.
            inference_batch_size: batch size used for the streaming state.
            frame_size: frame length in samples (default 400).
            frame_step: hop between frames in samples (default 160).
            use_one_step: when True, frame_step must be <= frame_size and the
                ring buffer holds frame_size samples; when False it holds
                frame_size - 1 samples.
            padding: None or 'causal'; any other value is rejected.
            **kwargs: forwarded to the base Keras layer.

        Raises:
            ValueError: if use_one_step and frame_step > frame_size, or if
                padding is neither falsy nor 'causal'.
        """
        super(DataFrame, self).__init__(**kwargs)

        if use_one_step and frame_step > frame_size:
            raise ValueError('frame_step:%d must be <= frame_size:%d' %
                             (frame_step, frame_size))

        if padding and padding != 'causal':
            raise ValueError('only causal padding is supported')

        self.mode = mode
        self.inference_batch_size = inference_batch_size
        self.frame_size = frame_size
        self.frame_step = frame_step
        self.use_one_step = use_one_step
        self.padding = padding

        # Number of samples kept between streaming calls.
        if self.use_one_step:
            self.ring_buffer_size_in_time_dim = frame_size
        else:
            self.ring_buffer_size_in_time_dim = frame_size - 1

        # Optional causal padding layer; identity Lambda when padding is off.
        if self.padding:
            self.padding_layer = temporal_padding.TemporalPadding(
                padding_size=self.ring_buffer_size_in_time_dim,
                padding=self.padding)
        else:
            self.padding_layer = tf.keras.layers.Lambda(lambda x: x)

        if self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE:
            # create state variable for inference streaming with internal state
            self.states = self.add_weight(name='frame_states',
                                          shape=[
                                              self.inference_batch_size,
                                              self.ring_buffer_size_in_time_dim
                                          ],
                                          trainable=False,
                                          initializer=tf.zeros_initializer)
        elif self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE:
            # in streaming mode with external state,
            # state becomes an input output placeholders
            self.input_state = tf.keras.layers.Input(
                shape=(self.ring_buffer_size_in_time_dim, ),
                batch_size=self.inference_batch_size,
                name=self.name + 'input_state')
            # output_state is filled in later (presumably in call()) — it is
            # only declared here; confirm against the rest of the class.
            self.output_state = None
    def test_padding(self, padding_size):
        """Causal TemporalPadding must pad only on the left of the time dim."""
        batch_size = 1
        time_dim = 3
        feature_dim = 3
        inputs = tf.keras.layers.Input(shape=(time_dim, feature_dim),
                                       batch_size=batch_size)
        outputs = temporal_padding.TemporalPadding(
            padding='causal', padding_size=padding_size)(inputs)
        model = tf.keras.Model(inputs, outputs)

        np.random.seed(1)
        input_signal = np.random.rand(batch_size, time_dim, feature_dim)
        output_signal = model.predict(input_signal)

        # reference: left-only padding of padding_size samples
        output_reference = tf.keras.backend.temporal_padding(
            input_signal, padding=(padding_size, 0))
        self.assertAllClose(output_signal, output_reference)
        self.assertAllEqual(output_signal.shape,
                            [batch_size, time_dim + padding_size, feature_dim])
    def setUp(self):
        """Builds a non-streaming tf.signal.stft reference model.

        Its predictions on a random input are stored in self.stft_out for the
        tests to compare against.
        """
        super(STFTTest, self).setUp()
        test_utils.set_seed(123)

        self.frame_size = 40
        self.frame_step = 10
        # layer definition
        stft_layer = stft.STFT(self.frame_size,
                               self.frame_step,
                               mode=modes.Modes.TRAINING,
                               inference_batch_size=1,
                               padding='causal')

        # mirror the window choice of the layer under test
        if stft_layer.window_type == 'hann_tf':
            synthesis_window_fn = tf.signal.hann_window
        else:
            synthesis_window_fn = None

        # prepare input data
        self.input_signal = np.random.rand(1, 120)

        # prepare default tf stft, with the same padding as stft_layer so the
        # two produce comparable frames
        padding_layer = temporal_padding.TemporalPadding(
            padding_size=stft_layer.frame_size - 1, padding=stft_layer.padding)
        # pylint: disable=g-long-lambda
        stft_default_layer = tf.keras.layers.Lambda(
            lambda x: tf.signal.stft(x,
                                     stft_layer.frame_size,
                                     stft_layer.frame_step,
                                     fft_length=stft_layer.fft_size,
                                     window_fn=synthesis_window_fn,
                                     pad_end=False))
        # pylint: enable=g-long-lambda
        input_tf = tf.keras.layers.Input(shape=(self.input_signal.shape[1], ),
                                         batch_size=1)
        net = padding_layer(input_tf)
        net = stft_default_layer(net)

        model_stft = tf.keras.models.Model(input_tf, net)

        # reference output used by the tests
        self.stft_out = model_stft.predict(self.input_signal)
def residual_model(flags,
                   cnn_filters,
                   cnn_kernel_size,
                   cnn_act,
                   cnn_use_bias,
                   cnn_padding,
                   delay_also_in_non_streaming,
                   dilation=1):
    """Toy deep convolutional model with residual connections.

    It can be used for speech enhancement.

    Args:
        flags: model and data settings
        cnn_filters: list of filters in conv layer
        cnn_kernel_size: list of kernel_size in conv layer
        cnn_act: list of activation functions in conv layer
        cnn_use_bias: list of use_bias in conv layer
        cnn_padding: list of padding in conv layer
        delay_also_in_non_streaming: Whether to apply delay also in
            non-streaming.
        dilation: dilation applied on all conv layers

    Returns:
        Keras model and sum delay

    Raises:
        ValueError: if any of input list has different length from any other
            or padding in not [same, causal]
    """
    layer_params = [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                    cnn_padding]
    if len({len(p) for p in layer_params}) > 1:
        raise ValueError('all input lists have to be the same length')

    # deep conv model for speech enhancement: trainable in non-streaming
    # mode, convertible to streaming mode
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    # add a feature dimension: [batch, samples] -> [batch, samples, 1]
    net = tf.keras.backend.expand_dims(input_audio)

    sum_delay = 0
    for filters, kernel_size, activation, use_bias, padding in zip(
            *layer_params):
        time_buffer_size = dilation * (kernel_size - 1)

        if padding == 'causal':
            # with causal padding the residual branch needs no alignment
            net_residual = net
        elif padding == 'same':
            # with 'same' padding the residual branch must be delayed so it
            # lines up with the conv output in streaming mode
            delay_val = time_buffer_size // 2
            net_residual = delay.Delay(
                delay=delay_val,
                also_in_non_streaming=delay_also_in_non_streaming)(net)
            sum_delay += delay_val
        else:
            raise ValueError('wrong padding mode ', padding)

        # padding is decoupled from the conv layer, which makes the
        # streaming conversion simpler
        net = temporal_padding.TemporalPadding(
            padding='causal' if delay_also_in_non_streaming else padding,
            padding_size=time_buffer_size)(net)

        # ring buffer in streaming mode, lambda x during training
        net = stream.Stream(cell=tf.identity,
                            ring_buffer_size_in_time_dim=time_buffer_size,
                            use_one_step=False,
                            pad_time_dim=None)(net)

        net = tf.keras.layers.Conv1D(filters=filters,
                                     kernel_size=kernel_size,
                                     activation=activation,
                                     use_bias=use_bias,
                                     padding='valid')(net)  # must stay valid!

        net = tf.keras.layers.Add()([net, net_residual])

    return tf.keras.Model(input_audio, net), sum_delay
def residual_model(flags, cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                   cnn_padding):
    """Toy deep convolutional model with residual connections.

    It can be used for speech enhancement.

    Args:
        flags: model and data settings
        cnn_filters: list of filters in conv layer
        cnn_kernel_size: list of kernel_size in conv layer
        cnn_act: list of activation functions in conv layer
        cnn_use_bias: list of use_bias in conv layer
        cnn_padding: list of padding in conv layer

    Returns:
        Keras model

    Raises:
        ValueError: if any of input list has different length from any other
    """
    layer_params = [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                    cnn_padding]
    if len({len(p) for p in layer_params}) > 1:
        raise ValueError('all input lists have to be the same length')

    # deep conv model for speech enhancement: trainable in non-streaming
    # mode, convertible to streaming mode
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    # add a feature dimension: [batch, samples] -> [batch, samples, 1]
    net = tf.keras.backend.expand_dims(input_audio)

    for filters, kernel_size, activation, use_bias, padding in zip(
            *layer_params):
        ring_buffer_size_in_time_dim = kernel_size - 1

        # ring buffer in streaming mode, lambda x during training
        net = stream.Stream(
            cell=tf.identity,
            ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim,
            use_one_step=False,
            pad_time_dim=None)(net)

        # the residual branch needs the conv kernel size in the time dim and
        # the padding mode used to pad the data in the time dim
        net_residual = residual.Residual(
            padding=padding,
            kernel_size_time=ring_buffer_size_in_time_dim + 1)(net)

        # padding is decoupled from the conv layer, which makes the
        # streaming conversion simpler
        net = temporal_padding.TemporalPadding(
            padding=padding, padding_size=ring_buffer_size_in_time_dim)(net)

        net = tf.keras.layers.Conv1D(filters=filters,
                                     kernel_size=kernel_size,
                                     activation=activation,
                                     use_bias=use_bias,
                                     padding='valid')(net)  # must stay valid!

        net = tf.keras.layers.Add()([net, net_residual])

    return tf.keras.Model(input_audio, net)