コード例 #1
0
ファイル: delay_test.py プロジェクト: zb8c5ek/google-research
def delay_model(flags, time_delay):
    """Build a minimal Keras model whose only layer is a time delay.

    Intended for exercising the delay layer in streaming mode.

    Args:
        flags: model and data settings; `desired_samples` and `batch_size`
            are read to shape the input
        time_delay: delay in time dim, forwarded to `delay.Delay`

    Returns:
        Keras model mapping the audio input to the output of the delay layer
    """
    audio_in = tf.keras.layers.Input(
        batch_size=flags.batch_size,
        shape=(flags.desired_samples, ),
    )
    # add one extra axis to the input before handing it to the delay layer
    expanded = tf.keras.backend.expand_dims(audio_in)
    delayed = delay.Delay(delay=time_delay)(expanded)
    return tf.keras.Model(audio_in, delayed)
コード例 #2
0
def residual_model(flags,
                   cnn_filters,
                   cnn_kernel_size,
                   cnn_act,
                   cnn_use_bias,
                   cnn_padding,
                   delay_also_in_non_streaming,
                   dilation=1):
    """Toy deep convolutional model with residual connections.

    It can be used for speech enhancement. It is an example of a deep conv
    model which can be trained in non streaming mode and converted to
    streaming mode.

    Args:
        flags: model and data settings; `desired_samples` and `batch_size`
            are read to shape the input
        cnn_filters: list of filters in conv layer
        cnn_kernel_size: list of kernel_size in conv layer
        cnn_act: list of activation functions in conv layer
        cnn_use_bias: list of use_bias in conv layer
        cnn_padding: list of padding in conv layer, each 'same' or 'causal'
        delay_also_in_non_streaming: forwarded to `delay.Delay` as
            `also_in_non_streaming`; when truthy the temporal padding is
            forced to 'causal' regardless of the per-layer padding
        dilation: dilation rate applied on all conv layers

    Returns:
        Tuple `(model, sum_delay)`: the Keras model and the sum of the delays
        inserted on the residual branches of all 'same'-padded layers.

    Raises:
        ValueError: if any of input list has different length from any other
            or padding is not in [same, causal]. Both checks run before any
            layer is created.
    """
    layer_specs = [
        cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding
    ]
    if any(len(spec) != len(cnn_filters) for spec in layer_specs):
        raise ValueError('all input lists have to be the same length')

    # fail fast: reject unsupported padding before building any part of the
    # graph, and report it as one readable message instead of an args tuple
    for padding in cnn_padding:
        if padding not in ('same', 'causal'):
            raise ValueError('wrong padding mode {}'.format(padding))

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    net = input_audio

    sum_delay = 0
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, use_bias, padding in zip(
            cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
        # extra time steps (beyond the current one) covered by a dilated kernel
        time_buffer_size = dilation * (kernel_size - 1)

        if padding == 'same':
            # residual connection in streaming mode needs delay with 'same'
            # padding
            delay_val = time_buffer_size // 2
            net_residual = delay.Delay(
                delay=delay_val,
                also_in_non_streaming=delay_also_in_non_streaming)(net)
            sum_delay += delay_val
        else:
            # residual connection is simple with 'causal' padding
            net_residual = net

        # it is easier to convert model to streaming mode when padding function
        # is decoupled from conv layer
        net = temporal_padding.TemporalPadding(
            padding='causal' if delay_also_in_non_streaming else padding,
            padding_size=time_buffer_size)(net)

        # it is a ring buffer in streaming mode and lambda x during training
        net = stream.Stream(cell=tf.identity,
                            ring_buffer_size_in_time_dim=time_buffer_size,
                            use_one_step=False,
                            pad_time_dim=None)(net)

        # padding has to be valid: padding was already applied above.
        # The padding and ring buffer are sized for a dilated kernel, so the
        # conv itself must be dilated too; otherwise for dilation > 1 its
        # output is longer than the residual branch and Add cannot merge them.
        net = tf.keras.layers.Conv1D(filters=filters,
                                     kernel_size=kernel_size,
                                     dilation_rate=dilation,
                                     activation=activation,
                                     use_bias=use_bias,
                                     padding='valid')(net)

        net = tf.keras.layers.Add()([net, net_residual])

    return tf.keras.Model(input_audio, net), sum_delay
コード例 #3
0
def conv_model(flags,
               cnn_filters,
               cnn_kernel_size,
               cnn_act,
               cnn_use_bias,
               cnn_padding,
               dilation=1):
    """Toy convolutional model with sequence of convs with different paddings.

    It can be used for speech enhancement. It is an example of a deep conv
    model which can be trained in non streaming mode and converted to
    streaming mode.

    Args:
        flags: model and data settings; `desired_samples` and `batch_size`
            are read to shape the input
        cnn_filters: list of filters in conv layer
        cnn_kernel_size: list of kernel_size in conv layer
        cnn_act: list of activation functions in conv layer
        cnn_use_bias: list of use_bias in conv layer
        cnn_padding: list of padding in conv layer, each 'same' or 'causal'
        dilation: dilation rate applied on all conv layers

    Returns:
        Tuple `(model, sum_delay, sum_shift)`: the Keras model, the delay
        accumulated over 'same'-padded layers (twice the per-layer delay
        value), and the sum of kernel sizes over 'causal'-padded layers.

    Raises:
        ValueError: if any of input list has different length from any other
            or padding is not in [same, causal]. Both checks run before any
            layer is created.
    """
    layer_specs = [
        cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding
    ]
    if any(len(spec) != len(cnn_filters) for spec in layer_specs):
        raise ValueError('all input lists have to be the same length')

    # fail fast: reject unsupported padding before building any part of the
    # graph, and report it as one readable message instead of an args tuple
    for padding in cnn_padding:
        if padding not in ('same', 'causal'):
            raise ValueError('wrong padding mode {}'.format(padding))

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    net = input_audio

    sum_delay = 0
    sum_shift = 0
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, use_bias, padding in zip(
            cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
        # extra time steps (beyond the current one) covered by a dilated kernel
        time_buffer_size = dilation * (kernel_size - 1)

        if padding == 'same':
            # need a delay with 'same' padding in streaming mode
            delay_val = time_buffer_size // 2
            net = delay.Delay(delay=delay_val)(net)
            # NOTE(review): the delay is counted twice per layer, presumably
            # once for the Delay layer and once for the 'same' conv itself in
            # streaming mode - confirm against the callers' expectations
            sum_delay += delay_val * 2
        else:
            # 'causal' padding: no delay layer, only the shift is tracked
            sum_shift += kernel_size

        # it is a ring buffer in streaming mode and lambda x during training.
        # The delay above is derived from the dilated kernel extent, so the
        # conv must use the same dilation for the two to stay consistent.
        net = stream.Stream(cell=tf.keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            dilation_rate=dilation,
            activation=activation,
            use_bias=use_bias,
            padding='valid'),
                            use_one_step=False,
                            pad_time_dim=padding)(net)

    return tf.keras.Model(input_audio, net), sum_delay, sum_shift
コード例 #4
0
def transposed_conv_model(flags,
                          cnn_filters,
                          cnn_kernel_size,
                          cnn_act,
                          cnn_use_bias,
                          cnn_paddings,
                          trans_paddings):
  """Toy deep convolutional model with transposed convolutions.

  It can be used for speech enhancement. It is an example of a deep conv
  model which can be trained in non streaming mode and converted to
  streaming mode. Every stage applies a strided transposed convolution
  followed by a regular 1D convolution.

  Args:
    flags: model and data settings; `desired_samples` and `batch_size` are
      read to shape the input
    cnn_filters: list of filters for conv layer (also used for the
      transposed conv layer of the same stage)
    cnn_kernel_size: list of kernel_size for conv layer
    cnn_act: list of activation functions for conv layer
    cnn_use_bias: list of use_bias for conv layer
    cnn_paddings: list of padding for conv layer, each 'same' or 'causal'
    trans_paddings: list of padding for transposed conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of input list has different length from any other
                or conv padding is not in [same, causal]. Both checks run
                before any layer is created.
  """

  layer_specs = [
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_paddings,
      trans_paddings
  ]
  if any(len(spec) != len(cnn_filters) for spec in layer_specs):
    raise ValueError('all input lists have to be the same length')

  # fail fast: reject unsupported conv padding before building any part of
  # the graph, and report it as one readable message instead of an args tuple
  for padding in cnn_paddings:
    if padding not in ('same', 'causal'):
      raise ValueError('wrong padding mode {}'.format(padding))

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio

  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, use_bias, padding, trans_padding in zip(
      cnn_filters, cnn_kernel_size,
      cnn_act, cnn_use_bias, cnn_paddings, trans_paddings):
    time_buffer_size = kernel_size - 1

    # Conv2DTranspose is applied on a temporarily inserted axis (-2), which
    # is squeezed out again right after the layer
    net = tf.keras.backend.expand_dims(net, axis=-2)
    net = stream.Stream(
        cell=tf.keras.layers.Conv2DTranspose(
            filters=filters, kernel_size=(3, 1),
            strides=(2, 1), padding='valid'),
        pad_time_dim=trans_padding)(net)
    net = tf.keras.backend.squeeze(net, axis=-2)

    if padding == 'same':
      # model looking into future, so introducing delay for streaming mode
      net = delay.Delay(delay=time_buffer_size // 2)(net)

    # it is a ring buffer in streaming mode and lambda x during training
    net = stream.Stream(
        cell=tf.keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            use_bias=use_bias,
            padding='valid'),
        use_one_step=False,
        pad_time_dim=padding)(net)

  return tf.keras.Model(input_audio, net)