Example #1
def keyword_marvin_v3_vl_0_4(input_shape=(16000,), data_settings=None, dropout=0.2):
    
    assert data_settings.window_size_ms == 30.0
    assert data_settings.window_stride_ms == 10.0
    assert data_settings.dct_num_features == 40
    assert data_settings.mel_num_bins == 80
    assert data_settings.background_volume == 0.4
    assert data_settings.mel_upper_edge_hertz == 7000
    assert data_settings.wanted_words == 'marvin'
    
    X_input = tf.keras.Input(input_shape)
    # Speech feature extraction frontend.
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)
    
    # Stack of five SVDF blocks.
    X = svdf.Svdf(
        units1=84, memory_size=12, units2=32, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_1')(X)

    X = svdf.Svdf(
        units1=84, memory_size=12, units2=32, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_2')(X)
    X = svdf.Svdf(
        units1=84, memory_size=12, units2=32, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_3')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_4')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_5')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)
    

    # Create model
    model = tf.keras.models.Model(inputs=X_input, outputs=X, name='keyword_marvin_v3_vl_0_4')

    return model
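
The asserts at the top of this builder document exactly which data_settings fields it expects. Below is a minimal usage sketch, assuming the module-level imports used by the function (tf, speech_features, svdf, Stream) are available; the SimpleNamespace container and the label_count value are illustrative, not part of the original code.

from types import SimpleNamespace

# Illustrative settings object satisfying the asserts above; label_count is a
# placeholder for the number of target classes.
data_settings = SimpleNamespace(
    window_size_ms=30.0,
    window_stride_ms=10.0,
    dct_num_features=40,
    mel_num_bins=80,
    background_volume=0.4,
    mel_upper_edge_hertz=7000,
    wanted_words='marvin',
    label_count=2,
)

model = keyword_marvin_v3_vl_0_4(data_settings=data_settings)
model.summary()
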
Example #2
def E2E_1stage_v7(input_shape=(16000,), data_settings=None, dropout=0.5):
    data_settings.window_size_ms = 40.0
    data_settings.window_stride_ms = 20.0
    data_settings.dct_num_features = 40
    data_settings.mel_num_bins = 80
    data_settings.mel_upper_edge_hertz = 7000
    
    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)
    
    X = svdf.Svdf(
        units1=224, memory_size=12, units2=56, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_1')(X)

    X = svdf.Svdf(
        units1=224, memory_size=12, units2=56, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_2')(X)
    X = svdf.Svdf(
        units1=224, memory_size=12, units2=56, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_3')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_4')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_5')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)
    

    # Create model
    model = tf.keras.models.Model(inputs=X_input, outputs=X, name='E2E_1stage_v7')

    return model
Example #3
    def test_streaming_inference_internal_state(self):
        output_non_stream_np, _ = self._run_non_stream_model()

        mode = Modes.STREAM_INTERNAL_STATE_INFERENCE
        input_tf = tf.keras.layers.Input(shape=(
            1,
            self.input_data.shape[2],
        ))

        svdf_layer = svdf.Svdf(units1=self.weights[0].shape[1],
                               memory_size=self.memory_size,
                               units2=self.weights[3].shape[1],
                               activation="linear",
                               inference_batch_size=self.batch_size,
                               mode=mode)
        output_tf = svdf_layer(inputs=input_tf)

        input_states_np = np.zeros(
            [self.batch_size, self.memory_size, self.weights[1].shape[-1]])

        svdf_layer.dense1.set_weights([self.weights[0]])
        svdf_layer.depth_cnn1.set_weights(
            [self.weights[1], self.weights[2], input_states_np])
        svdf_layer.dense2.set_weights([self.weights[3], self.weights[4]])
        model = tf.keras.models.Model(input_tf, output_tf)

        for i in range(
                self.input_data.shape[1]):  # loop over every element in time
            input_batch_np = self.input_data[:, i, :]
            input_batch_np = np.expand_dims(input_batch_np, 1)
            output_np = model.predict(input_batch_np)
            for b in range(self.input_data.shape[0]):  # loop over batch
                self.assertAllClose(output_np[b][0],
                                    output_non_stream_np[b][i])
Example #4
    def _run_non_stream_model(self):
        # The model below expects that input_data is already initialized in
        # tu.TestBase.setUp; by default, input_data should have 3 dimensions.
        # The size of each dimension is constant and is defined by self.weights.
        mode = Modes.TRAINING
        input_tf = tf.keras.layers.Input(shape=(
            None,
            self.input_data.shape[2],
        ))

        svdf_layer = svdf.Svdf(units1=self.weights[0].shape[1],
                               memory_size=self.memory_size,
                               units2=self.weights[3].shape[1],
                               activation="linear",
                               inference_batch_size=self.batch_size,
                               mode=mode)
        output_tf = svdf_layer(inputs=input_tf)
        svdf_layer.dense1.set_weights([self.weights[0]])
        svdf_layer.depth_cnn1.set_weights([self.weights[1], self.weights[2]])
        svdf_layer.dense2.set_weights([self.weights[3], self.weights[4]])

        model_tf = tf.keras.models.Model(input_tf, output_tf)

        # run inference in non streaming mode
        output_non_stream_np = model_tf.predict(self.input_data)
        return output_non_stream_np, model_tf
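
Taken together, Examples #3 and #4 implement a common pattern for validating streaming layers: run the model once over the whole sequence, then feed a streaming copy one frame at a time and check that the two outputs agree. Below is a condensed, library-agnostic sketch of that comparison loop; the function name and tolerance are illustrative, not part of the test code.

import numpy as np

def assert_streaming_matches_non_streaming(non_stream_model, stream_model,
                                           input_data, atol=1e-6):
    # input_data has shape [batch, time, feature]; stream_model is expected to
    # accept [batch, 1, feature] inputs and to keep its state internally
    # between calls, as in the internal-state test above.
    expected = non_stream_model.predict(input_data)     # [batch, time, ...]
    for t in range(input_data.shape[1]):                # loop over time steps
        frame = np.expand_dims(input_data[:, t, :], 1)  # [batch, 1, feature]
        streamed = stream_model.predict(frame)
        np.testing.assert_allclose(streamed[:, 0], expected[:, t], atol=atol)
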
Example #5
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(
          input_audio)

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          parse(flags.svdf_units1), parse(flags.svdf_memory_size),
          parse(flags.svdf_units2), parse(flags.svdf_dropout),
          parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % i)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
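
This builder, and the variants in Examples #6 and #9 below (where the same helper is imported as utils.parse), relies on a parse helper to turn a comma-separated flag string into a Python list so that each list position configures one svdf.Svdf layer. A minimal stand-in with the behavior these loops assume is sketched here; the real helper lives in the library's utils module and may differ in detail.

import ast

def parse(text):
  # Minimal stand-in: turns '256,256,-1' into [256, 256, -1] and
  # "'relu','relu'" into ['relu', 'relu']. Shown only to make the
  # zip(...) loops above and below self-explanatory.
  if not text:
    return []
  return list(ast.literal_eval('[' + text + ']'))
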
Example #6
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # It is a self-contained model; the user only needs to feed raw audio.
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
          utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
          utils.parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % i)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
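
A sketch of the kind of flag values the zip(...) loop above consumes. The concrete numbers are illustrative only (they are not the library defaults), and every string must parse to a list of the same length because each position configures one SVDF layer.

from types import SimpleNamespace

# Illustrative hyperparameter strings; one list position per svdf.Svdf layer.
svdf_flags = SimpleNamespace(
    svdf_units1='256,256,256',        # first projection size per layer
    svdf_memory_size='4,10,10',       # temporal filter length per layer
    svdf_units2='128,128,-1',         # second projection (-1 in the last layers above)
    svdf_dropout='0.0,0.0,0.0',
    svdf_act="'relu','relu','relu'",  # quoted so parsing yields strings
    svdf_pad=0,
)
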
Example #7
def E2E_1stage_v2(input_shape=(16000, ), data_settings=None, dropout=0.2):
    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms)(X_input)

    X = svdf.Svdf(units1=256,
                  memory_size=8,
                  units2=64,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_1')(X)

    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=64,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_2')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=128,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_3')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=128,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_4')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=128,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_5')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=-1,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_6')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

    # Create model
    model = tf.keras.models.Model(inputs=X_input,
                                  outputs=X,
                                  name='E2E_1stage_v2')

    return model
Example #8
def E2E_1stage_v9(input_shape=(16000,), data_settings=None, dropout=0.5):
    assert data_settings.wanted_words == 'on,off,up,down,zero,one,two,three,four,five,six,seven,eight,nine'
    assert data_settings.window_size_ms == 40.0
    assert data_settings.window_stride_ms == 20.0
    assert data_settings.dct_num_features == 40
    assert data_settings.mel_num_bins == 80
    assert data_settings.mel_upper_edge_hertz == 7000
    
    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)
    
    X = svdf.Svdf(
        units1=192, memory_size=4, units2=96, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_1')(X)

    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_2')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_3')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_4')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_5')(X)

    X = svdf.Svdf(
        units1=192, memory_size=10, units2=-1, dropout=dropout,
        activation='relu',
        pad=0,
        name='svdf_6')(X)


    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)
    

    # Create model
    model = tf.keras.models.Model(inputs=X_input, outputs=X, name='E2E_1stage_v9')

    return model
Example #9
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      speech_features.SpeechFeatures.get_params(flags))(
          input_audio)

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          parse(flags.svdf_units1), parse(flags.svdf_memory_size),
          parse(flags.svdf_units2), parse(flags.svdf_dropout),
          parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % i)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
Example #10
def model(flags):
    """SVDF model with residual connections.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  In addition we added residual connection
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # It is a self-contained model; the user only needs to feed raw audio.
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    blocks_pool = parse(flags.blocks_pool)
    if len(blocks_pool) != 3:
        raise ValueError('number of pooling blocks has to be 3, but got: %d' %
                         len(blocks_pool))

    # for streaming mode it is better to use causal padding
    padding = 'causal' if flags.svdf_pad else 'valid'

    # first residual block
    number_of_blocks = len(parse(flags.block1_units1))
    activations = [flags.activation] * number_of_blocks
    activations[-1] = 'linear'  # last layer is linear
    residual = net
    for i, (units1, memory_size, activation) in enumerate(
            zip(parse(flags.block1_units1), parse(flags.block1_memory_size),
                activations)):
        # [batch, time, feature]
        net = svdf.Svdf(units1=units1,
                        memory_size=memory_size,
                        units2=-1,
                        dropout=flags.svdf_dropout,
                        activation=activation,
                        pad=padding,
                        use_bias=flags.svdf_use_bias,
                        use_batch_norm=flags.use_batch_norm,
                        bn_scale=flags.bn_scale,
                        name='svdf_1_%d' % i)(net)

    # number of channels in the last layer
    units1_last = parse(flags.block1_units1)[-1]

    # equivalent to 1x1 convolution
    residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
    residual = tf.keras.layers.BatchNormalization(
        scale=flags.bn_scale)(residual)

    # residual connection
    net = tf.keras.layers.Add()([net, residual])
    # [batch, time, feature]
    net = tf.keras.layers.Activation(flags.activation)(net)
    net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[0],
                                    padding='valid')(net)

    # second residual block
    number_of_blocks = len(parse(flags.block2_units1))
    activations = [flags.activation] * number_of_blocks
    activations[-1] = 'linear'  # last layer is linear
    residual = net
    for i, (units1, memory_size, activation) in enumerate(
            zip(parse(flags.block2_units1), parse(flags.block2_memory_size),
                activations)):
        # [batch, time, feature]
        net = svdf.Svdf(units1=units1,
                        memory_size=memory_size,
                        units2=-1,
                        dropout=flags.svdf_dropout,
                        activation=activation,
                        pad=padding,
                        use_bias=flags.svdf_use_bias,
                        use_batch_norm=flags.use_batch_norm,
                        bn_scale=flags.bn_scale,
                        name='svdf_2_%d' % i)(net)

    # number of channels in the last layer
    units1_last = parse(flags.block2_units1)[-1]

    # equivalent to 1x1 convolution
    residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
    residual = tf.keras.layers.BatchNormalization(
        scale=flags.bn_scale)(residual)

    # residual connection
    net = tf.keras.layers.Add()([net, residual])
    net = tf.keras.layers.Activation(flags.activation)(net)
    # [batch, time, feature]
    net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[1],
                                    padding='valid')(net)

    # third residual block
    number_of_blocks = len(parse(flags.block3_units1))
    activations = [flags.activation] * number_of_blocks
    activations[-1] = 'linear'  # last layer is linear
    residual = net
    for i, (units1, memory_size, activation) in enumerate(
            zip(parse(flags.block3_units1), parse(flags.block3_memory_size),
                activations)):
        net = svdf.Svdf(units1=units1,
                        memory_size=memory_size,
                        units2=-1,
                        dropout=flags.svdf_dropout,
                        activation=activation,
                        pad=padding,
                        use_bias=flags.svdf_use_bias,
                        use_batch_norm=flags.use_batch_norm,
                        bn_scale=flags.bn_scale,
                        name='svdf_3_%d' % i)(net)

    # number of channels in the last layer
    units1_last = parse(flags.block3_units1)[-1]

    # equivalent to 1x1 convolution
    residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
    residual = tf.keras.layers.BatchNormalization(
        scale=flags.bn_scale)(residual)

    # residual connection
    net = tf.keras.layers.Add()([net, residual])
    net = tf.keras.layers.Activation(flags.activation)(net)
    net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[2],
                                    padding='valid')(net)
    # [batch, time, feature]

    # convert all feature to one vector
    if flags.flatten:
        net = tf.keras.layers.Flatten()(net)
    else:
        net = tf.keras.layers.GlobalAveragePooling1D()(net)

    # [batch, feature]
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units in parse(flags.units2):
        net = tf.keras.layers.Dense(units=units,
                                    activation=flags.activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
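
The three residual blocks above repeat the same skip-connection recipe: project the block input to the width of the block's last SVDF layer with a bias-free Dense layer (equivalent to a 1x1 convolution over time), batch-normalize it, add it to the block output, apply the activation, and max-pool over time. Below is a sketch of that recurring piece in isolation; the function name is illustrative, and bn_scale, activation, and pool_stride mirror the flags used above.

import tensorflow as tf

def residual_join(block_output, block_input, activation='relu',
                  bn_scale=False, pool_stride=2):
    # Project the skip path to the block output's channel count
    # (equivalent to a 1x1 convolution), as in each block above.
    channels = block_output.shape[-1]
    residual = tf.keras.layers.Dense(channels, use_bias=False)(block_input)
    residual = tf.keras.layers.BatchNormalization(scale=bn_scale)(residual)

    # Residual connection, activation, then temporal max pooling.
    net = tf.keras.layers.Add()([block_output, residual])
    net = tf.keras.layers.Activation(activation)(net)
    net = tf.keras.layers.MaxPool1D(3, strides=pool_stride,
                                    padding='valid')(net)
    return net  # [batch, time, feature]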