Beispiel #1
0
    def test_tf_non_streaming_vs_streaming_inference_internal_state(self):
        """Checks streaming features (internal state) against non streaming ones.

        A non streaming SpeechFeatures model processes the whole signal in one
        call. A streaming SpeechFeatures model (internal state mode) receives
        the same signal in chunks of frame_step samples. Every streamed frame
        has to match the corresponding frame of the non streaming output.
        """
        speech_params = speech_features.SpeechFeatures.get_params(self.params)
        batch_size = self.inference_batch_size

        # reference model: consumes the whole signal (data_size samples) at once
        full_layer = speech_features.SpeechFeatures(
            speech_params, modes.Modes.NON_STREAM_INFERENCE, batch_size)
        full_input = tf.keras.layers.Input(shape=(self.data_size, ),
                                           batch_size=batch_size,
                                           dtype=tf.float32)
        model_tf = tf.keras.models.Model(full_input, full_layer(full_input))
        output_tf = model_tf.predict(self.signal)

        # streaming model: consumes frame_step new samples per call
        stream_layer = speech_features.SpeechFeatures(
            speech_params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE,
            batch_size)
        stream_input = tf.keras.layers.Input(shape=(self.frame_step, ),
                                             batch_size=batch_size,
                                             dtype=tf.float32)
        stream_output = stream_layer(stream_input)

        # preload the framing state: frame_step zeros followed by the first
        # (frame_size - frame_step) samples of the signal
        framer = stream_layer.data_frame
        history = self.signal[:, 0:framer.frame_size - framer.frame_step]
        leading_zeros = np.zeros(shape=(1, framer.frame_step),
                                 dtype=np.float32)
        framer.set_weights([np.concatenate((leading_zeros, history), axis=1)])
        model_stream = tf.keras.models.Model(stream_input, stream_output)

        # feed consecutive chunks of frame_step samples; the first chunk
        # starts at sample (frame_size - frame_step) and the last one is the
        # last chunk that still fits completely into data_size
        first_start = self.frame_size - self.frame_step
        last_start = self.data_size - self.frame_step
        streamed_frames = []
        for chunk_start in range(first_start, last_start + 1,
                                 self.frame_step):
            chunk = self.signal[:, chunk_start:chunk_start + self.frame_step]
            streamed_frames.append(model_stream.predict(chunk))

        # the comparison below would pass vacuously without any frames
        self.assertNotEmpty(streamed_frames)
        for index, frame in enumerate(streamed_frames):
            self.assertAllClose(frame[0][0],
                                output_tf[0][index],
                                rtol=1e-4,
                                atol=1e-4)
Beispiel #2
0
def E2E_1stage_v2(input_shape=(16000, ), data_settings=None, dropout=0.2):
    """Builds the 'E2E_1stage_v2' Keras model.

    Topology: SpeechFeatures -> six Svdf layers -> streamed Flatten ->
    Dropout -> Dense(label_count).

    Args:
      input_shape: shape handed to tf.keras.Input.
      data_settings: object providing window_size_ms, window_stride_ms and
        label_count. It is dereferenced unconditionally, so the None default
        is not usable as is.
      dropout: value passed to every Svdf layer and to the Dropout layer
        placed before the final Dense layer.

    Returns:
      tf.keras.models.Model named 'E2E_1stage_v2'.
    """
    # (memory_size, units2) for svdf_1 ... svdf_6; units1 is 256 everywhere
    svdf_configs = (
        (8, 64),
        (10, 64),
        (10, 128),
        (10, 128),
        (10, 128),
        (10, -1),
    )

    X_input = tf.keras.Input(input_shape)
    feature_layer = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms)
    X = feature_layer(X_input)

    for index, (memory_size, units2) in enumerate(svdf_configs, start=1):
        svdf_layer = svdf.Svdf(units1=256,
                               memory_size=memory_size,
                               units2=units2,
                               dropout=dropout,
                               activation='relu',
                               pad=0,
                               name='svdf_%d' % index)
        X = svdf_layer(X)

    # classification head
    for head_layer in (
            Stream(cell=tf.keras.layers.Flatten()),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(units=data_settings.label_count),
    ):
        X = head_layer(X)

    return tf.keras.models.Model(inputs=X_input,
                                 outputs=X,
                                 name='E2E_1stage_v2')
Beispiel #3
0
    def test_tf_non_streaming_vs_streaming_inference_external_state(self):
        """Tests non stream inference vs stream inference with external state.

        A non streaming SpeechFeatures model processes the whole signal in one
        call. It is then converted into a streaming model with external state,
        which receives the signal in chunks of frame_step samples together
        with the state returned by the previous call. Every streamed frame has
        to match the corresponding frame of the non streaming output.
        """
        speech_params = speech_features.SpeechFeatures.get_params(self.params)
        mode = modes.Modes.NON_STREAM_INFERENCE
        # non streaming feature extraction layer
        mel_speech_tf = speech_features.SpeechFeatures(
            speech_params, mode, self.inference_batch_size)
        # it receives all data with size: data_size
        input1 = tf.keras.layers.Input(shape=(self.data_size, ),
                                       batch_size=self.inference_batch_size,
                                       dtype=tf.float32)
        output1 = mel_speech_tf(input1)
        model_tf = tf.keras.models.Model(input1, output1)

        # generate frames for the whole signal (no streaming here)
        output_tf = model_tf.predict(self.signal)

        # input data for streaming mode: frame_step samples per call
        input_tensors = [
            tf.keras.layers.Input(shape=(self.frame_step, ),
                                  batch_size=self.inference_batch_size,
                                  dtype=tf.float32)
        ]

        # convert non streaming trainable model to
        # streaming inference with external state
        mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
        model_stream = utils.convert_to_inference_model(
            model_tf, input_tensors, mode)

        # initial external state: frame_step zeros followed by the first
        # (frame_size - frame_step) samples of the signal
        pre_state = self.signal[:, 0:self.frame_size - self.frame_step]
        state2 = np.concatenate((np.zeros(shape=(1, self.frame_step),
                                          dtype=np.float32), pre_state),
                                axis=1)

        # run streaming frames extraction
        start = self.frame_size - self.frame_step
        end = self.frame_size
        streamed_frames = []
        while end <= self.data_size:
            # next data update
            stream_update = self.signal[:, start:end]

            # get new frame from stream of data; the returned state is fed
            # back into the model on the next call
            output_frame, state2 = model_stream.predict(
                [stream_update, state2])
            streamed_frames.append(output_frame)

            # update indexes of streamed updates
            start = end
            end = start + self.frame_step

        # without this check the comparison loop below would pass vacuously
        # if no frame had been streamed at all
        self.assertNotEmpty(streamed_frames)

        # compare streaming vs non streaming frames extraction
        for i, streamed_frame in enumerate(streamed_frames):
            self.assertAllClose(streamed_frame[0][0],
                                output_tf[0][i],
                                rtol=1e-4,
                                atol=1e-4)
Beispiel #4
0
def model(flags):
    """Convolutional recurrent neural network (CRNN) model.

    It is based on paper
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
    https://arxiv.org/pdf/1703.05390.pdf
    Represented as sequence of Conv, RNN/GRU, FC layers.
    Model topology is similar with "Hello Edge: Keyword Spotting on
    Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(shape=input_shape,
                                        batch_size=flags.batch_size)

    if flags.preprocess == 'raw':
        # self contained model: the caller feeds raw audio and the feature
        # extraction is part of the graph
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        hidden = speech_features.SpeechFeatures(feature_params)(input_audio)
    else:
        hidden = input_audio

    # add a trailing dimension for the 2d convolutions that follow
    hidden = tf.keras.backend.expand_dims(hidden)
    conv_specs = zip(parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
                     parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
                     parse(flags.cnn_strides))
    for filters, kernel_size, activation, dilation_rate, strides in conv_specs:
        conv = tf.keras.layers.Conv2D(filters=filters,
                                      kernel_size=kernel_size,
                                      activation=activation,
                                      dilation_rate=dilation_rate,
                                      strides=strides)
        hidden = stream.Stream(cell=conv)(hidden)

    # [batch, time, feature, channels] -> [batch, time, feature * channels]
    # so that GRU/RNN can process it
    conv_shape = hidden.shape
    merged_features = conv_shape[2] * conv_shape[3]
    hidden = tf.keras.layers.Reshape((-1, merged_features))(hidden)

    gru_specs = zip(parse(flags.gru_units), parse(flags.return_sequences))
    for units, return_sequences in gru_specs:
        recurrent = gru.GRU(units=units,
                            return_sequences=return_sequences,
                            stateful=flags.stateful)
        hidden = recurrent(hidden)

    hidden = stream.Stream(cell=tf.keras.layers.Flatten())(hidden)
    hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        hidden = dense(hidden)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(hidden)
    if flags.return_softmax:
        outputs = tf.keras.layers.Activation('softmax')(outputs)
    return tf.keras.Model(input_audio, outputs)
Beispiel #5
0
def E2E_1stage_v9(input_shape=(16000,), data_settings = None, dropout = 0.5):
    """Builds the 'E2E_1stage_v9' Keras model.

    Topology: SpeechFeatures -> six Svdf layers -> streamed Flatten ->
    Dropout -> Dense(label_count).

    Args:
      input_shape: shape handed to tf.keras.Input.
      data_settings: object with the data/feature configuration. The asserts
        below pin the exact configuration this model is meant to be used
        with; label_count defines the size of the output layer.
      dropout: value passed to every Svdf layer and to the Dropout layer
        placed before the final Dense layer.

    Returns:
      tf.keras.models.Model named 'E2E_1stage_v9'.
    """
    # configuration this model version is tied to
    assert data_settings.wanted_words == 'on,off,up,down,zero,one,two,three,four,five,six,seven,eight,nine'
    assert data_settings.window_size_ms == 40.0
    assert data_settings.window_stride_ms == 20.0
    assert data_settings.dct_num_features == 40
    assert data_settings.mel_num_bins == 80
    assert data_settings.mel_upper_edge_hertz == 7000

    # (memory_size, units2) for svdf_1 ... svdf_6; units1 is 192 everywhere
    svdf_configs = (
        (4, 96),
        (10, 96),
        (10, 96),
        (10, 96),
        (10, 96),
        (10, -1),
    )

    X_input = tf.keras.Input(input_shape)
    feature_layer = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)
    X = feature_layer(X_input)

    for index, (memory_size, units2) in enumerate(svdf_configs, start=1):
        svdf_layer = svdf.Svdf(units1=192,
                               memory_size=memory_size,
                               units2=units2,
                               dropout=dropout,
                               activation='relu',
                               pad=0,
                               name='svdf_%d' % index)
        X = svdf_layer(X)

    # classification head
    for head_layer in (
            Stream(cell=tf.keras.layers.Flatten()),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(units=data_settings.label_count),
    ):
        X = head_layer(X)

    return tf.keras.models.Model(inputs=X_input,
                                 outputs=X,
                                 name='E2E_1stage_v9')
Beispiel #6
0
def model(flags):
  """Fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Topology: SpeechFeatures -> Dense stack (units1/act1) -> streamed Flatten
  -> optional MaxPool1D -> Dropout -> Dense stack (units2/act2)
  -> Dense(label_count).

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = feature_layer(input_audio)

  # dense layers applied before the data is flattened
  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    pooling = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')
    # pooling needs a trailing dim: add a fake one, pool, then drop it again
    hidden = tf.keras.backend.expand_dims(hidden, axis=-1)
    hidden = pooling(hidden)
    hidden = tf.keras.backend.squeeze(hidden, axis=-1)

  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
Beispiel #7
0
def model(flags):
  """SVDF model.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Topology: SpeechFeatures -> Svdf stack -> streamed Flatten -> Dropout
  -> Dense stack (units2/act2) -> Dense(label_count).

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = feature_layer(input_audio)

  # one Svdf layer per entry of the (equally indexed) svdf_* flag lists
  svdf_specs = zip(
      parse(flags.svdf_units1), parse(flags.svdf_memory_size),
      parse(flags.svdf_units2), parse(flags.svdf_dropout),
      parse(flags.svdf_act))
  for index, spec in enumerate(svdf_specs):
    units1, memory_size, units2, dropout, activation = spec
    svdf_layer = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % index)
    hidden = svdf_layer(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
Beispiel #8
0
def model(flags):
  """CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

  Topology: SpeechFeatures -> streamed Conv2D stack -> streamed Flatten
  -> Dropout -> Dense stack (units2/act2) -> Dense(label_count).

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = feature_layer(input_audio)

  # add a trailing dimension for the 2d convolutions that follow
  hidden = tf.keras.backend.expand_dims(hidden)
  conv_specs = zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides))
  for filters, kernel_size, activation, dilation_rate, strides in conv_specs:
    conv = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides)
    hidden = Stream(cell=conv)(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
Beispiel #9
0
def model(flags):
  """SVDF model.

  This model is based on decomposition of a densely connected ops
  into low rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Topology: optional SpeechFeatures -> Svdf stack -> streamed Flatten
  -> Dropout -> Dense stack (units2/act2) -> Dense(label_count)
  -> optional softmax.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=input_shape, batch_size=flags.batch_size)

  if flags.preprocess == 'raw':
    # self contained model: the caller feeds raw audio and the feature
    # extraction is part of the graph
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    hidden = speech_features.SpeechFeatures(feature_params)(input_audio)
  else:
    hidden = input_audio

  # for streaming mode it is better to use causal padding
  if flags.svdf_pad:
    padding = 'causal'
  else:
    padding = 'valid'

  # one Svdf layer per entry of the (equally indexed) svdf_* flag lists
  svdf_specs = zip(
      utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
      utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
      utils.parse(flags.svdf_act))
  for index, spec in enumerate(svdf_specs):
    units1, memory_size, units2, dropout, activation = spec
    svdf_layer = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % index)
    hidden = svdf_layer(hidden)

  hidden = stream.Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  dense_specs = zip(utils.parse(flags.units2), utils.parse(flags.act2))
  for units, activation in dense_specs:
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  outputs = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  if flags.return_softmax:
    outputs = tf.keras.layers.Activation('softmax')(outputs)
  return tf.keras.Model(input_audio, outputs)
Beispiel #10
0
def keyword_marvin_v3_vl_0_4(input_shape=(16000,), data_settings = None, dropout = 0.2):
    """Builds the 'keyword_marvin_v3_vl_0_4' Keras model.

    Topology: SpeechFeatures -> five Svdf layers -> streamed Flatten ->
    Dropout -> Dense(label_count).

    Args:
      input_shape: shape handed to tf.keras.Input.
      data_settings: object with the data/feature configuration. The asserts
        below pin the exact configuration this model is meant to be used
        with; label_count defines the size of the output layer.
      dropout: value passed to every Svdf layer and to the Dropout layer
        placed before the final Dense layer.

    Returns:
      tf.keras.models.Model named 'keyword_marvin_v3_vl_0_4'.
    """
    # configuration this model version is tied to
    assert data_settings.window_size_ms == 30.0
    assert data_settings.window_stride_ms == 10.0
    assert data_settings.dct_num_features == 40
    assert data_settings.mel_num_bins == 80
    assert data_settings.background_volume == 0.4
    assert data_settings.mel_upper_edge_hertz == 7000
    assert data_settings.wanted_words == 'marvin'

    # (units1, memory_size, units2) for svdf_1 ... svdf_5
    svdf_configs = (
        (84, 12, 32),
        (84, 12, 32),
        (84, 12, 32),
        (32, 32, -1),
        (32, 32, -1),
    )

    X_input = tf.keras.Input(input_shape)
    feature_layer = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)
    X = feature_layer(X_input)

    for index, (units1, memory_size, units2) in enumerate(svdf_configs,
                                                          start=1):
        svdf_layer = svdf.Svdf(units1=units1,
                               memory_size=memory_size,
                               units2=units2,
                               dropout=dropout,
                               activation='relu',
                               pad=0,
                               name='svdf_%d' % index)
        X = svdf_layer(X)

    # classification head
    for head_layer in (
            Stream(cell=tf.keras.layers.Flatten()),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(units=data_settings.label_count),
    ):
        X = head_layer(X)

    return tf.keras.models.Model(inputs=X_input,
                                 outputs=X,
                                 name='keyword_marvin_v3_vl_0_4')
Beispiel #11
0
def model(flags):
    """Temporal Convolution ResNet model.

    It can be configured to reproduce model config as described in the paper
    below
    Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
    https://arxiv.org/pdf/1904.03814.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training

    Raises:
      ValueError: if the per-block flag lists (tc_filters, repeat_tc_convs,
        kernel_sizes, pool_sizes, dilations, residuals) differ in length.
    """
    tc_filters = parse(flags.tc_filters)
    repeat_tc_convs = parse(flags.repeat_tc_convs)
    kernel_sizes = parse(flags.kernel_sizes)
    pool_sizes = parse(flags.pool_sizes)
    dilations = parse(flags.dilations)
    residuals = parse(flags.residuals)

    # every block needs one entry in each list, so all lengths must agree
    distinct_lengths = {
        len(values)
        for values in (repeat_tc_convs, kernel_sizes, pool_sizes, dilations,
                       residuals, tc_filters)
    }
    if len(distinct_lengths) != 1:
        raise ValueError('all input lists have to be the same length')

    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(shape=input_shape,
                                        batch_size=flags.batch_size)

    if flags.preprocess == 'raw':
        # self contained model: the caller feeds raw audio and the feature
        # extraction is part of the graph
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        hidden = speech_features.SpeechFeatures(feature_params)(input_audio)
    else:
        hidden = input_audio

    # make it [batch, time, 1, feature]
    hidden = tf.keras.backend.expand_dims(hidden, axis=2)

    block_specs = zip(tc_filters, repeat_tc_convs, kernel_sizes, pool_sizes,
                      dilations, residuals)
    for filters, repeat, kernel_size, pool_size, dilation, residual in block_specs:
        hidden = resnet_block(hidden, repeat, kernel_size, filters, dilation,
                              residual, flags.padding_in_time, flags.dropout,
                              flags.activation)

        # pooling is applied only when it actually reduces the first
        # pooled axis
        if pool_size > 1:
            pooling = tf.keras.layers.MaxPooling2D((pool_size, 1))
            hidden = pooling(hidden)

    global_pool = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())
    hidden = global_pool(hidden)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(hidden)
    if flags.return_softmax:
        outputs = tf.keras.layers.Activation('softmax')(outputs)
    return tf.keras.Model(input_audio, outputs)
Beispiel #12
0
def model(flags):
    """Inception resnet model.

    It is based on paper:
    Inception-v4, Inception-ResNet and the Impact of
       Residual Connections on Learning https://arxiv.org/abs/1602.07261

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(shape=input_shape,
                                        batch_size=flags.batch_size)

    if flags.preprocess == 'raw':
        # self contained model: the caller feeds raw audio and the feature
        # extraction is part of the graph
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        hidden = speech_features.SpeechFeatures(feature_params)(input_audio)
    else:
        hidden = input_audio

    # [batch, time, feature] -> [batch, time, feature, 1]
    hidden = tf.keras.backend.expand_dims(hidden, axis=-1)

    # stem: separable conv -> batch norm -> relu -> max pooling per entry
    for filters in utils.parse(flags.cnn_filters0):
        stem_layers = (
            tf.keras.layers.SeparableConv2D(filters, (3, 3),
                                            padding='valid',
                                            use_bias=False),
            tf.keras.layers.BatchNormalization(scale=flags.bn_scale),
            tf.keras.layers.Activation('relu'),
            tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2)),
        )
        for layer in stem_layers:
            hidden = layer(hidden)
        # [batch, time, feature, filters]

    # inception resnet blocks, each followed by max pooling
    block_specs = zip(utils.parse(flags.strides), utils.parse(flags.scales),
                      utils.parse(flags.filters_branch0),
                      utils.parse(flags.filters_branch1))
    for stride, scale, filters_branch0, filters_branch1 in block_specs:
        hidden = inception_resnet_block(hidden,
                                        scale,
                                        filters_branch0,
                                        filters_branch1,
                                        bn_scale=flags.bn_scale)
        pooling = tf.keras.layers.MaxPooling2D(3,
                                               strides=stride,
                                               padding='valid')
        hidden = pooling(hidden)
        # [batch, time, feature, filters]

    # [batch, time, feature, filters] -> [batch, filters]
    hidden = tf.keras.layers.GlobalAveragePooling2D()(hidden)
    hidden = tf.keras.layers.Dropout(flags.dropout)(hidden)
    logits = tf.keras.layers.Dense(flags.label_count)(hidden)
    return tf.keras.Model(input_audio, logits)
Beispiel #13
0
def model(flags):
  """Fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Topology: optional SpeechFeatures -> Dense stack (units1/act1)
  -> streamed Flatten -> optional MaxPool1D -> Dropout
  -> Dense stack (units2/act2) -> Dense(label_count) -> optional softmax.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=input_shape, batch_size=flags.batch_size)

  if flags.preprocess == 'raw':
    # self contained model: the caller feeds raw audio and the feature
    # extraction is part of the graph
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    hidden = speech_features.SpeechFeatures(feature_params)(input_audio)
  else:
    hidden = input_audio

  # dense layers applied before the data is flattened
  first_dense_specs = zip(utils.parse(flags.units1), utils.parse(flags.act1))
  for units, activation in first_dense_specs:
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  hidden = stream.Stream(cell=tf.keras.layers.Flatten())(hidden)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    pooling = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')
    # pooling needs a trailing dim: add a fake one, pool, then drop it again
    hidden = tf.keras.backend.expand_dims(hidden, axis=-1)
    hidden = pooling(hidden)
    hidden = tf.keras.backend.squeeze(hidden, axis=-1)

  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  second_dense_specs = zip(utils.parse(flags.units2), utils.parse(flags.act2))
  for units, activation in second_dense_specs:
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  outputs = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  if flags.return_softmax:
    outputs = tf.keras.layers.Activation('softmax')(outputs)
  return tf.keras.Model(input_audio, outputs)
Beispiel #14
0
def model(flags):
  """LSTM model.

  Similar model in papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Topology: SpeechFeatures -> LSTM stack -> streamed Flatten -> Dropout
  -> Dense stack (units1/act1) -> Dense(label_count).

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  feature_layer = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)
  hidden = feature_layer(input_audio)

  # one LSTM layer per entry of the (equally indexed) flag lists
  lstm_specs = zip(
      parse(flags.lstm_units), parse(flags.return_sequences),
      parse(flags.num_proj))
  for units, return_sequences, num_proj in lstm_specs:
    recurrent = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)
    hidden = recurrent(hidden)

  hidden = Stream(cell=tf.keras.layers.Flatten())(hidden)
  hidden = tf.keras.layers.Dropout(rate=flags.dropout1)(hidden)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    hidden = dense(hidden)

  logits = tf.keras.layers.Dense(units=flags.label_count)(hidden)
  return tf.keras.Model(input_audio, logits)
Beispiel #15
0
def E2E_1stage_v7(input_shape=(16000,), data_settings = None, dropout = 0.5):
    """Builds the 'E2E_1stage_v7' Keras model.

    Topology: SpeechFeatures -> five Svdf layers -> streamed Flatten ->
    Dropout -> Dense(label_count).

    NOTE: this function overwrites the feature related fields of
    data_settings in place (window_size_ms, window_stride_ms,
    dct_num_features, mel_num_bins, mel_upper_edge_hertz), so the caller's
    object is modified.

    Args:
      input_shape: shape handed to tf.keras.Input.
      data_settings: mutable settings object; its feature fields are
        overwritten as described above and label_count defines the size of
        the output layer.
      dropout: value passed to every Svdf layer and to the Dropout layer
        placed before the final Dense layer.

    Returns:
      tf.keras.models.Model named 'E2E_1stage_v7'.
    """
    # force the feature configuration this model version is tied to
    data_settings.window_size_ms = 40.0
    data_settings.window_stride_ms = 20.0
    data_settings.dct_num_features = 40
    data_settings.mel_num_bins = 80
    data_settings.mel_upper_edge_hertz = 7000

    # (units1, memory_size, units2) for svdf_1 ... svdf_5
    svdf_configs = (
        (224, 12, 56),
        (224, 12, 56),
        (224, 12, 56),
        (32, 32, -1),
        (32, 32, -1),
    )

    X_input = tf.keras.Input(input_shape)
    feature_layer = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)
    X = feature_layer(X_input)

    for index, (units1, memory_size, units2) in enumerate(svdf_configs,
                                                          start=1):
        svdf_layer = svdf.Svdf(units1=units1,
                               memory_size=memory_size,
                               units2=units2,
                               dropout=dropout,
                               activation='relu',
                               pad=0,
                               name='svdf_%d' % index)
        X = svdf_layer(X)

    # classification head
    for head_layer in (
            Stream(cell=tf.keras.layers.Flatten()),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(units=data_settings.label_count),
    ):
        X = head_layer(X)

    return tf.keras.models.Model(inputs=X_input,
                                 outputs=X,
                                 name='E2E_1stage_v7')
Beispiel #16
0
def model(flags):
    """Build the CNN model.

    Based on the paper:
      Convolutional Neural Networks for Small-footprint Keyword Spotting
      http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
    The topology is similar to "Hello Edge: Keyword Spotting on
    Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: the feature extractor is part of the graph,
        # so the user only needs to feed raw audio.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    net = tf.keras.backend.expand_dims(net)

    # One streaming-wrapped Conv2D per entry of the parsed cnn_* flags.
    conv_specs = zip(
        utils.parse(flags.cnn_filters),
        utils.parse(flags.cnn_kernel_size),
        utils.parse(flags.cnn_act),
        utils.parse(flags.cnn_dilation_rate),
        utils.parse(flags.cnn_strides))
    for filters, kernel_size, activation, dilation_rate, strides in conv_specs:
        conv = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides)
        net = stream.Stream(cell=conv)(net)

    flatten = stream.Stream(cell=tf.keras.layers.Flatten())
    net = flatten(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Fully connected stack configured by units2/act2.
    dense_specs = zip(utils.parse(flags.units2), utils.parse(flags.act2))
    for units, activation in dense_specs:
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        net = dense(net)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        outputs = tf.keras.layers.Activation('softmax')(outputs)
    return tf.keras.Model(input_audio, outputs)
Beispiel #17
0
def model(flags):
  """Build the LSTM model.

  Similar models appear in:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  The topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # Self-contained model: features are computed inside the graph, so the
    # user only needs to feed raw audio.
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    net = speech_features.SpeechFeatures(feature_params)(net)

  # One LSTM layer per entry of the parsed lstm_units/return_sequences/num_proj.
  lstm_specs = zip(
      utils.parse(flags.lstm_units),
      utils.parse(flags.return_sequences),
      utils.parse(flags.num_proj))
  for units, return_sequences, num_proj in lstm_specs:
    recurrent = lstm.LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)
    net = recurrent(net)

  flatten = stream.Stream(cell=tf.keras.layers.Flatten())
  net = flatten(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  # Fully connected stack configured by units1/act1.
  dense_specs = zip(utils.parse(flags.units1), utils.parse(flags.act1))
  for units, activation in dense_specs:
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    net = dense(net)

  outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    outputs = tf.keras.layers.Activation('softmax')(outputs)
  return tf.keras.Model(input_audio, outputs)
Beispiel #18
0
def model(flags):
    """Build the LSTM model.

    It is based on paper https://arxiv.org/pdf/1705.02411.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=(flags.desired_samples,), batch_size=flags.batch_size)

    # Feature extractor configuration, taken field by field from flags.
    feature_kwargs = dict(
        frame_size_ms=flags.window_size_ms,
        frame_step_ms=flags.window_stride_ms,
        sample_rate=flags.sample_rate,
        use_tf_fft=flags.use_tf_fft,
        preemph=flags.preemph,
        window_type=flags.window_type,
        mel_num_bins=flags.mel_num_bins,
        mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
        mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
        mel_non_zero_only=flags.mel_non_zero_only,
        fft_magnitude_squared=flags.fft_magnitude_squared,
        dct_num_features=flags.dct_num_features)
    net = speech_features.SpeechFeatures(**feature_kwargs)(input_audio)

    # One LSTM layer per entry of the parsed recurrent flags.
    lstm_specs = zip(
        parse(flags.lstm_units),
        parse(flags.return_sequences),
        parse(flags.num_proj))
    for units, return_sequences, num_proj in lstm_specs:
        recurrent = LSTM(
            units=units,
            return_sequences=return_sequences,
            stateful=flags.stateful,
            use_peepholes=flags.use_peepholes,
            num_proj=num_proj)
        net = recurrent(net)

    flatten = Stream(cell=tf.keras.layers.Flatten())
    net = flatten(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Fully connected stack configured by units1/act1.
    dense_specs = zip(parse(flags.units1), parse(flags.act1))
    for units, activation in dense_specs:
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        net = dense(net)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, outputs)
Beispiel #19
0
    def test_tf_non_streaming_train(self):
        """Checks that SpeechFeatures in TRAINING mode yields non-empty output.

        A fresh Params instance with sp_time_shift_ms set to 10.0 configures
        the layer; the whole signal is processed in one predict call.
        """
        train_params = Params()
        train_params.sp_time_shift_ms = 10.0

        # Non streaming feature layer built for the training mode.
        feature_layer = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(train_params),
            modes.Modes.TRAINING,
            self.inference_batch_size)

        # The model input covers the full signal length: data_size samples.
        audio_in = tf.keras.layers.Input(
            shape=(self.data_size,),
            batch_size=self.inference_batch_size,
            dtype=tf.float32)
        features_out = feature_layer(audio_in)
        train_model = tf.keras.models.Model(audio_in, features_out)

        # Process the whole signal at once (no streaming here).
        predicted = train_model.predict(self.signal)
        self.assertNotEmpty(predicted)
Beispiel #20
0
def model(flags):
  """Build the SVDF model.

  The model decomposes densely connected ops into low rank filters.
  It is based on the paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  feature_params = speech_features.SpeechFeatures.get_params(flags)
  net = speech_features.SpeechFeatures(feature_params)(input_audio)

  # One SVDF layer per entry of the parsed svdf_* flags, named svdf_<index>.
  svdf_specs = zip(
      parse(flags.svdf_units1),
      parse(flags.svdf_memory_size),
      parse(flags.svdf_units2),
      parse(flags.svdf_dropout),
      parse(flags.svdf_act))
  for index, spec in enumerate(svdf_specs):
    units1, memory_size, units2, dropout, activation = spec
    svdf_layer = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % index)
    net = svdf_layer(net)

  flatten = Stream(cell=tf.keras.layers.Flatten())
  net = flatten(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  # Fully connected stack configured by units2/act2.
  dense_specs = zip(parse(flags.units2), parse(flags.act2))
  for units, activation in dense_specs:
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    net = dense(net)

  outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, outputs)
Beispiel #21
0
def model(flags):
  """Build the LSTM model.

  Similar models appear in:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  feature_params = speech_features.SpeechFeatures.get_params(flags)
  net = speech_features.SpeechFeatures(feature_params)(input_audio)

  # One LSTM layer per entry of the parsed recurrent flags.
  lstm_specs = zip(
      parse(flags.lstm_units),
      parse(flags.return_sequences),
      parse(flags.num_proj))
  for units, return_sequences, num_proj in lstm_specs:
    recurrent = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)
    net = recurrent(net)

  flatten = Stream(cell=tf.keras.layers.Flatten())
  net = flatten(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  # Fully connected stack configured by units1/act1.
  dense_specs = zip(parse(flags.units1), parse(flags.act1))
  for units, activation in dense_specs:
    dense = tf.keras.layers.Dense(units=units, activation=activation)
    net = dense(net)

  outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, outputs)
Beispiel #22
0
def model(flags):
    """Build the CNN model.

    Based on the paper:
      Convolutional Neural Networks for Small-footprint Keyword Spotting
      http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=(flags.desired_samples,), batch_size=flags.batch_size)

    feature_params = speech_features.SpeechFeatures.get_params(flags)
    net = speech_features.SpeechFeatures(feature_params)(input_audio)

    net = tf.keras.backend.expand_dims(net)

    # One streaming-wrapped Conv2D per entry of the parsed cnn_* flags.
    conv_specs = zip(
        parse(flags.cnn_filters),
        parse(flags.cnn_kernel_size),
        parse(flags.cnn_act),
        parse(flags.cnn_dilation_rate),
        parse(flags.cnn_strides))
    for filters, kernel_size, activation, dilation_rate, strides in conv_specs:
        conv = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides)
        net = Stream(cell=conv)(net)

    flatten = Stream(cell=tf.keras.layers.Flatten())
    net = flatten(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Fully connected stack configured by units2/act2.
    dense_specs = zip(parse(flags.units2), parse(flags.act2))
    for units, activation in dense_specs:
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        net = dense(net)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, outputs)
Beispiel #23
0
def model(flags):
    """Build the Gated Recurrent Unit (GRU) model.

    It is based on the papers
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword
    Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
    Hello Edge: Keyword Spotting on Microcontrollers
    https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: features are computed inside the graph, so
        # the user only needs to feed raw audio.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # One GRU layer per entry of the parsed gru_units/return_sequences.
    gru_specs = zip(parse(flags.gru_units), parse(flags.return_sequences))
    for units, return_sequences in gru_specs:
        recurrent = GRU(
            units=units,
            return_sequences=return_sequences,
            stateful=flags.stateful)
        net = recurrent(net)

    flatten = Stream(cell=tf.keras.layers.Flatten())
    net = flatten(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Fully connected stack configured by units1/act1.
    dense_specs = zip(parse(flags.units1), parse(flags.act1))
    for units, activation in dense_specs:
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        net = dense(net)

    outputs = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, outputs)
Beispiel #24
0
def model(flags):
  """Build the Mobilenet model.

  It is based on the paper:
  MobileNets: Efficient Convolutional Neural Networks for
     Mobile Vision Applications https://arxiv.org/abs/1704.04861
  It is applied on a sequence in time, so only 1D filters are applied.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # Self-contained model: features are computed inside the graph, so the
    # user only needs to feed raw audio.
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    net = speech_features.SpeechFeatures(feature_params)(net)

  # Assuming net is [batch, time, feature] here, inserting axis 2 gives
  # [batch, time, 1, feature].
  net = tf.keras.backend.expand_dims(net, axis=2)

  # Initial convolutional block: conv -> batch norm -> ReLU capped at 6.
  first_conv = tf.keras.layers.Conv2D(
      filters=flags.cnn1_filters,
      kernel_size=utils.parse(flags.cnn1_kernel_size),
      padding='valid',
      use_bias=False,
      strides=utils.parse(flags.cnn1_strides))
  net = first_conv(net)
  net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
  net = tf.keras.layers.ReLU(6.)(net)

  block_specs = zip(
      utils.parse(flags.ds_kernel_size),
      utils.parse(flags.ds_strides),
      utils.parse(flags.cnn_filters))
  for kernel_size, strides, filters in block_specs:
    # Depthwise part: 'same' padding only when the stride is exactly (1, 1).
    if strides == (1, 1):
      depthwise_padding = 'same'
    else:
      depthwise_padding = 'valid'
    depthwise = tf.keras.layers.DepthwiseConv2D(
        kernel_size,
        padding=depthwise_padding,
        depth_multiplier=1,
        strides=strides,
        use_bias=False)
    net = depthwise(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)

    # Pointwise part: 1x1 convolution that sets the number of filters.
    pointwise = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=(1, 1),
        padding='same',
        use_bias=False,
        strides=(1, 1))
    net = pointwise(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)

  # Global pooling collapses the two middle axes: [batch, filters].
  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  outputs = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    outputs = tf.keras.layers.Activation('softmax')(outputs)
  # [batch, label_count]
  return tf.keras.Model(input_audio, outputs)
Beispiel #25
0
def model(flags):
    """Build the Temporal Convolution ResNet model.

    It is based on the paper:
    Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
    https://arxiv.org/pdf/1904.03814.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    def linear_conv(tensor, filters, kernel_size, strides):
        # Conv2D with 'same' padding and no non-linearity.
        return tf.keras.layers.Conv2D(filters=filters,
                                      kernel_size=kernel_size,
                                      strides=strides,
                                      padding='same',
                                      activation='linear')(tensor)

    def batch_norm(tensor):
        # Batch normalization configured entirely from flags.
        return tf.keras.layers.BatchNormalization(
            momentum=flags.bn_momentum,
            center=flags.bn_center,
            scale=flags.bn_scale,
            renorm=flags.bn_renorm)(tensor)

    def relu(tensor):
        return tf.keras.layers.Activation('relu')(tensor)

    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: features are computed inside the graph, so
        # the user only needs to feed raw audio.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    time_size, feature_size = net.shape[1:3]
    channels = utils.parse(flags.channels)

    net = tf.keras.backend.expand_dims(net)

    if flags.debug_2d:
        conv_kernel = first_conv_kernel = (3, 3)
    else:
        # Move features into the channel axis: [batch, time, 1, feature].
        net = tf.reshape(net, [-1, time_size, 1, feature_size])
        first_conv_kernel = (3, 1)
        conv_kernel = utils.parse(flags.kernel_size)

    # Stem: conv -> batch norm -> relu with the first channel count.
    net = linear_conv(net, channels[0], first_conv_kernel, 1)
    net = batch_norm(net)
    net = relu(net)

    pool_size = utils.parse(flags.pool_size)
    if pool_size:
        net = tf.keras.layers.AveragePooling2D(
            pool_size=pool_size, strides=flags.pool_stride)(net)

    # Residual blocks, one per remaining channel count.
    for n in channels[1:]:
        if n == net.shape[-1]:
            # Channel count unchanged: identity shortcut, no striding.
            shortcut = net
            stride = 1
        else:
            # Channel count changes: strided 1x1 projection on the shortcut.
            stride = 2
            shortcut = linear_conv(net, n, 1, stride)
            shortcut = batch_norm(shortcut)
            shortcut = relu(shortcut)

        net = linear_conv(net, n, conv_kernel, stride)
        net = batch_norm(net)
        net = relu(net)

        net = linear_conv(net, n, conv_kernel, 1)
        net = batch_norm(net)

        # Residual connection followed by the block's final activation.
        net = tf.keras.layers.Add()([net, shortcut])
        net = relu(net)

    # Average over the full remaining extent of axes 1 and 2.
    net = tf.keras.layers.AveragePooling2D(pool_size=net.shape[1:3],
                                           strides=1)(net)

    net = tf.keras.layers.Dropout(rate=flags.dropout)(net)

    # Fully connected layer expressed as a 1x1 convolution.
    net = linear_conv(net, flags.label_count, 1, 1)

    net = tf.reshape(net, shape=(-1, net.shape[3]))
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Beispiel #26
0
def model(flags):
    """Build the Inception model.

    It is based on the paper:
    Rethinking the Inception Architecture for Computer Vision
        http://arxiv.org/abs/1512.00567

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: features are computed inside the graph, so
        # the user only needs to feed raw audio.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # Append a trailing channel axis so 2D layers can be applied.
    net = tf.keras.backend.expand_dims(net, axis=-1)

    # Stem: separable conv -> batch norm -> relu -> max pool per entry.
    for stem_filters in utils.parse(flags.cnn_filters0):
        separable = tf.keras.layers.SeparableConv2D(
            stem_filters, (3, 3), padding='valid', use_bias=False)
        net = separable(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.Activation('relu')(net)
        net = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(net)

    # Two more 'valid' convolutions reuse the last stem filter count.
    last_stem_filters = utils.parse(flags.cnn_filters0)[-1]
    for kernel in ((3, 1), (1, 3)):
        net = utils.conv2d_bn(net,
                              last_stem_filters,
                              kernel,
                              padding='valid',
                              scale=flags.bn_scale)

    block_specs = zip(
        utils.parse(flags.cnn_strides),
        utils.parse(flags.cnn_filters1),
        utils.parse(flags.cnn_filters2))
    for stride, filters1, filters2 in block_specs:

        if stride > 1:
            net = tf.keras.layers.MaxPooling2D((3, 3), strides=stride)(net)

        # Branch 1: a single 1x1 convolution.
        branch1 = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale)

        # Branch 2: 1x1 followed by a factorized 3x1 / 1x3 pair.
        branch2 = net
        for branch_filters, kernel in ((filters1, (1, 1)),
                                       (filters1, (3, 1)),
                                       (filters2, (1, 3))):
            branch2 = utils.conv2d_bn(branch2,
                                      branch_filters,
                                      kernel,
                                      scale=flags.bn_scale)

        # Branch 3: 1x1 followed by two factorized 3x1 / 1x3 pairs.
        branch3 = net
        for branch_filters, kernel in ((filters1, (1, 1)),
                                       (filters1, (3, 1)),
                                       (filters1, (1, 3)),
                                       (filters1, (3, 1)),
                                       (filters2, (1, 3))):
            branch3 = utils.conv2d_bn(branch3,
                                      branch_filters,
                                      kernel,
                                      scale=flags.bn_scale)

        # Branch 4: average pooling followed by a 1x1 convolution.
        pooled = tf.keras.layers.AveragePooling2D(
            (3, 3), strides=(1, 1), padding='same')(net)
        branch4 = utils.conv2d_bn(pooled,
                                  filters2, (1, 1),
                                  scale=flags.bn_scale)

        # All four branches are built with filters2 as the final count.
        net = tf.keras.layers.concatenate([branch1, branch2, branch3, branch4])

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    outputs = tf.keras.layers.Dense(flags.label_count)(net)
    return tf.keras.Model(input_audio, outputs)
Beispiel #27
0
def model(flags):
    """Build the BiRNN attention model.

    It is based on the paper:
    A neural attention model for speech command recognition
    https://arxiv.org/pdf/1808.08929.pdf

    Depending on the parameter rnn_type, the model can be biLSTM or biGRU.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training

    Raises:
      ValueError: if flags.rnn_type is neither 'lstm' nor 'gru'.
    """
    # Map the supported rnn_type values to tf.keras.layers class names and
    # validate first, so an unsupported type fails with a clear error before
    # any layer is created.
    rnn_layer_names = {'lstm': 'LSTM', 'gru': 'GRU'}
    if flags.rnn_type not in rnn_layer_names:
        raise ValueError('not supported RNN type %s' % flags.rnn_type)
    rnn = getattr(tf.keras.layers, rnn_layer_names[flags.rnn_type])

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(input_audio)

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = tf.keras.layers.Conv2D(filters=filters,
                                     kernel_size=kernel_size,
                                     activation=activation,
                                     dilation_rate=dilation_rate,
                                     strides=strides,
                                     padding='same')(net)
        net = tf.keras.layers.BatchNormalization()(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    # dims: [batch, time, feature]
    for _ in range(flags.rnn_layers):
        net = tf.keras.layers.Bidirectional(
            rnn(flags.rnn_units, return_sequences=True, unroll=True))(net)
    feature_dim = net.shape[-1]
    middle = net.shape[1] // 2  # index of middle point of sequence

    # feature vector at middle point [batch, feature]
    mid_feature = net[:, middle, :]
    # apply one projection layer with the same dim as input feature
    query = tf.keras.layers.Dense(feature_dim)(mid_feature)

    # attention weights [batch, time]
    att_weights = tf.keras.layers.Dot(axes=[1, 2])([query, net])
    att_weights = tf.keras.layers.Softmax(name='attSoftmax')(att_weights)

    # apply attention weights [batch, feature]
    net = tf.keras.layers.Dot(axes=[1, 1])([att_weights, net])

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # fully connected stack configured by units2/act2
    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
Beispiel #28
0
def model(flags):
    """Build the Inception model.

    It is based on the paper:
    Rethinking the Inception Architecture for Computer Vision
        http://arxiv.org/abs/1512.00567

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: features are computed inside the graph, so
        # the user only needs to feed raw audio.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # [batch, time, feature] -> [batch, time, 1, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)

    # Stem: (kernel_size, 1) convolutions, max-pooled when strided.
    stem_specs = zip(
        utils.parse(flags.cnn1_strides),
        utils.parse(flags.cnn1_filters),
        utils.parse(flags.cnn1_kernel_sizes))
    for stride, filters, kernel_size in stem_specs:
        net = utils.conv2d_bn(net,
                              filters, (kernel_size, 1),
                              padding='valid',
                              scale=flags.bn_scale)
        if stride > 1:
            pool = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))
            net = pool(net)

    block_specs = zip(
        utils.parse(flags.cnn2_strides),
        utils.parse(flags.cnn2_filters1),
        utils.parse(flags.cnn2_filters2),
        utils.parse(flags.cnn2_kernel_sizes))
    for stride, filters1, filters2, kernel_size in block_specs:
        time_kernel = (kernel_size, 1)

        # Branch 1: a single 1x1 convolution.
        branch1 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)

        # Branch 2: 1x1 then one convolution along axis 1.
        branch2 = net
        for kernel in ((1, 1), time_kernel):
            branch2 = utils.conv2d_bn(branch2,
                                      filters1,
                                      kernel,
                                      scale=flags.bn_scale)

        # Branch 3: 1x1 then two convolutions along axis 1.
        branch3 = net
        for kernel in ((1, 1), time_kernel, time_kernel):
            branch3 = utils.conv2d_bn(branch3,
                                      filters1,
                                      kernel,
                                      scale=flags.bn_scale)

        # Join the three branches, then project to filters2 with a 1x1 conv.
        net = tf.keras.layers.concatenate([branch1, branch2, branch3])
        net = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale)

        if stride > 1:
            pool = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))
            net = pool(net)

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    outputs = tf.keras.layers.Dense(flags.label_count)(net)
    if flags.return_softmax:
        outputs = tf.keras.layers.Activation('softmax')(outputs)
    return tf.keras.Model(input_audio, outputs)
Beispiel #29
0
def model(flags):
    """Build the Xception model.

    It is based on the paper:
    Xception: Deep Learning with Depthwise Separable Convolutions
        https://arxiv.org/abs/1610.02357

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(
        shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: features are computed inside the graph, so
        # the user only needs to feed raw audio.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # [batch, time, feature] -> [batch, time, feature, 1]
    net = tf.keras.backend.expand_dims(net, axis=-1)

    # Conv block: conv -> batch norm -> relu per entry of the cnn1_* flags.
    conv_specs = zip(
        parse(flags.cnn1_kernel_size),
        parse(flags.cnn1_strides),
        parse(flags.cnn1_filters))
    for kernel_size, stride, conv_filters in conv_specs:
        conv = tf.keras.layers.Conv2D(
            conv_filters, kernel_size, strides=stride, use_bias=False)
        net = conv(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.Activation('relu')(net)

    # First residual block: strided 1x1 projection on the shortcut, separable
    # conv plus strided max pooling on the main path.
    for block_filters in parse(flags.cnn2_filters):
        shortcut = tf.keras.layers.Conv2D(
            block_filters, (1, 1),
            strides=(2, 2),
            padding='same',
            use_bias=False)(net)
        shortcut = tf.keras.layers.BatchNormalization(
            scale=flags.bn_scale)(shortcut)

        separable = tf.keras.layers.SeparableConv2D(
            block_filters, (3, 3), padding='same', use_bias=False)
        net = separable(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.MaxPooling2D(
            (3, 3), strides=(2, 2), padding='same')(net)
        net = tf.keras.layers.add([net, shortcut])

    # Second residual block: identity shortcut around three
    # relu -> separable conv -> batch norm stages.
    last_filters = parse(flags.cnn2_filters)[-1]
    for _ in range(flags.cnn3_blocks):
        shortcut = net
        for _stage in range(3):
            net = tf.keras.layers.Activation('relu')(net)
            separable = tf.keras.layers.SeparableConv2D(
                last_filters, (3, 3), padding='same', use_bias=False)
            net = separable(net)
            net = tf.keras.layers.BatchNormalization(
                scale=flags.bn_scale)(net)
        net = tf.keras.layers.add([net, shortcut])

    # Global pooling collapses the two middle axes: [batch, filters].
    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    outputs = tf.keras.layers.Dense(flags.label_count)(net)
    # [batch, label_count]
    return tf.keras.Model(input_audio, outputs)
Beispiel #30
0
def model(flags):
  """Builds the Inception ResNet Keras model.

  The architecture follows the paper:
  Inception-v4, Inception-ResNet and the Impact of
     Residual Connections on Learning https://arxiv.org/abs/1602.07261

  Layout of the network, in order:
    1. optional SpeechFeatures layer (only when flags.preprocess == 'raw');
    2. one utils.conv2d_bn per entry of the flags.cnn1_* lists, followed by
       max pooling over time whenever the entry's stride is above 1;
    3. one inception_resnet_block plus a 1x1 utils.conv2d_bn per entry of the
       flags.cnn2_* lists, again with optional max pooling over time;
    4. global average pooling, dropout, a dense layer with
       flags.label_count units and, if flags.return_softmax is set, softmax.

  Args:
    flags: data/model parameters. Attributes read here: batch_size,
      preprocess, bn_scale, dropout, label_count, return_softmax,
      cnn1_filters, cnn1_kernel_sizes, cnn1_strides, cnn2_strides,
      cnn2_scales, cnn2_filters_branch0, cnn2_filters_branch1,
      cnn2_filters_branch2, cnn2_kernel_sizes.

  Returns:
    Keras model for training
  """
  input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=input_shape, batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # Self contained model: the caller feeds raw audio only and feature
    # extraction happens inside the graph.
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    net = speech_features.SpeechFeatures(feature_params)(net)

  # [batch, time, feature] -> [batch, time, 1, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)

  # First stage: plain convolutions over the time axis.
  # zip() stops at the shortest of the parsed lists.
  stem_filters = utils.parse(flags.cnn1_filters)
  stem_kernel_sizes = utils.parse(flags.cnn1_kernel_sizes)
  stem_strides = utils.parse(flags.cnn1_strides)
  for num_filters, kernel, pool_stride in zip(stem_filters, stem_kernel_sizes,
                                              stem_strides):
    net = utils.conv2d_bn(
        net, num_filters, (kernel, 1), scale=flags.bn_scale, padding='valid')
    if pool_stride > 1:
      time_pool = tf.keras.layers.MaxPooling2D((3, 1),
                                               strides=(pool_stride, 1))
      net = time_pool(net)
    # [batch, time, 1, filters]

  # Second stage: inception resnet blocks, each followed by a 1x1 convolution
  # that sets the channel count to the branch2 filter value.
  block_strides = utils.parse(flags.cnn2_strides)
  block_scales = utils.parse(flags.cnn2_scales)
  block_filters0 = utils.parse(flags.cnn2_filters_branch0)
  block_filters1 = utils.parse(flags.cnn2_filters_branch1)
  block_filters2 = utils.parse(flags.cnn2_filters_branch2)
  block_kernel_sizes = utils.parse(flags.cnn2_kernel_sizes)
  block_specs = zip(block_strides, block_scales, block_filters0,
                    block_filters1, block_filters2, block_kernel_sizes)
  for pool_stride, res_scale, branch0, branch1, branch2, kernel in block_specs:
    net = inception_resnet_block(
        net, res_scale, branch0, branch1, kernel, bn_scale=flags.bn_scale)
    net = utils.conv2d_bn(
        net, branch2, (1, 1), scale=flags.bn_scale, padding='valid')
    if pool_stride > 1:
      time_pool = tf.keras.layers.MaxPooling2D((3, 1),
                                               strides=(pool_stride, 1),
                                               padding='valid')
      net = time_pool(net)
    # [batch, time, 1, filters]

  # Classification head.
  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    softmax = tf.keras.layers.Activation('softmax')
    net = softmax(net)
  return tf.keras.Model(input_audio, net)