Ejemplo n.º 1
0
    def init(self, shape=(8, 2), flat_dim="time"):
        """Build a non-streaming flattening model and run it on random data.

        Populates `self.batch_size`, `self.inputs`, `self.model_train` and
        `self.outputs`, and prints the model summary.

        Args:
          shape: per-example input shape (without the batch dimension).
          flat_dim: when equal to "time", the input is flattened by a
            `Stream`-wrapped Keras `Flatten` layer; for any other value the
            input is reshaped so that dimensions 2 and 3 are merged.
            NOTE(review): the reshape branch reads `shape[3]` of the input
            tensor, so it presumably needs a `shape` with at least three
            entries - TODO confirm against callers.

        Returns:
          Predictions of the non-streaming model on `self.inputs`.
        """
        self.batch_size = 1

        # symbolic model input with a fixed batch size
        net_in = tf.keras.layers.Input(
            shape=shape, batch_size=self.batch_size, name="inp1")

        # random test data laid out as (batch,) + shape
        self.inputs = np.random.uniform(size=(self.batch_size, ) + shape)

        # non streamable trainable model
        # NOTE: both branches stand in for the previously used
        # `flatten.Flatten(mode=mode, flat_dim=flat_dim)` layer.
        training_mode = Modes.TRAINING
        if flat_dim == "time":
            flatten_layer = Stream(cell=tf.keras.layers.Flatten(),
                                   mode=training_mode)
            net_out = flatten_layer(net_in)
        else:
            # keep dim 1, merge dims 2 and 3 into a single dimension
            merged_shape = (-1, net_in.shape[1],
                            net_in.shape[2] * net_in.shape[3])
            net_out = tf.reshape(net_in, merged_shape)

        self.model_train = tf.keras.Model(net_in, net_out)
        self.model_train.summary()

        # reference outputs produced by the non streaming model
        self.outputs = self.model_train.predict(self.inputs)
        return self.outputs
Ejemplo n.º 2
0
def model(flags):
    """Build the Temporal Convolution ResNet model.

    It is based on paper:
    Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
    https://arxiv.org/pdf/1904.03814.pdf

    Structure, as assembled below: optional speech feature extraction,
    a first convolution + batch norm + ReLU, optional average pooling,
    a sequence of residual blocks, average pooling over the remaining
    two middle dimensions, dropout, a 1x1 convolution acting as the fully
    connected layer, and an optional softmax.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    # Small layer factories; each one creates and immediately applies the
    # layer, so layers are instantiated in the order they appear in the graph.
    def conv_linear(tensor, filters, kernel_size, strides):
        return tf.keras.layers.Conv2D(filters=filters,
                                      kernel_size=kernel_size,
                                      strides=strides,
                                      padding='same',
                                      activation='linear')(tensor)

    def batch_norm(tensor):
        return tf.keras.layers.BatchNormalization(
            momentum=flags.bn_momentum,
            center=flags.bn_center,
            scale=flags.bn_scale,
            renorm=flags.bn_renorm)(tensor)

    def relu(tensor):
        return tf.keras.layers.Activation('relu')(tensor)

    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(shape=input_shape,
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # self contained model: the user feeds raw audio only and the
        # features are computed inside the model
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    time_size, feature_size = net.shape[1:3]
    channels = utils.parse(flags.channels)
    net = tf.keras.backend.expand_dims(net)

    if flags.debug_2d:
        # debug mode: keep the expanded tensor and use 3x3 kernels everywhere
        first_conv_kernel = (3, 3)
        conv_kernel = (3, 3)
    else:
        # [batch, time, 1, feature]
        net = tf.reshape(net, [-1, time_size, 1, feature_size])
        first_conv_kernel = (3, 1)
        conv_kernel = utils.parse(flags.kernel_size)

    # first convolution block
    net = conv_linear(net, channels[0], first_conv_kernel, 1)
    net = batch_norm(net)
    net = relu(net)

    if utils.parse(flags.pool_size):
        pooling = tf.keras.layers.AveragePooling2D(
            pool_size=utils.parse(flags.pool_size),
            strides=flags.pool_stride)
        net = pooling(net)

    # residual blocks, one per remaining entry of `channels`
    for block_filters in channels[1:]:
        if block_filters != net.shape[-1]:
            # channel count changes: downsample with stride 2 and project
            # the shortcut with a 1x1 convolution so both branches match
            stride = 2
            shortcut = conv_linear(net, block_filters, 1, stride)
            shortcut = batch_norm(shortcut)
            shortcut = relu(shortcut)
        else:
            # same channel count: identity shortcut, no downsampling
            shortcut = net
            stride = 1

        net = conv_linear(net, block_filters, conv_kernel, stride)
        net = batch_norm(net)
        net = relu(net)

        net = conv_linear(net, block_filters, conv_kernel, 1)
        net = batch_norm(net)

        # residual connection
        net = tf.keras.layers.Add()([net, shortcut])
        net = relu(net)

    # average over everything left in dimensions 1 and 2
    net = tf.keras.layers.AveragePooling2D(pool_size=net.shape[1:3],
                                           strides=1)(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout)(net)

    # fully connected layer, expressed as a 1x1 convolution
    net = conv_linear(net, flags.label_count, 1, 1)

    # drop the pooled dimensions, keeping one row of outputs per example
    net = tf.reshape(net, shape=(-1, net.shape[3]))
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Ejemplo n.º 3
0
def model(flags):
  """Build the MatchboxNet model.

  It is based on paper
  MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network
  Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531.pdf

  The encoder is a stack of `resnet_block` calls, one per entry of the
  parsed `ds_*` flag lists; the decoder averages over the two middle
  dimensions, flattens to one row per example and applies a dense layer
  (plus an optional softmax).

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any of input list has different length from any other;
    or if padding is not supported (that check is not performed in this
    function - presumably it happens inside `resnet_block`)
  """

  ds_filters = parse(flags.ds_filters)
  ds_repeat = parse(flags.ds_repeat)
  ds_kernel_size = parse(flags.ds_kernel_size)
  ds_stride = parse(flags.ds_stride)
  ds_dilation = parse(flags.ds_dilation)
  ds_residual = parse(flags.ds_residual)

  # every per-block setting must provide exactly one value per block
  per_block_settings = (ds_repeat, ds_kernel_size, ds_stride, ds_dilation,
                        ds_residual)
  if any(len(ds_filters) != len(values) for values in per_block_settings):
    raise ValueError('all input lists have to be the same length')

  input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=input_shape, batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # self contained model: the user feeds raw audio only and the
    # features are computed inside the model
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    net = speech_features.SpeechFeatures(feature_params)(net)

  time_size, feature_size = net.shape[1:3]

  net = tf.keras.backend.expand_dims(net)

  # [batch, time, 1, feature]
  net = tf.reshape(net, [-1, time_size, 1, feature_size])

  # encoder
  block_configs = zip(ds_filters, ds_repeat, ds_kernel_size, ds_stride,
                      ds_dilation, ds_residual)
  for (block_filters, block_repeat, block_kernel, block_stride,
       block_dilation, block_residual) in block_configs:
    net = resnet_block(net, block_repeat, block_kernel, block_filters,
                       block_dilation, block_stride, block_residual,
                       flags.padding, flags.dropout, flags.activation)

  # decoder: average over everything left in dimensions 1 and 2
  pooling = tf.keras.layers.AveragePooling2D(
      pool_size=net.shape[1:3], strides=1)
  net = stream.Stream(cell=pooling)(net)

  # drop the pooled dimensions, keeping one feature row per example
  net = tf.reshape(net, shape=(-1, net.shape[3]))

  net = tf.keras.layers.Dense(units=flags.label_count)(net)

  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)