Ejemplo n.º 1
0
    def setUp(self):
        """Prepare the shared fixture: a random signal and its reference frames.

        Stores the framing configuration and a seeded random input signal on
        the test case, builds a Keras model around a DataFrame layer created
        with Modes.TRAINING, and keeps the model prediction for the whole
        signal in self.output_frames_tf.
        """
        super(FrameTestBase, self).setUp()

        # framing configuration
        self.frame_size = 7
        self.frame_step = 5
        self.inference_batch_size = 1

        # input signal: seed is fixed, so the data are reproducible
        np.random.seed(1)
        self.data_size = 33
        self.signal = np.random.rand(self.inference_batch_size, self.data_size)

        # non streaming framer (created with Modes.TRAINING)
        framer = dataframe.DataFrame(
            mode=Modes.TRAINING,
            inference_batch_size=self.inference_batch_size,
            frame_size=self.frame_size,
            frame_step=self.frame_step)

        # the model input is the complete signal: data_size samples at once
        full_signal_input = tf.keras.layers.Input(
            shape=(self.data_size, ),
            batch_size=self.inference_batch_size,
            dtype=tf.float32)
        framed_output = framer(inputs=full_signal_input)
        self.model_tf = tf.keras.models.Model(full_signal_input, framed_output)

        # reference frames computed in one shot (no streaming here)
        self.output_frames_tf = self.model_tf.predict(self.signal)
Ejemplo n.º 2
0
def model(flags):
    """Fully connected layer based model on raw wav data.

    It is based on paper (with added pooling and raw audio data):
    SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
    https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf

    Args:
      flags: data/model parameters. The attributes read here are:
        preprocess, desired_samples, batch_size, window_size_samples,
        window_stride_samples, units1, act1, pool_size, strides, dropout1,
        units2, act2, label_count and return_softmax.

    Returns:
      Keras model for training

    Raises:
      ValueError: if flags.preprocess is not 'raw'.
    """

    # This model frames the raw waveform itself, so any other preprocessing
    # mode is a configuration error and must stop model construction.
    if flags.preprocess != 'raw':
        raise ValueError('input audio has to be raw, but get %s' %
                         (flags.preprocess, ))

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    # split the waveform into frames
    net = dataframe.DataFrame(
        frame_size=flags.window_size_samples,
        frame_step=flags.window_stride_samples)(input_audio)

    # first stack of dense layers, applied before flattening
    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)

    # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
    if flags.pool_size > 1:
        # add fake dim for compatibility with pooling
        net = tf.keras.backend.expand_dims(net, axis=-1)
        net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size,
                                        strides=flags.strides,
                                        data_format='channels_last')(net)
        # remove fake dim
        net = tf.keras.backend.squeeze(net, axis=-1)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # second stack of dense layers, applied on the flattened features
    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    # output layer with one unit per label; softmax is optional
    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Ejemplo n.º 3
0
    def test_tf_non_streaming_vs_streaming_internal_state(self):
        """Streamed frames (internal state mode) must equal the reference frames.

        The signal is fed to a streaming DataFrame model in chunks of
        frame_step samples, and every produced frame is compared with the
        corresponding frame in self.output_frames_tf.
        """
        # streaming framer, created with Modes.STREAM_INTERNAL_STATE_INFERENCE
        stream_framer = dataframe.DataFrame(
            mode=Modes.STREAM_INTERNAL_STATE_INFERENCE,
            inference_batch_size=self.inference_batch_size,
            frame_size=self.frame_size,
            frame_step=self.frame_step)

        # the streaming model gets frame_step new samples on every call
        step_input = tf.keras.layers.Input(
            shape=(self.frame_step, ),
            batch_size=self.inference_batch_size,
            dtype=tf.float32)
        step_output = stream_framer(step_input)
        model_stream = tf.keras.models.Model(step_input, step_output)

        # initial framer state: frame_step zeros followed by the first
        # (frame_size - frame_step) samples of the signal
        overlap = stream_framer.frame_size - stream_framer.frame_step
        zero_pad = np.zeros(shape=(1, stream_framer.frame_step),
                            dtype=np.float32)
        state_init = np.concatenate((zero_pad, self.signal[:, 0:overlap]),
                                    axis=1)
        stream_framer.set_weights([state_init])

        # feed the rest of the signal chunk by chunk, collecting one
        # prediction per chunk
        streamed_frames = []
        start, end = self.frame_size - self.frame_step, self.frame_size
        while end <= self.data_size:
            chunk = self.signal[:, start:end]
            streamed_frames.append(model_stream.predict(chunk))
            start, end = end, end + self.frame_step

        # every reference frame has to match the frame produced by streaming
        for i, expected_frame in enumerate(self.output_frames_tf[0]):
            self.assertAllEqual(streamed_frames[i][0][0], expected_frame)
Ejemplo n.º 4
0
  def build(self, input_shape):
    """Create the sub-layers used for speech feature extraction.

    Every optional stage (noise, preemphasis, windowing, DCT, SpecAugment)
    is set to an identity Lambda layer when it is disabled, so each
    attribute assigned here is always a layer.

    Args:
      input_shape: input shape, passed on to the parent class build().
    """
    super(SpeechFeatures, self).build(input_shape)

    params = self.params
    is_training = self.mode == modes.Modes.TRAINING

    # framing of the input signal
    self.data_frame = dataframe.DataFrame(
        mode=self.mode,
        inference_batch_size=self.inference_batch_size,
        frame_size=self.frame_size,
        frame_step=self.frame_step)

    # gaussian noise is added only in training mode with non zero scale
    self.add_noise = (
        tf.keras.layers.GaussianNoise(stddev=self.noise_scale)
        if self.noise_scale != 0.0 and is_training
        else tf.keras.layers.Lambda(lambda x: x))

    # preemphasis is skipped when its coefficient is zero
    self.preemphasis = (
        preemphasis.Preemphasis(preemph=params['preemph'])
        if params['preemph'] != 0.0
        else tf.keras.layers.Lambda(lambda x: x))

    # windowing is skipped when no window type is specified
    self.windowing = (
        windowing.Windowing(
            window_size=self.frame_size, window_type=params['window_type'])
        if params['window_type'] is not None
        else tf.keras.layers.Lambda(lambda x: x))

    # With use_tf_fft=False the Real Discrete Fourier Transformation (RDFT)
    # is used; it is slower than RFFT, so properties of mel spectrum are used
    # to make it more efficient: RDFT is computed only on the range of
    # non zero values of mel spectrum, which speeds up computations.
    # With use_tf_fft=True TF RFFT is used; it requires signal length
    # alignment, so mel_non_zero_only is disabled in that case.
    self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
        use_tf_fft=params['use_tf_fft'],
        magnitude_squared=params['fft_magnitude_squared'],
        num_mel_bins=params['mel_num_bins'],
        lower_edge_hertz=params['mel_lower_edge_hertz'],
        upper_edge_hertz=params['mel_upper_edge_hertz'],
        sample_rate=params['sample_rate'],
        mel_non_zero_only=params['mel_non_zero_only'])

    # log of the input, which is clipped from below by log_epsilon
    # (log_epsilon is looked up when the layer is called)
    self.log_max = tf.keras.layers.Lambda(
        lambda x: tf.math.log(tf.math.maximum(x, self.params['log_epsilon'])))

    # DCT is skipped when the number of its features is zero
    self.dct = (
        dct.DCT(num_features=params['dct_num_features'])
        if params['dct_num_features'] != 0
        else tf.keras.layers.Lambda(lambda x: x))

    self.normalizer = normalizer.Normalizer(
        mean=self.mean, stddev=self.stddev)

    # in any inference mode there is no need to add dynamic logic in tf graph
    self.spec_augment = (
        spectrogram_augment.SpecAugment(
            time_masks_number=params['time_masks_number'],
            time_mask_max_size=params['time_mask_max_size'],
            frequency_masks_number=params['frequency_masks_number'],
            frequency_mask_max_size=params['frequency_mask_max_size'])
        if params['use_spec_augment'] and is_training
        else tf.keras.layers.Lambda(lambda x: x))