def testGetFeaturesForWav(self):
    tmp_dir = self.get_temp_dir()
    wav_dir = os.path.join(tmp_dir, "wavs")
    os.mkdir(wav_dir)
    self._SaveWavFolders(wav_dir, ["a", "b", "c"], 1)
    flags = self._GetDefaultFlags()
    flags.preprocess = "average"
    flags.desired_samples = 1600
    flags.train_dir = tmp_dir
    flags.summaries_dir = tmp_dir
    flags.data_dir = wav_dir
    with self.cached_session() as sess:
        audio_processor = input_data.AudioProcessor(flags)
        sample_data = np.zeros([flags.desired_samples, 1])
        # Fill the input with a repeating 0, -1, 0, 1 pattern: a 4 kHz
        # square-like wave at the 16 kHz sample rate used below.
        for i in range(flags.desired_samples):
            phase = i % 4
            if phase == 0:
                sample_data[i, 0] = 0
            elif phase == 1:
                sample_data[i, 0] = -1
            elif phase == 2:
                sample_data[i, 0] = 0
            elif phase == 3:
                sample_data[i, 0] = 1
        test_wav_path = os.path.join(tmp_dir, "test_wav.wav")
        input_data.save_wav_file(test_wav_path, sample_data, 16000)

        results = audio_processor.get_features_for_wav(
            test_wav_path, flags, sess)
        spectrogram = results[0]
        self.assertEqual(1, spectrogram.shape[0])
        self.assertEqual(16, spectrogram.shape[1])
        self.assertEqual(11, spectrogram.shape[2])
        self.assertNear(0, spectrogram[0, 0, 0], 0.1)
        self.assertNear(200, spectrogram[0, 0, 5], 0.1)

def _RunGetDataTest(self, preprocess, window_size_ms):
    tmp_dir = self.get_temp_dir()
    wav_dir = os.path.join(tmp_dir, "wavs")
    os.mkdir(wav_dir)
    self._SaveWavFolders(wav_dir, ["a", "b", "c"], 100)
    background_dir = os.path.join(wav_dir, "_background_noise_")
    os.mkdir(background_dir)
    wav_data = self._GetWavData()
    for i in range(10):
        file_path = os.path.join(background_dir,
                                 "background_audio_%d.wav" % i)
        self._SaveTestWavFile(file_path, wav_data)
    flags = self._GetDefaultFlags()
    flags.window_size_ms = window_size_ms
    flags.preprocess = preprocess
    flags.train_dir = tmp_dir
    flags.data_dir = wav_dir
    flags = model_flags.update_flags(flags)
    with self.cached_session() as sess:
        audio_processor = input_data.AudioProcessor(flags)
        result_data, result_labels = audio_processor.get_data(
            10, 0, flags, 0.3, 0.1, 100, "training", 0.0, sess)

        self.assertLen(result_data, 10)
        self.assertLen(result_labels, 10)

def testPrepareProcessingGraph(self):
    tmp_dir = self.get_temp_dir()
    wav_dir = os.path.join(tmp_dir, "wavs")
    os.mkdir(wav_dir)
    self._SaveWavFolders(wav_dir, ["a", "b", "c"], 100)
    background_dir = os.path.join(wav_dir, "_background_noise_")
    os.mkdir(background_dir)
    wav_data = self._GetWavData()
    for i in range(10):
        file_path = os.path.join(background_dir,
                                 "background_audio_%d.wav" % i)
        self._SaveTestWavFile(file_path, wav_data)
    flags = self._GetDefaultFlags()
    flags.train_dir = tmp_dir
    flags.summaries_dir = tmp_dir
    flags.data_dir = wav_dir
    audio_processor = input_data.AudioProcessor(flags)

    self.assertIsNotNone(audio_processor.wav_filename_placeholder_)
    self.assertIsNotNone(audio_processor.foreground_volume_placeholder_)
    self.assertIsNotNone(audio_processor.time_shift_padding_placeholder_)
    self.assertIsNotNone(audio_processor.time_shift_offset_placeholder_)
    self.assertIsNotNone(audio_processor.background_data_placeholder_)
    self.assertIsNotNone(audio_processor.background_volume_placeholder_)
    self.assertIsNotNone(audio_processor.output_)

def testPrepareDataIndexEmpty(self):
    tmp_dir = self.get_temp_dir()
    self._SaveWavFolders(tmp_dir, ["a", "b", "c"], 0)
    with self.assertRaises(Exception) as e:
        flags = self._GetDefaultFlags()
        flags.train_dir = tmp_dir
        flags.summaries_dir = tmp_dir
        flags.data_dir = tmp_dir
        _ = input_data.AudioProcessor(flags)
    self.assertIn("No .wavs found", str(e.exception))

def testPrepareDataIndexMissing(self):
    tmp_dir = self.get_temp_dir()
    self._SaveWavFolders(tmp_dir, ["a", "b", "c"], 100)
    with self.assertRaises(Exception) as e:
        flags = self._GetDefaultFlags()
        flags.train_dir = tmp_dir
        flags.summaries_dir = tmp_dir
        flags.data_dir = tmp_dir
        flags.wanted_words = "a,b,d"
        _ = input_data.AudioProcessor(flags)
    self.assertIn("Expected to find", str(e.exception))

def testGetUnprocessedData(self):
    tmp_dir = self.get_temp_dir()
    wav_dir = os.path.join(tmp_dir, "wavs")
    os.mkdir(wav_dir)
    self._SaveWavFolders(wav_dir, ["a", "b", "c"], 100)
    flags = self._GetDefaultFlags()
    flags.train_dir = tmp_dir
    flags.summaries_dir = tmp_dir
    flags.data_dir = wav_dir
    audio_processor = input_data.AudioProcessor(flags)
    result_data, result_labels = audio_processor.get_unprocessed_data(
        10, flags, "training")
    self.assertLen(result_data, 10)
    self.assertLen(result_labels, 10)

def testPrepareDataIndex(self):
    tmp_dir = self.get_temp_dir()
    self._SaveWavFolders(tmp_dir, ["a", "b", "c"], 100)
    flags = self._GetDefaultFlags()
    flags.train_dir = tmp_dir
    flags.summaries_dir = tmp_dir
    flags.data_dir = tmp_dir
    audio_processor = input_data.AudioProcessor(flags)
    self.assertLess(0, audio_processor.set_size("training"))
    self.assertIn("training", audio_processor.data_index)
    self.assertIn("validation", audio_processor.data_index)
    self.assertIn("testing", audio_processor.data_index)
    self.assertEqual(input_data.UNKNOWN_WORD_INDEX,
                     audio_processor.word_to_index["c"])

def testPrepareBackgroundData(self):
    tmp_dir = self.get_temp_dir()
    background_dir = os.path.join(tmp_dir, "_background_noise_")
    os.mkdir(background_dir)
    wav_data = self._GetWavData()
    for i in range(10):
        file_path = os.path.join(background_dir,
                                 "background_audio_%d.wav" % i)
        self._SaveTestWavFile(file_path, wav_data)
    self._SaveWavFolders(tmp_dir, ["a", "b", "c"], 100)
    flags = self._GetDefaultFlags()
    flags.train_dir = tmp_dir
    flags.summaries_dir = tmp_dir
    flags.data_dir = tmp_dir
    audio_processor = input_data.AudioProcessor(flags)
    self.assertLen(audio_processor.background_data, 10)
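
Side note: the per-sample loop in testGetFeaturesForWav above just builds a
repeating 0, -1, 0, 1 pattern. A vectorized NumPy sketch of the same signal
(assuming only numpy, and that desired_samples is divisible by 4):

import numpy as np

desired_samples = 1600
# Tile the 4-sample pattern [0, -1, 0, 1]; this reproduces the per-sample
# loop above (a 4 kHz square-like wave at a 16 kHz sample rate).
sample_data = np.tile([0.0, -1.0, 0.0, 1.0],
                      desired_samples // 4).reshape(-1, 1)
assert sample_data.shape == (desired_samples, 1)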
Example #9
def tflite_non_stream_model_accuracy(
        flags,
        folder,
        tflite_model_name='non_stream.tflite',
        accuracy_name='tflite_non_stream_model_accuracy.txt'):
    """Compute accuracy of non streamable model with TFLite.

  Model has to be converted to TFLite and stored in path+tflite_model_name
  Args:
      flags: model and data settings
      folder: folder name where model is located
      tflite_model_name: file name with tflite model
      accuracy_name: file name for storing accuracy in path + accuracy_name
  Returns:
    accuracy
  """
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)
    path = os.path.join(flags.train_dir, folder)

    audio_processor = input_data.AudioProcessor(flags)

    set_size = audio_processor.set_size('testing')

    interpreter = tf.lite.Interpreter(
        model_path=os.path.join(path, tflite_model_name))
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    # State buffers for each input (unused in this non-streaming path).
    inputs = [
        np.zeros(detail['shape'], dtype=np.float32)
        for detail in input_details
    ]

    total_accuracy = 0.0
    count = 0.0
    inference_batch_size = 1
    for i in range(0, set_size, inference_batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            inference_batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0,
            sess)

        # set input audio data (by default input data at index 0)
        interpreter.set_tensor(input_details[0]['index'],
                               test_fingerprints.astype(np.float32))

        # run inference
        interpreter.invoke()

        # get output: classification
        out_tflite = interpreter.get_tensor(output_details[0]['index'])

        out_tflite_argmax = np.argmax(out_tflite)

        total_accuracy = total_accuracy + (test_ground_truth[0]
                                           == out_tflite_argmax)
        count = count + 1
        if i % 200 == 0 and i:
            logging.info(
                'tflite test accuracy, non stream model = %.2f%% %d out of %d',
                *(total_accuracy * 100 / count, i, set_size))

    total_accuracy = total_accuracy / count
    logging.info(
        'tflite Final test accuracy, non stream model = %.2f%% (N=%d)',
        *(total_accuracy * 100, set_size))

    with open(os.path.join(path, accuracy_name), 'wt') as fd:
        fd.write('%f on set_size %d' % (total_accuracy * 100, set_size))
    return total_accuracy * 100
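
A hedged usage sketch for the function above; the flags object and the
'tflite_non_stream' folder name are assumptions (the folder just has to be
wherever non_stream.tflite was exported under flags.train_dir):

# Hypothetical driver; assumes `flags` was built via model_flags.update_flags
# and that <train_dir>/tflite_non_stream/non_stream.tflite already exists.
accuracy = tflite_non_stream_model_accuracy(flags, folder='tflite_non_stream')
print('TFLite non-stream accuracy: %.2f%%' % accuracy)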
Example #10
def tflite_stream_state_external_model_accuracy(
        flags,
        folder,
        tflite_model_name='stream_state_external.tflite',
        accuracy_name='tflite_stream_state_external_model_accuracy.txt',
        reset_state=False):
    """Compute accuracy of streamable model with external state using TFLite.

  Args:
      flags: model and data settings
      folder: folder name where model is located
      tflite_model_name: file name with tflite model
      accuracy_name: file name for storing accuracy in path + accuracy_name
      reset_state: reset state between testing sequences.
        If True - then it is non streaming testing environment: state will be
          reseted in the beginning of every test sequence and will not be
          transferred to another one (as it is done in real streaming).
  Returns:
    accuracy
  """
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)
    path = os.path.join(flags.train_dir, folder)

    logging.info('tflite stream model state external with reset_state %d',
                 reset_state)

    audio_processor = input_data.AudioProcessor(flags)

    set_size = audio_processor.set_size('testing')

    interpreter = tf.lite.Interpreter(
        model_path=os.path.join(path, tflite_model_name))
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    # Zero-initialized state buffers, one per input tensor.
    inputs = [
        np.zeros(detail['shape'], dtype=np.float32)
        for detail in input_details
    ]

    total_accuracy = 0.0
    count = 0.0
    inference_batch_size = 1
    for i in range(0, set_size, inference_batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            inference_batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0,
            sess)

        # before processing new test sequence we can reset model state
        # if we reset model state then it is not real streaming mode
        if reset_state:
            for s in range(len(input_details)):
                inputs[s] = np.zeros(input_details[s]['shape'],
                                     dtype=np.float32)

        if flags.preprocess == 'raw':
            out_tflite = run_stream_inference_classification_tflite(
                flags, interpreter, test_fingerprints, inputs)
            out_tflite_argmax = np.argmax(out_tflite)
        else:
            for t in range(test_fingerprints.shape[1]):
                # get new frame from stream of data
                stream_update = test_fingerprints[:, t, :]
                stream_update = np.expand_dims(stream_update, axis=1)

                # [batch, time=1, feature]
                stream_update = stream_update.astype(np.float32)

                # set input audio data (by default input data at index 0)
                interpreter.set_tensor(input_details[0]['index'],
                                       stream_update)

                # set input states (index 1...)
                for s in range(1, len(input_details)):
                    interpreter.set_tensor(input_details[s]['index'],
                                           inputs[s])

                # run inference
                interpreter.invoke()

                # get output: classification
                out_tflite = interpreter.get_tensor(output_details[0]['index'])

                # get output states and set it back to input states
                # which will be fed in the next inference cycle
                for s in range(1, len(input_details)):
                    # The function `get_tensor()` returns a copy of the tensor data.
                    # Use `tensor()` in order to get a pointer to the tensor.
                    inputs[s] = interpreter.get_tensor(
                        output_details[s]['index'])

                out_tflite_argmax = np.argmax(out_tflite)

        total_accuracy = total_accuracy + (test_ground_truth[0]
                                           == out_tflite_argmax)
        count = count + 1
        if i % 200 == 0 and i:
            logging.info(
                'tflite test accuracy, stream model state external = %.2f%% %d out of %d',
                *(total_accuracy * 100 / count, i, set_size))

    total_accuracy = total_accuracy / count
    logging.info(
        'tflite Final test accuracy, stream model state external = %.2f%% (N=%d)',
        *(total_accuracy * 100, set_size))

    with open(os.path.join(path, accuracy_name), 'wt') as fd:
        fd.write('%f on set_size %d' % (total_accuracy * 100, set_size))
    return total_accuracy * 100
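
The core of the frame loop above is state feedback: every input tensor except
index 0 holds a state that must be overwritten with the matching output tensor
after each invoke(). A stripped-down sketch of just that pattern (the
interpreter and the frame iterable are assumed to exist):

import numpy as np

def stream_frames_tflite(interpreter, frames):
    """Feeds frames one at a time, carrying state tensors between calls.

    Assumes input 0 is the audio frame and inputs 1..N are states, as in the
    example above; `frames` yields [batch, time=1, feature] arrays.
    """
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    # Zero-initialize all state tensors.
    states = [np.zeros(d['shape'], dtype=np.float32) for d in input_details]
    out = None
    for frame in frames:
        interpreter.set_tensor(input_details[0]['index'],
                               frame.astype(np.float32))
        for s in range(1, len(input_details)):
            interpreter.set_tensor(input_details[s]['index'], states[s])
        interpreter.invoke()
        out = interpreter.get_tensor(output_details[0]['index'])
        # Copy output states back so the next frame continues the stream.
        for s in range(1, len(input_details)):
            states[s] = interpreter.get_tensor(output_details[s]['index'])
    return out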
Example #11
def tf_stream_state_external_model_accuracy(
        flags,
        folder,
        weights_name='best_weights',
        accuracy_name='stream_state_external_model_accuracy_sub_set.txt',
        reset_state=False,
        max_test_samples=1000):
    """Compute accuracy of streamable model with external state using TF.

  Args:
      flags: model and data settings
      folder: folder name where accuracy report will be stored
      weights_name: file name with model weights
      accuracy_name: file name for storing accuracy in path + accuracy_name
      reset_state: reset state between testing sequences.
        If True - then it is non streaming testing environment: state will be
          reseted on every test and will not be transferred to another one (as
          it is done in real streaming).
      max_test_samples: max number of test samples. In this mode model is slow
        with TF because of batch size 1, so accuracy is computed on subset of
        testing data
  Returns:
    accuracy
  """
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)

    audio_processor = input_data.AudioProcessor(flags)
    set_size = audio_processor.set_size('testing')
    set_size = np.minimum(max_test_samples, set_size)
    inference_batch_size = 1
    tf.keras.backend.set_learning_phase(0)
    flags.batch_size = inference_batch_size  # set batch size
    model = models.MODELS[flags.model_name](flags)
    weights_path = os.path.join(flags.train_dir, weights_name)
    model.load_weights(weights_path).expect_partial()
    model_stream = utils.to_streaming_inference(
        model, flags, modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)

    logging.info('tf stream model state external with reset_state %d',
                 reset_state)

    # Zero-initialized state buffers, one per model input.
    inputs = [
        np.zeros(inp.shape, dtype=np.float32) for inp in model_stream.inputs
    ]

    total_accuracy = 0.0
    count = 0.0
    for i in range(0, set_size, inference_batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            inference_batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0,
            sess)

        if reset_state:
            for s in range(len(model_stream.inputs)):
                inputs[s] = np.zeros(model_stream.inputs[s].shape,
                                     dtype=np.float32)

        if flags.preprocess == 'raw':
            start = 0
            end = flags.window_stride_samples
            # iterate over time samples with stride = window_stride_samples
            while end <= test_fingerprints.shape[1]:
                # get new frame from stream of data
                stream_update = test_fingerprints[:, start:end]

                # update indexes of streamed updates
                start = end
                end = start + flags.window_stride_samples

                # set input audio data (by default input data at index 0)
                inputs[0] = stream_update

                # run inference
                outputs = model_stream.predict(inputs)

                # get output states and set it back to input states
                # which will be fed in the next inference cycle
                for s in range(1, len(model_stream.inputs)):
                    inputs[s] = outputs[s]

                stream_output_arg = np.argmax(outputs[0])
        else:
            # iterate over frames
            for t in range(test_fingerprints.shape[1]):
                # get new frame from stream of data
                stream_update = test_fingerprints[:, t, :]

                # [batch, time=1, feature]
                stream_update = np.expand_dims(stream_update, axis=1)

                # set input audio data (by default input data at index 0)
                inputs[0] = stream_update

                # run inference
                outputs = model_stream.predict(inputs)

                # get output states and set it back to input states
                # which will be fed in the next inference cycle
                for s in range(1, len(model_stream.inputs)):
                    inputs[s] = outputs[s]

                stream_output_arg = np.argmax(outputs[0])
        total_accuracy = total_accuracy + (test_ground_truth[0]
                                           == stream_output_arg)
        count = count + 1
        if i % 200 == 0 and i:
            logging.info(
                'tf test accuracy, stream model state external = %.2f%% %d out of %d',
                *(total_accuracy * 100 / count, i, set_size))

    total_accuracy = total_accuracy / count
    logging.info(
        'TF Final test accuracy of stream model state external = %.2f%% (N=%d)',
        *(total_accuracy * 100, set_size))

    path = os.path.join(flags.train_dir, folder)
    if not os.path.exists(path):
        os.makedirs(path)

    fname_summary = 'model_summary_stream_state_external'
    utils.save_model_summary(model_stream,
                             path,
                             file_name=fname_summary + '.txt')

    tf.keras.utils.plot_model(model_stream,
                              to_file=os.path.join(path,
                                                   fname_summary + '.png'),
                              show_shapes=True,
                              expand_nested=True)

    with open(os.path.join(path, accuracy_name), 'wt') as fd:
        fd.write('%f on set_size %d' % (total_accuracy * 100, set_size))
    return total_accuracy * 100
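
For intuition on the 'raw' branch above: with a 16 kHz sample rate and a 20 ms
stride (both assumptions here), window_stride_samples is 320, so a one-second
clip produces 50 stream updates. The same index arithmetic standalone:

sample_rate = 16000      # assumed Speech Commands rate
window_stride_ms = 20.0  # assumed stride
window_stride_samples = int(sample_rate * window_stride_ms / 1000)  # 320
clip_len = 16000

start, end, n_updates = 0, window_stride_samples, 0
while end <= clip_len:
    # Each iteration would feed clip[:, start:end] to the streaming model.
    start, end = end, end + window_stride_samples
    n_updates += 1
assert n_updates == clip_len // window_stride_samples  # 50 updates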
Example #12
def tf_stream_state_internal_model_accuracy(
        flags,
        folder,
        weights_name='best_weights',
        accuracy_name='tf_stream_state_internal_model_accuracy_sub_set.txt',
        max_test_samples=1000):
    """Compute accuracy of streamable model with internal state using TF.

  Testign model with batch size 1 can be slow, so accuracy is evaluated
  on subset of data with size max_test_samples
  Args:
      flags: model and data settings
      folder: folder name where accuracy report will be stored
      weights_name: file name with model weights
      accuracy_name: file name for storing accuracy in path + accuracy_name
      max_test_samples: max number of test samples. In this mode model is slow
        with TF because of batch size 1, so accuracy is computed on subset of
        testing data
  Returns:
    accuracy
  """
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)

    logging.info('tf stream model state internal without state resetting '
                 'between testing sequences')

    audio_processor = input_data.AudioProcessor(flags)
    set_size = audio_processor.set_size('testing')
    set_size = np.minimum(max_test_samples, set_size)
    inference_batch_size = 1
    tf.keras.backend.set_learning_phase(0)
    flags.batch_size = inference_batch_size  # set batch size
    model = models.MODELS[flags.model_name](flags)
    weights_path = os.path.join(flags.train_dir, weights_name)
    model.load_weights(weights_path).expect_partial()

    model_stream = utils.to_streaming_inference(
        model, flags, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)

    total_accuracy = 0.0
    count = 0.0
    for i in range(0, set_size, inference_batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            inference_batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0,
            sess)

        if flags.preprocess == 'raw':
            stream_output_prediction = run_stream_inference_classification(
                flags, model_stream, test_fingerprints)
            stream_output_arg = np.argmax(stream_output_prediction)
        else:
            # iterate over frames
            for t in range(test_fingerprints.shape[1]):
                # get new frame from stream of data
                stream_update = test_fingerprints[:, t, :]

                # [batch, time=1, feature]
                stream_update = np.expand_dims(stream_update, axis=1)

                # classification result of a current frame
                stream_output_prediction = model_stream.predict(stream_update)
                stream_output_arg = np.argmax(stream_output_prediction)

        total_accuracy = total_accuracy + (test_ground_truth[0]
                                           == stream_output_arg)
        count = count + 1
        if i % 200 == 0 and i:
            logging.info(
                'tf test accuracy, stream model state internal = %.2f%% %d out of %d',
                *(total_accuracy * 100 / count, i, set_size))

    total_accuracy = total_accuracy / count
    logging.info(
        'TF Final test accuracy of stream model state internal = %.2f%% (N=%d)',
        *(total_accuracy * 100, set_size))

    path = os.path.join(flags.train_dir, folder)
    if not os.path.exists(path):
        os.makedirs(path)

    fname_summary = 'model_summary_stream_state_internal'
    utils.save_model_summary(model_stream,
                             path,
                             file_name=fname_summary + '.txt')

    tf.keras.utils.plot_model(model_stream,
                              to_file=os.path.join(path,
                                                   fname_summary + '.png'),
                              show_shapes=True,
                              expand_nested=True)

    with open(os.path.join(path, accuracy_name), 'wt') as fd:
        fd.write('%f on set_size %d' % (total_accuracy * 100, set_size))
    return total_accuracy * 100
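
A hedged sketch of how these evaluators could be chained after training; the
folder names are illustrative, and tf_non_stream_model_accuracy is defined in
the next example:

# Illustrative evaluation driver using the functions from this section.
results = {
    'non_stream': tf_non_stream_model_accuracy(flags, 'non_stream'),
    'stream_internal': tf_stream_state_internal_model_accuracy(
        flags, 'stream_state_internal'),
    'stream_external': tf_stream_state_external_model_accuracy(
        flags, 'stream_state_external', reset_state=True),
}
for name, acc in results.items():
    print('%s accuracy: %.2f%%' % (name, acc))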
Example #13
def tf_non_stream_model_accuracy(
        flags,
        folder,
        time_shift_samples=0,
        weights_name='best_weights',
        accuracy_name='tf_non_stream_model_accuracy.txt'):
    """Compute accuracy of non streamable model using TF.

  Args:
      flags: model and data settings
      folder: folder name where accuracy report will be stored
      time_shift_samples: time shift of audio data it will be applied in range:
        -time_shift_samples...time_shift_samples
        We can use non stream model for processing stream of audio.
        By default it will be slow, so to speed it up
        we can use non stream model on sampled audio data:
        for example instead of computing non stream model
        on every 20ms, we can run it on every 200ms of audio stream.
        It will reduce total latency by 10 times.
        To emulate sampling effect we use time_shift_samples.
      weights_name: file name with model weights
      accuracy_name: file name for storing accuracy in path + accuracy_name
  Returns:
    accuracy
  """
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)

    audio_processor = input_data.AudioProcessor(flags)

    set_size = audio_processor.set_size('testing')
    tf.keras.backend.set_learning_phase(0)
    flags.batch_size = 100  # set batch size for inference
    set_size = int(set_size / flags.batch_size) * flags.batch_size
    model = models.MODELS[flags.model_name](flags)
    weights_path = os.path.join(flags.train_dir, weights_name)
    model.load_weights(weights_path).expect_partial()
    total_accuracy = 0.0
    count = 0.0
    for i in range(0, set_size, flags.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            flags.batch_size, i, flags, 0.0, 0.0, time_shift_samples,
            'testing', 0.0, 0.0, sess)

        predictions = model.predict(test_fingerprints)
        predicted_labels = np.argmax(predictions, axis=1)
        total_accuracy = total_accuracy + np.sum(
            predicted_labels == test_ground_truth)
        count = count + len(test_ground_truth)
    total_accuracy = total_accuracy / count

    logging.info('TF Final test accuracy on non stream model = %.2f%% (N=%d)',
                 *(total_accuracy * 100, set_size))

    path = os.path.join(flags.train_dir, folder)
    if not os.path.exists(path):
        os.makedirs(path)

    fname_summary = 'model_summary_non_stream'
    utils.save_model_summary(model, path, file_name=fname_summary + '.txt')

    tf.keras.utils.plot_model(model,
                              to_file=os.path.join(path,
                                                   fname_summary + '.png'),
                              show_shapes=True,
                              expand_nested=True)

    with open(os.path.join(path, accuracy_name), 'wt') as fd:
        fd.write('%f on set_size %d' % (total_accuracy * 100, set_size))
    return total_accuracy * 100
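
Note that time_shift_samples is in samples, not milliseconds; the conversion
matches the one in train() below. A quick worked example (16 kHz assumed):

sample_rate = 16000  # Hz, the usual Speech Commands rate
time_shift_ms = 100  # e.g. run the non-stream model every 100 ms of audio
time_shift_samples = int((time_shift_ms * sample_rate) / 1000)  # 1600 samples
# accuracy = tf_non_stream_model_accuracy(
#     flags, 'non_stream', time_shift_samples=time_shift_samples)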
Example #14
def train(flags):
    """Model training."""

    flags.training = True

    # Set the verbosity based on flags (default is INFO, so we see all messages)
    logging.set_verbosity(flags.verbosity)

    # Start a new TensorFlow session.
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)

    audio_processor = input_data.AudioProcessor(flags)

    time_shift_samples = int((flags.time_shift_ms * flags.sample_rate) / 1000)

    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, flags.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, flags.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))
    logging.info(flags)
    model = models.MODELS[flags.model_name](flags)
    logging.info(model.summary())

    # save model summary
    utils.save_model_summary(model, flags.train_dir)

    # save model and data flags
    with open(os.path.join(flags.train_dir, 'flags.txt'), 'wt') as f:
        pprint.pprint(flags, stream=f)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    if flags.optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(epsilon=flags.optimizer_epsilon)
    elif flags.optimizer == 'momentum':
        optimizer = tf.keras.optimizers.SGD(momentum=0.9)
    elif flags.optimizer == 'novograd':
        optimizer = tfa.optimizers.NovoGrad(
            lr=0.05,
            beta_1=flags.novograd_beta_1,
            beta_2=flags.novograd_beta_2,
            weight_decay=flags.novograd_weight_decay,
            grad_averaging=bool(flags.novograd_grad_averaging))
    else:
        raise ValueError('Unsupported optimizer: %s' % flags.optimizer)

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    train_writer = tf.summary.FileWriter(flags.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(flags.summaries_dir +
                                              '/validation')

    sess.run(tf.global_variables_initializer())

    start_step = 1

    logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, flags.train_dir, 'graph.pbtxt')

    # Save list of words.
    with tf.io.gfile.GFile(os.path.join(flags.train_dir, 'labels.txt'),
                           'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    best_accuracy = 0.0

    # prepare parameters for exp learning rate decay
    training_steps_max = np.sum(training_steps_list)
    lr_init = learning_rates_list[0]
    exp_rate = -np.log(learning_rates_list[-1] / lr_init) / training_steps_max

    # Training loop.
    for training_step in range(start_step, training_steps_max + 1):
        offset = (training_step -
                  1) * flags.batch_size if flags.pick_deterministically else 0
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            flags.batch_size, offset, flags, flags.background_frequency,
            flags.background_volume, time_shift_samples, 'training',
            flags.resample, flags.volume_resample, sess)

        if flags.lr_schedule == 'exp':
            learning_rate_value = lr_init * np.exp(-exp_rate * training_step)
        elif flags.lr_schedule == 'linear':
            # Figure out what the current learning rate is.
            training_steps_sum = 0
            for i in range(len(training_steps_list)):
                training_steps_sum += training_steps_list[i]
                if training_step <= training_steps_sum:
                    learning_rate_value = learning_rates_list[i]
                    break
        else:
            raise ValueError('Wrong lr_schedule: %s' % flags.lr_schedule)

        tf.keras.backend.set_value(model.optimizer.lr, learning_rate_value)
        result = model.train_on_batch(train_fingerprints, train_ground_truth)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag='accuracy', simple_value=result[1]),
        ])
        train_writer.add_summary(summary, training_step)

        logging.info(
            'Step #%d: rate %f, accuracy %.2f%%, cross entropy %f',
            *(training_step, learning_rate_value, result[1] * 100, result[0]))

        is_last_step = (training_step == training_steps_max)
        if (training_step % flags.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            set_size = int(set_size / flags.batch_size) * flags.batch_size
            total_accuracy = 0.0
            count = 0.0
            for i in range(0, set_size, flags.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(flags.batch_size, i, flags, 0.0,
                                             0.0, 0, 'validation', 0.0, 0.0,
                                             sess))

                # Run a validation step and capture training summaries for TensorBoard
                # with the `merged` op.
                result = model.test_on_batch(validation_fingerprints,
                                             validation_ground_truth)

                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='accuracy', simple_value=result[1]),
                ])

                validation_writer.add_summary(summary, training_step)

                total_accuracy += result[1]
                count = count + 1.0

            total_accuracy = total_accuracy / count
            logging.info('Step %d: Validation accuracy = %.2f%% (N=%d)',
                         *(training_step, total_accuracy * 100, set_size))

            # Save a checkpoint after each evaluation; the file name encodes
            # the best accuracy so far and the training step.
            model.save_weights(flags.train_dir + 'train/' +
                               str(int(best_accuracy * 10000)) + 'weights_' +
                               str(training_step))

            # Save the model checkpoint when validation accuracy improves
            if total_accuracy >= best_accuracy:
                best_accuracy = total_accuracy
                # overwrite the best model weights
                model.save_weights(flags.train_dir + 'best_weights')
            logging.info('So far the best validation accuracy is %.2f%%',
                         (best_accuracy * 100))

    tf.keras.backend.set_learning_phase(0)
    set_size = audio_processor.set_size('testing')
    set_size = int(set_size / flags.batch_size) * flags.batch_size
    logging.info('set_size=%d', set_size)
    total_accuracy = 0.0
    count = 0.0

    for i in range(0, set_size, flags.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            flags.batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0, sess)

        result = model.test_on_batch(test_fingerprints, test_ground_truth)

        total_accuracy += result[1]
        count = count + 1.0
    total_accuracy = total_accuracy / count

    logging.info('Final test accuracy = %.2f%% (N=%d)',
                 *(total_accuracy * 100, set_size))
    with open(os.path.join(flags.train_dir, 'accuracy_last.txt'), 'wt') as fd:
        fd.write(str(total_accuracy * 100))
    model.save_weights(flags.train_dir + 'last_weights')
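
The 'exp' schedule above decays from the first listed learning rate to the
last one over training_steps_max steps. A standalone check of the endpoint
math, using the rates from the comment in train():

import numpy as np

learning_rates_list = [0.001, 0.0001]  # from --learning_rate=0.001,0.0001
training_steps_max = 13000             # sum of --how_many_training_steps
lr_init = learning_rates_list[0]
exp_rate = -np.log(learning_rates_list[-1] / lr_init) / training_steps_max

# Step 0 gives lr_init; the final step gives the last listed rate.
assert np.isclose(lr_init * np.exp(-exp_rate * 0), 0.001)
assert np.isclose(lr_init * np.exp(-exp_rate * training_steps_max), 0.0001)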
Example #15
    def __init__(self, batch_size=512, version=1, preprocess="raw"):

        # Set PATH to data sets (for example to speech commands V2):
        # They can be downloaded from
        # https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz
        # https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz
        # https://docs.google.com/uc?export=download&id=1OAN3h4uffi5HS7eb7goklWeI2XPm1jCS
        # Files should be downloaded and then extracted into the google-speech-commands directory.
        dataset = "google-speech-commands"
        DATA_PATH = os.path.join("data", dataset, "data{}".format(version))

        FLAGS = model_params.Params()
        FLAGS.data_dir = DATA_PATH
        FLAGS.verbosity = logging.ERROR

        # set wanted words for V2_35 dataset
        if version == 3:
            FLAGS.wanted_words = 'visual,wow,learn,backward,dog,two,left,happy,nine,go,up,bed,stop,one,zero,tree,seven,on,four,bird,right,eight,no,six,forward,house,marvin,sheila,five,off,three,down,cat,follow,yes'
            FLAGS.split_data = 0

        # set speech feature extractor properties
        FLAGS.mel_upper_edge_hertz = 7600
        FLAGS.window_size_ms = 30.0
        FLAGS.window_stride_ms = 10.0
        FLAGS.mel_num_bins = 80
        FLAGS.dct_num_features = 40
        FLAGS.feature_type = 'mfcc_tf'
        FLAGS.preprocess = preprocess

        # For numerical equivalence of streaming and non-streaming models set
        # it to 1; for a real streaming use case set it to 0.
        FLAGS.causal_data_frame_padding = 0

        FLAGS.use_tf_fft = True
        FLAGS.mel_non_zero_only = not FLAGS.use_tf_fft

        # data augmentation parameters
        FLAGS.resample = 0.15
        FLAGS.time_shift_ms = 100
        FLAGS.use_spec_augment = 1
        FLAGS.time_masks_number = 2
        FLAGS.time_mask_max_size = 25
        FLAGS.frequency_masks_number = 2
        FLAGS.frequency_mask_max_size = 7
        FLAGS.pick_deterministically = 1

        self.flags = model_flags.update_flags(FLAGS)

        import absl
        absl.logging.set_verbosity(self.flags.verbosity)

        self.flags.batch_size = batch_size
        self.time_shift_samples = int(
            (self.flags.time_shift_ms * self.flags.sample_rate) / 1000)

        tf1.disable_eager_execution()
        config = tf1.ConfigProto(device_count={'GPU': 0})
        self.sess = tf1.Session(config=config)
        # tf1.keras.backend.set_session(self.sess)

        self.audio_processor = input_data.AudioProcessor(self.flags)
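
The enclosing class is not shown in this snippet, so the name below is a
stand-in; a hedged usage sketch of the __init__ above:

# Hypothetical usage; `SpeechCommandsData` is a placeholder for whatever
# class this __init__ belongs to.
data = SpeechCommandsData(batch_size=256, version=2, preprocess='raw')
train_x, train_y = data.audio_processor.get_data(
    data.flags.batch_size, 0, data.flags, 0.0, 0.0, 0, 'training', 0.0, 0.0,
    data.sess)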
Example #16
def train(flags):
  """Model training."""

  flags.training = True

  # Set the verbosity based on flags (default is INFO, so we see all messages)
  logging.set_verbosity(flags.verbosity)

  # Start a new TensorFlow session.
  tf.reset_default_graph()
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)
  tf.keras.backend.set_session(sess)

  audio_processor = input_data.AudioProcessor(flags)

  time_shift_samples = int((flags.time_shift_ms * flags.sample_rate) / 1000)

  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed by
  # lower levels towards the end, the number of steps and learning rates can be
  # specified as comma-separated lists to define the rate at each stage. For
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = list(map(int, flags.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, flags.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))
  logging.info(flags)

  model = models.MODELS[flags.model_name](flags)
  if flags.distill_teacher_json:
    with open(flags.distill_teacher_json, 'r') as f:
      teacher_flags = json.load(f, object_hook=lambda d: SimpleNamespace(
        **{ k: v for k, v in flags.__dict__.items() if k not in d },
        **d))
    teacher_base = models.MODELS[teacher_flags.model_name](teacher_flags)
    hard_labels = tf.keras.layers.Lambda(lambda logits: tf.one_hot(tf.math.argmax(logits, axis=-1), depth=flags.label_count))
    teacher = tf.keras.models.Sequential([teacher_base, hard_labels])
    teacher_base.trainable = False
    teacher.trainable = False
  else:
    teacher = None
    teacher_flags = None

  base_model = model

  logging.info(model.summary())

  # save model summary
  utils.save_model_summary(model, flags.train_dir)

  # save model and data flags
  with open(os.path.join(flags.train_dir, 'flags.txt'), 'wt') as f:
    pprint.pprint(flags, stream=f)

  loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=flags.label_smoothing)
  metrics = ['accuracy']

  if flags.optimizer == 'adam':
    optimizer = tf.keras.optimizers.Adam(epsilon=flags.optimizer_epsilon)
  elif flags.optimizer == 'momentum':
    optimizer = tf.keras.optimizers.SGD(momentum=0.9)
  elif flags.optimizer == 'novograd':
    optimizer = tfa.optimizers.NovoGrad(
        lr=0.05,
        beta_1=flags.novograd_beta_1,
        beta_2=flags.novograd_beta_2,
        weight_decay=flags.novograd_weight_decay,
        grad_averaging=bool(flags.novograd_grad_averaging))
  elif flags.optimizer == 'adamw':
    # Exclude some layers for weight decay
    exclude = ["pos_emb", "class_emb", "layer_normalization", "bias"]
    optimizer = AdamWeightDecay(learning_rate=0.05, weight_decay_rate=flags.l2_weight_decay, exclude_from_weight_decay=exclude)
  else:
    raise ValueError('Unsupported optimizer: %s' % flags.optimizer)

  loss_weights = [0.5, 0.5, 0.0] if teacher else [1.]  # equally weight losses from label and teacher; ignore ensemble output
  model.compile(optimizer=optimizer, loss=loss, loss_weights=loss_weights, metrics=metrics)

  train_writer = tf.summary.FileWriter(flags.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(flags.summaries_dir + '/validation')

  sess.run(tf.global_variables_initializer())

  if flags.start_checkpoint:
    model.load_weights(flags.start_checkpoint).expect_partial()
    logging.info('Weights loaded from %s', flags.start_checkpoint)

  if teacher_flags and teacher_flags.start_checkpoint:
    # Load weights into teacher base as this is the actual model that was saved, teacher includes hard label head
    teacher_base.load_weights(teacher_flags.start_checkpoint).assert_existing_objects_matched()
    logging.info('Distillation teacher weights loaded from %s', teacher_flags.start_checkpoint)

  start_step = 0

  logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, flags.train_dir, 'graph.pbtxt')

  # Save list of words.
  with tf.io.gfile.GFile(os.path.join(flags.train_dir, 'labels.txt'), 'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  best_accuracy = 0.0

  # prepare parameters for exp learning rate decay
  training_steps_max = np.sum(training_steps_list)
  lr_init = learning_rates_list[0]
  exp_rate = -np.log(learning_rates_list[-1] / lr_init)/training_steps_max
  mode = 'training'

  if flags.lr_schedule == 'cosine':
    # Currently, no restarts are performed, so it is just a cosine decay over the entire
    # training process. I think this is how DeiT does it.
    lr_init = lr_init * flags.batch_size / 512
    num_train = audio_processor.set_size(mode)
    warmup_steps = int((num_train / flags.batch_size) * flags.warmup_epochs)
    first_decay_steps = training_steps_max

  # Training loop.
  for training_step in range(start_step, training_steps_max + 1):
    if training_step > 0:
      offset = (training_step -
                1) * flags.batch_size if flags.pick_deterministically else 0

      # Pull the audio samples we'll use for training.
      train_fingerprints, train_ground_truth = audio_processor.get_data(
          flags.batch_size, offset, flags, flags.background_frequency,
          flags.background_volume, time_shift_samples, mode,
          flags.resample, flags.volume_resample, sess)

      if flags.lr_schedule == 'exp':
        learning_rate_value = lr_init * np.exp(-exp_rate * training_step)
      elif flags.lr_schedule == 'linear':
        # Figure out what the current learning rate is.
        training_steps_sum = 0
        for i in range(len(training_steps_list)):
          training_steps_sum += training_steps_list[i]
          if training_step <= training_steps_sum:
            learning_rate_value = learning_rates_list[i]
            break
      elif flags.lr_schedule == 'cosine':
        learning_rate_value = lr_init * min(1, float(training_step) / max(1, warmup_steps)) * (math.cos(math.pi * training_step / training_steps_max) + 1) / 2.
      else:
        raise ValueError('Wrong lr_schedule: %s' % flags.lr_schedule)

      tf.keras.backend.set_value(model.optimizer.learning_rate, learning_rate_value)

      one_hot_labels = tf.keras.utils.to_categorical(train_ground_truth, num_classes=flags.label_count)

      if teacher:
        teacher_labels = teacher.predict_on_batch(train_fingerprints)
        one_hot_labels = [ one_hot_labels, teacher_labels, one_hot_labels ] # third is for the ensemble output, gradient is unused

      result = model.train_on_batch(train_fingerprints, one_hot_labels)

      if teacher:
        loss_total, loss_label, loss_teacher, loss_average, acc_label, acc_teacher, acc_ensemble = result
        differences = (teacher_labels != one_hot_labels).astype(dtype=int).sum()
        logging.info(
            'Step #%d: rate %f, accuracy %.2f%%, cross entropy %f, teacher acc %.2f%% (%d diff), teacher cross entropy %f, ensemble acc %.2f%%',
            *(training_step, learning_rate_value, acc_label * 100, loss_total, acc_teacher * 100, differences, loss_teacher, acc_ensemble * 100))
        summary = tf.Summary(value=[
            tf.Summary.Value(tag='accuracy', simple_value=acc_label),
            tf.Summary.Value(tag='teacher_accuracy', simple_value=acc_teacher),
            tf.Summary.Value(tag='ensemble_accuracy', simple_value=acc_ensemble),
        ])
      else:
        loss_label, acc_label = result
        logging.info(
            'Step #%d: rate %f, accuracy %.2f%%, cross entropy %f',
            *(training_step, learning_rate_value, acc_label * 100, loss_label))
        summary = tf.Summary(value=[
            tf.Summary.Value(tag='accuracy', simple_value=acc_label),
        ])

      train_writer.add_summary(summary, training_step)

    is_last_step = (training_step == training_steps_max)
    if (training_step % flags.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      set_size = int(set_size / flags.batch_size) * flags.batch_size
      total_accuracy = 0.0
      count = 0.0
      for i in range(0, set_size, flags.batch_size):
        validation_fingerprints, validation_ground_truth = audio_processor.get_data(
            flags.batch_size, i, flags, 0.0,
            0.0, 0, 'validation',
            0.0, 0.0, sess)

        one_hot_labels = tf.keras.utils.to_categorical(validation_ground_truth, num_classes=flags.label_count)
        if teacher:
          one_hot_labels = [ one_hot_labels, one_hot_labels, one_hot_labels ]
        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged` op.
        result = model.test_on_batch(validation_fingerprints,
                                     one_hot_labels)

        if teacher:
          loss_total, loss_label, loss_teacher, loss_average, acc_label, acc_teacher, acc_ensemble = result
          summary = tf.Summary(value=[
              tf.Summary.Value(tag='accuracy', simple_value=acc_ensemble),
              tf.Summary.Value(tag='label_head_accuracy',
                               simple_value=acc_label),
              tf.Summary.Value(tag='distill_head_accuracy',
                               simple_value=acc_teacher),
          ])
          accuracy = acc_ensemble
        else:
          loss_label, acc_label = result
          summary = tf.Summary(value=[
              tf.Summary.Value(tag='accuracy', simple_value=acc_label),
          ])
          accuracy = acc_label

        validation_writer.add_summary(summary, training_step)

        total_accuracy += accuracy
        count = count + 1.0

      total_accuracy = total_accuracy / count
      logging.info('Step %d: Validation accuracy = %.2f%% (N=%d)',
                   *(training_step, total_accuracy * 100, set_size))

      # Save the model checkpoint when validation accuracy improves
      if total_accuracy >= best_accuracy:
        best_accuracy = total_accuracy
        # overwrite the best model weights
        model.save_weights(flags.train_dir + 'best_weights')
      logging.info('So far the best validation accuracy is %.2f%%',
                   (best_accuracy * 100))

  tf.keras.backend.set_learning_phase(0)
  set_size = audio_processor.set_size('testing')
  set_size = int(set_size / flags.batch_size) * flags.batch_size
  logging.info('set_size=%d', set_size)
  total_accuracy = 0.0
  count = 0.0

  for i in range(0, set_size, flags.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        flags.batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0, sess)

    one_hot_labels = tf.keras.utils.to_categorical(test_ground_truth, num_classes=flags.label_count)
    if teacher:
      one_hot_labels = [ one_hot_labels, one_hot_labels, one_hot_labels ]
    result = model.test_on_batch(test_fingerprints, one_hot_labels)

    total_accuracy += result[-1] if teacher else result[1]
    count = count + 1.0
  total_accuracy = total_accuracy / count

  logging.info('Final test accuracy = %.2f%% (N=%d)',
               *(total_accuracy * 100, set_size))
  with open(os.path.join(flags.train_dir, 'accuracy_last.txt'), 'wt') as fd:
    fd.write(str(total_accuracy * 100))
  model.save_weights(flags.train_dir + 'last_weights')
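
The 'cosine' schedule in this variant combines linear warmup with a single
half-cosine decay (no restarts). A minimal standalone version of the same
formula used in the training loop above:

import math

def cosine_lr(step, lr_init, warmup_steps, training_steps_max):
    # Same expression as the 'cosine' branch above: linear warmup, then a
    # half-cosine decay to zero over the whole run.
    warmup = min(1, float(step) / max(1, warmup_steps))
    decay = (math.cos(math.pi * step / training_steps_max) + 1) / 2.
    return lr_init * warmup * decay

# lr is 0 at step 0, ramps up during warmup, and decays to ~0 at the end.
assert cosine_lr(0, 0.05, 100, 10000) == 0.0
assert cosine_lr(10000, 0.05, 100, 10000) < 1e-12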