Example 1
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# models, prepare_background_data, cast_and_pad, get_preprocess_audio_func,
# and convert_dataset are helpers defined elsewhere in this module.
def get_training_data(Flags, get_waves=False, val_cal_subset=False):

    label_count = 12
    background_frequency = Flags.background_frequency
    background_volume_range_ = Flags.background_volume
    model_settings = models.prepare_model_settings(label_count, Flags)

    bg_path = Flags.bg_path
    BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
    background_data = prepare_background_data(bg_path,
                                              BACKGROUND_NOISE_DIR_NAME)

    splits = ['train', 'test', 'validation']
    (ds_train, ds_test, ds_val), ds_info = tfds.load('speech_commands',
                                                     split=splits,
                                                     data_dir=Flags.data_dir,
                                                     with_info=True)

    if val_cal_subset:  # only return the subset of val set used for quantization calibration
        with open("quant_cal_idxs.txt") as fpi:
            cal_indices = [int(line) for line in fpi]
        cal_indices.sort()
        # cal_indices are the positions of specific inputs that are selected to calibrate the quantization
        count = 0  # count will be the index into the validation set.
        val_sub_audio = []
        val_sub_labels = []
        for d in ds_val:
            if count in cal_indices:  # this is one of the calibration inputs
                new_audio = d['audio'].numpy()  # so add it to the stack of tensors
                # from_tensor_slices doesn't work for ragged tensors, so pad to 16k
                if len(new_audio) < 16000:
                    new_audio = np.pad(new_audio, (0, 16000 - len(new_audio)),
                                       'constant')
                val_sub_audio.append(new_audio)
                val_sub_labels.append(d['label'].numpy())
            count += 1
        # and create a new dataset for just the calibration inputs.
        ds_val = tf.data.Dataset.from_tensor_slices({
            "audio": val_sub_audio,
            "label": val_sub_labels
        })

    if Flags.num_train_samples != -1:
        ds_train = ds_train.take(Flags.num_train_samples)
    if Flags.num_val_samples != -1:
        ds_val = ds_val.take(Flags.num_val_samples)
    if Flags.num_test_samples != -1:
        ds_test = ds_test.take(Flags.num_test_samples)

    if get_waves:
        ds_train = ds_train.map(cast_and_pad)
        ds_test = ds_test.map(cast_and_pad)
        ds_val = ds_val.map(cast_and_pad)
    else:
        # extract spectral features and add background noise
        ds_train = ds_train.map(
            get_preprocess_audio_func(model_settings,
                                      is_training=True,
                                      background_data=background_data),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds_test = ds_test.map(
            get_preprocess_audio_func(model_settings,
                                      is_training=False,
                                      background_data=background_data),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds_val = ds_val.map(
            get_preprocess_audio_func(model_settings,
                                      is_training=False,
                                      background_data=background_data),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        # change output from a dictionary to a feature,label tuple
        ds_train = ds_train.map(convert_dataset)
        ds_test = ds_test.map(convert_dataset)
        ds_val = ds_val.map(convert_dataset)

    # Now that we have the preprocessed data, batch it for training and evaluation.
    ds_train = ds_train.batch(Flags.batch_size)
    ds_test = ds_test.batch(Flags.batch_size)
    ds_val = ds_val.batch(Flags.batch_size)

    return ds_train, ds_test, ds_val
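
A minimal sketch of how the returned datasets might be consumed follows; Flags is assumed to be an argparse-style namespace carrying the attributes read above, and model is assumed to be a compiled Keras classifier built elsewhere (both names are assumptions, not part of the example).

# Hypothetical driver code, a sketch only: Flags and model are assumed to exist.
ds_train, ds_test, ds_val = get_training_data(Flags)
model.fit(ds_train, validation_data=ds_val, epochs=10)  # epoch count arbitrary
model.evaluate(ds_test)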
Example 2
def get_training_data(Flags):

    spectrogram_length = int((Flags.clip_duration_ms - Flags.window_size_ms +
                              Flags.window_stride_ms) / Flags.window_stride_ms)

    dct_coefficient_count = Flags.dct_coefficient_count
    window_size_ms = Flags.window_size_ms
    window_stride_ms = Flags.window_stride_ms
    clip_duration_ms = Flags.clip_duration_ms  # expected clip duration in ms
    sample_rate = Flags.sample_rate
    label_count = 12
    background_frequency = Flags.background_frequency
    background_volume_range_ = Flags.background_volume
    model_settings = models.prepare_model_settings(
        label_count, sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, background_frequency)
    # These split sizes are taken from the dataset web page;
    # there should be a better way than hard-coding them.
    train_shuffle_buffer_size = 85511
    val_shuffle_buffer_size = 10102
    test_shuffle_buffer_size = 4890
    bg_path = Flags.bg_path
    BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
    background_data = prepare_background_data(bg_path,
                                              BACKGROUND_NOISE_DIR_NAME)
    splits = ['train', 'test', 'validation']
    (ds_train, ds_test, ds_val), ds_info = tfds.load('speech_commands',
                                                     split=splits,
                                                     data_dir=Flags.data_dir,
                                                     with_info=True)

    ds_train = ds_train.shuffle(train_shuffle_buffer_size)
    ds_val = ds_val.shuffle(val_shuffle_buffer_size)
    ds_test = ds_test.shuffle(test_shuffle_buffer_size)

    if Flags.num_train_samples != -1:
        ds_train = ds_train.take(Flags.num_train_samples)
    if Flags.num_val_samples != -1:
        ds_val = ds_val.take(Flags.num_val_samples)
    if Flags.num_test_samples != -1:
        ds_test = ds_test.take(Flags.num_test_samples)

    ds_train_specs = ds_train.map(
        get_preprocess_audio_func(model_settings,
                                  is_training=True,
                                  background_data=background_data),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test_specs = ds_test.map(
        get_preprocess_audio_func(model_settings,
                                  is_training=False,
                                  background_data=background_data),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val_specs = ds_val.map(
        get_preprocess_audio_func(model_settings,
                                  is_training=False,
                                  background_data=background_data),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    ds_train_specs = ds_train_specs.map(convert_dataset)
    ds_test_specs = ds_test_specs.map(convert_dataset)
    ds_val_specs = ds_val_specs.map(convert_dataset)
    # Now that we have the preprocessed data, batch it for training and evaluation.
    ds_train_specs = ds_train_specs.batch(Flags.batch_size)
    ds_test_specs = ds_test_specs.batch(Flags.batch_size)
    ds_val_specs = ds_val_specs.batch(Flags.batch_size)

    return ds_train_specs, ds_test_specs, ds_val_specs
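
For intuition, the spectrogram_length formula at the top of this example can be checked with typical Speech Commands settings (assumed values, not defaults taken from this code):

# Worked example of the frame-count formula above, with assumed typical values:
clip_duration_ms, window_size_ms, window_stride_ms = 1000, 40, 20
frames = int((clip_duration_ms - window_size_ms + window_stride_ms)
             / window_stride_ms)
print(frames)  # (1000 - 40 + 20) / 20 = 49 frames per spectrogram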
Example 3
    num_labels = len(word_labels)
    ds_train, ds_test, ds_val = kws_data.get_training_data(Flags,
                                                           val_cal_subset=True)

    if Flags.target_set[0:3].lower() == 'val':
        eval_data = ds_val
        print("Evaluating on the validation set")
    elif Flags.target_set[0:4].lower() == 'test':
        eval_data = ds_test
        print("Evaluating on the test set")
    elif Flags.target_set[0:5].lower() == 'train':
        eval_data = ds_train
        print("Evaluating on the training set")
    else:
        raise ValueError(f"Unrecognized target_set: {Flags.target_set}")

    model_settings = models.prepare_model_settings(num_labels, Flags)

    if Flags.feature_type == "mfcc":
        output_type = np.int8
        quant_min, quant_max = -128, 127
        # We should really handle both feature types the way the LFBE path does,
        # since the MFCC style depends on a specific TFLite model. But since
        # bin files for MFCC features are already published (as of 4/24/21),
        # we'll wait until v0.2 to unify the bin-file quantization calibration.
        interpreter = tf.lite.Interpreter(model_path=Flags.tfl_file_name)
        interpreter.allocate_tensors()
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        input_shape = input_details[0]['shape']
        input_shape[0] = 0  # zero the batch dim, presumably so inputs can be stacked into an array of this shape
        input_scale, input_zero_point = input_details[0]["quantization"]
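
The scale and zero point read above define TFLite's affine int8 mapping, real_value = input_scale * (q - input_zero_point). A minimal sketch of quantizing float features for this interpreter, assuming features is a hypothetical float32 array shaped like the model input:

def quantize_to_int8(features, scale, zero_point):
    # Invert the affine mapping: q = round(x / scale) + zero_point,
    # then clip to the int8 range (quant_min/quant_max above).
    q = np.round(features / scale) + zero_point
    return np.clip(q, -128, 127).astype(np.int8)

# e.g.: quantized = quantize_to_int8(features, input_scale, input_zero_point)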