import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Local helpers (models.prepare_model_settings, prepare_background_data,
# get_preprocess_audio_func, cast_and_pad, convert_dataset) are defined
# elsewhere in this module/repo.


def get_training_data(Flags, get_waves=False, val_cal_subset=False):
    label_count = 12
    background_frequency = Flags.background_frequency
    background_volume_range_ = Flags.background_volume
    model_settings = models.prepare_model_settings(label_count, Flags)

    bg_path = Flags.bg_path
    BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
    background_data = prepare_background_data(bg_path, BACKGROUND_NOISE_DIR_NAME)

    splits = ['train', 'test', 'validation']
    (ds_train, ds_test, ds_val), ds_info = tfds.load('speech_commands',
                                                     split=splits,
                                                     data_dir=Flags.data_dir,
                                                     with_info=True)

    if val_cal_subset:  # only return the subset of the val set used for quantization calibration
        with open("quant_cal_idxs.txt") as fpi:
            cal_indices = [int(line) for line in fpi]
        cal_indices.sort()
        # cal_indices are the positions of the specific inputs selected to
        # calibrate the quantization
        count = 0  # count will be the index into the validation set
        val_sub_audio = []
        val_sub_labels = []
        for d in ds_val:
            if count in cal_indices:  # this is one of the calibration inputs,
                new_audio = d['audio'].numpy()  # so add it to a stack of tensors
                if len(new_audio) < 16000:
                    # from_tensor_slices doesn't work for ragged tensors, so pad to 16k
                    new_audio = np.pad(new_audio, (0, 16000 - len(new_audio)), 'constant')
                val_sub_audio.append(new_audio)
                val_sub_labels.append(d['label'].numpy())
            count += 1
        # and create a new dataset holding just the calibration inputs
        ds_val = tf.data.Dataset.from_tensor_slices({"audio": val_sub_audio,
                                                     "label": val_sub_labels})

    if Flags.num_train_samples != -1:
        ds_train = ds_train.take(Flags.num_train_samples)
    if Flags.num_val_samples != -1:
        ds_val = ds_val.take(Flags.num_val_samples)
    if Flags.num_test_samples != -1:
        ds_test = ds_test.take(Flags.num_test_samples)

    if get_waves:
        ds_train = ds_train.map(cast_and_pad)
        ds_test = ds_test.map(cast_and_pad)
        ds_val = ds_val.map(cast_and_pad)
    else:
        # extract spectral features and add background noise
        ds_train = ds_train.map(get_preprocess_audio_func(model_settings,
                                                          is_training=True,
                                                          background_data=background_data),
                                num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds_test = ds_test.map(get_preprocess_audio_func(model_settings,
                                                        is_training=False,
                                                        background_data=background_data),
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds_val = ds_val.map(get_preprocess_audio_func(model_settings,
                                                      is_training=False,
                                                      background_data=background_data),
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        # change output from a dictionary to a (feature, label) tuple
        ds_train = ds_train.map(convert_dataset)
        ds_test = ds_test.map(convert_dataset)
        ds_val = ds_val.map(convert_dataset)

    # Now that we've acquired the preprocessed data, batch it for training
    # and evaluation.
    ds_train = ds_train.batch(Flags.batch_size)
    ds_test = ds_test.batch(Flags.batch_size)
    ds_val = ds_val.batch(Flags.batch_size)

    return ds_train, ds_test, ds_val
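
# --- Helper sketch (assumption) --------------------------------------------
# The get_waves branch above maps a cast_and_pad helper that is defined
# elsewhere in this repo; cast_and_pad_sketch below is a hypothetical minimal
# version, not the repo's implementation. It casts the raw waveform and
# zero-pads short clips out to the full 16,000-sample length, mirroring the
# padding applied to the calibration subset. speech_commands clips never
# exceed one second at 16 kHz, so the pad amount is non-negative.
def cast_and_pad_sketch(sample_dict, clip_len=16000):
    audio = tf.cast(sample_dict['audio'], tf.float32)
    audio = tf.pad(audio, [[0, clip_len - tf.shape(audio)[0]]])
    return audio, sample_dict['label']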
# Alternate version of get_training_data: passes the model-settings fields
# explicitly and shuffles each split with hard-coded buffer sizes.
def get_training_data(Flags):
    # number of spectrogram frames per clip
    spectrogram_length = int((Flags.clip_duration_ms - Flags.window_size_ms +
                              Flags.window_stride_ms) / Flags.window_stride_ms)
    dct_coefficient_count = Flags.dct_coefficient_count
    window_size_ms = Flags.window_size_ms
    window_stride_ms = Flags.window_stride_ms
    clip_duration_ms = Flags.clip_duration_ms  # expected duration in ms
    sample_rate = Flags.sample_rate
    label_count = 12
    background_frequency = Flags.background_frequency
    background_volume_range_ = Flags.background_volume

    model_settings = models.prepare_model_settings(label_count, sample_rate,
                                                   clip_duration_ms, window_size_ms,
                                                   window_stride_ms,
                                                   dct_coefficient_count,
                                                   background_frequency)

    # Split sizes are taken from the dataset web page; there should be a
    # better way than hard-coding them.
    train_shuffle_buffer_size = 85511
    val_shuffle_buffer_size = 10102
    test_shuffle_buffer_size = 4890

    bg_path = Flags.bg_path
    BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
    background_data = prepare_background_data(bg_path, BACKGROUND_NOISE_DIR_NAME)

    splits = ['train', 'test', 'validation']
    (ds_train, ds_test, ds_val), ds_info = tfds.load('speech_commands',
                                                     split=splits,
                                                     data_dir=Flags.data_dir,
                                                     with_info=True)

    ds_train = ds_train.shuffle(train_shuffle_buffer_size)
    ds_val = ds_val.shuffle(val_shuffle_buffer_size)
    ds_test = ds_test.shuffle(test_shuffle_buffer_size)

    if Flags.num_train_samples != -1:
        ds_train = ds_train.take(Flags.num_train_samples)
    if Flags.num_val_samples != -1:
        ds_val = ds_val.take(Flags.num_val_samples)
    if Flags.num_test_samples != -1:
        ds_test = ds_test.take(Flags.num_test_samples)

    ds_train_specs = ds_train.map(get_preprocess_audio_func(model_settings,
                                                            is_training=True,
                                                            background_data=background_data),
                                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test_specs = ds_test.map(get_preprocess_audio_func(model_settings,
                                                          is_training=False,
                                                          background_data=background_data),
                                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val_specs = ds_val.map(get_preprocess_audio_func(model_settings,
                                                        is_training=False,
                                                        background_data=background_data),
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # change output from a dictionary to a (feature, label) tuple
    ds_train_specs = ds_train_specs.map(convert_dataset)
    ds_test_specs = ds_test_specs.map(convert_dataset)
    ds_val_specs = ds_val_specs.map(convert_dataset)

    # Now that we've acquired the preprocessed data, batch it.
    ds_train_specs = ds_train_specs.batch(Flags.batch_size)
    ds_test_specs = ds_test_specs.batch(Flags.batch_size)
    ds_val_specs = ds_val_specs.batch(Flags.batch_size)

    return ds_train_specs, ds_test_specs, ds_val_specs
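
# --- Helper sketch (assumption) --------------------------------------------
# Both variants above map convert_dataset to turn each preprocessed feature
# dictionary into the (features, label) tuple that Keras' model.fit consumes.
# convert_dataset_sketch below is a hypothetical minimal version, assuming
# the preprocessing step leaves its output under the 'audio' key:
def convert_dataset_sketch(item):
    return item['audio'], item['label']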
# Script excerpt: word_labels, Flags, kws_data, and models are defined
# earlier in the evaluation script.
num_labels = len(word_labels)

ds_train, ds_test, ds_val = kws_data.get_training_data(Flags, val_cal_subset=True)

if Flags.target_set[0:3].lower() == 'val':
    eval_data = ds_val
    print("Evaluating on the validation set")
elif Flags.target_set[0:4].lower() == 'test':
    eval_data = ds_test
    print("Evaluating on the test set")
elif Flags.target_set[0:5].lower() == 'train':
    eval_data = ds_train
    print("Evaluating on the training set")
else:
    raise ValueError(f"Unrecognized target_set: {Flags.target_set}")

model_settings = models.prepare_model_settings(num_labels, Flags)

if Flags.feature_type == "mfcc":
    output_type = np.int8
    quant_min, quant_max = -128, 127
    # We should really handle both of these the way the LFBE path does, since
    # the MFCC style depends on a specific TFLite model. But bin files for
    # MFCC features are already published (as of 4/24/21), so we'll wait
    # until v0.2 to unify the bin-file quantization calibration.

interpreter = tf.lite.Interpreter(model_path=Flags.tfl_file_name)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

input_shape = input_details[0]['shape']
input_shape[0] = 0  # clear the batch dimension
input_scale, input_zero_point = input_details[0]["quantization"]
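
# --- Inference sketch (assumption; not the original evaluation loop) -------
# Typical use of the quantization parameters fetched above: scale each float
# feature into int8 with round(x / scale + zero_point), invoke the
# interpreter one example at a time, and compare the argmax of the output
# against the label. eval_quantized_sketch is hypothetical; it assumes an
# int8-quantized input tensor with batch dimension 1, and that eval_data
# yields the (features, label) tuples produced by convert_dataset.
def eval_quantized_sketch(interpreter, eval_data):
    num_correct, num_seen = 0, 0
    for features, labels in eval_data.unbatch().batch(1):
        quantized = np.round(features.numpy() / input_scale + input_zero_point)
        quantized = np.clip(quantized, -128, 127).astype(np.int8)
        interpreter.set_tensor(input_details[0]['index'], quantized)
        interpreter.invoke()
        logits = interpreter.get_tensor(output_details[0]['index'])
        num_correct += int(np.argmax(logits) == labels.numpy()[0])
        num_seen += 1
    return num_correct / num_seen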