def test_sliding_window_basic(self):
    """Checks sliding_window on a 1D input for several window/hop settings."""
    samples = [1, 2, 3, 4, 5, 6, 7, 8]

    # Each case is (window_size, extra keyword args, expected windows).
    cases = (
        # No `hop` given: expected windows advance one element at a time.
        (3, {}, [[1, 2, 3], [2, 3, 4], [3, 4, 5],
                 [4, 5, 6], [5, 6, 7], [6, 7, 8]]),
        # hop=2: expected windows start at every other element.
        (3, {'hop': 2}, [[1, 2, 3], [3, 4, 5], [5, 6, 7]]),
        # hop=3 with a longer window: only two full windows are expected.
        (4, {'hop': 3}, [[1, 2, 3, 4], [4, 5, 6, 7]]),
    )

    for window_size, kwargs, expected in cases:
        windows = sliding_window(samples, window_size, **kwargs)
        np.testing.assert_array_equal(windows, expected)
def test_sliding_window_multichannel(self):
    """Checks sliding_window on a 2D input whose columns are channels."""
    num_channels = 5
    total_values = 6 * num_channels
    # Consecutive integers laid out row by row, `num_channels` per row.
    frames = np.arange(total_values).reshape(-1, num_channels)

    # Two windows of three rows each, the second starting two rows later.
    expected = [
        [[0, 1, 2, 3, 4],
         [5, 6, 7, 8, 9],
         [10, 11, 12, 13, 14]],
        [[10, 11, 12, 13, 14],
         [15, 16, 17, 18, 19],
         [20, 21, 22, 23, 24]],
    ]

    windows = sliding_window(frames, 3, hop=2)
    np.testing.assert_array_equal(windows, expected)
def test_sliding_window_noncontiguous_input(self):
    """Checks sliding_window on an input that is not contiguous in memory.

    This case matters because the code under test uses stride_tricks.
    """
    backing = np.arange(100)
    step = backing.strides[0]  # Bytes between consecutive elements.

    # Read-only 7x4 view into `backing` with unusual strides: moving down a
    # row advances 2 elements, moving across a column advances 9 elements.
    strided = np.lib.stride_tricks.as_strided(
        backing,
        shape=(7, 4),
        strides=(2 * step, 9 * step),
        writeable=False)

    rows = [
        [0, 9, 18, 27],
        [2, 11, 20, 29],
        [4, 13, 22, 31],
        [6, 15, 24, 33],
        [8, 17, 26, 35],
        [10, 19, 28, 37],
        [12, 21, 30, 39],
    ]
    # Sanity check that the strided view holds the intended values.
    np.testing.assert_array_equal(strided, rows)

    # Three-row windows starting at rows 0, 2 and 4.
    expected = [rows[0:3], rows[2:5], rows[4:7]]
    windows = sliding_window(strided, 3, hop=2)
    np.testing.assert_array_equal(windows, expected)
def process_one_wav_file(wav_file: str) -> Dict[str, List[np.ndarray]]:
    """Processes one WAV file to create observed frames.

    Processes one TIMIT WAV file with the frontend, and uses the associated
    label file to group observed frames by phone. Segments shorter than
    FLAGS.min_phone_length_s or with labels in PHONES_TO_EXCLUDE_FROM_DATASET
    are skipped.

    Audio channels are averaged (if there are multiple channels) to reduce to
    mono before processing.

    The file is processed FLAGS.num_draws times. Draw 0 only resamples the
    audio to AUDIO_SAMPLE_RATE_HZ. Every later draw is randomly augmented:
    the resampling factor is perturbed, a random number of zeros (fewer than
    FLAGS.block_size) is prepended, Gaussian noise is added, and the samples
    are divided by a random gain. Results depend on the state of NumPy's
    global random generator.

    Args:
      wav_file: String, WAV file path.

    Returns:
      Examples dict with values of shape (num_examples, num_frames,
      num_channels). `examples[phone][i]` is the input for the ith example
      with label `phone`.
    """
    samples_orig, sample_rate_hz = wav_io.read_wav_file(wav_file,
                                                        dtype=np.float32)
    # Average over axis 1 to reduce to mono.
    # NOTE(review): assumes read_wav_file returns a 2D array with channels on
    # axis 1, even for mono files -- TODO confirm.
    samples_orig = samples_orig.mean(axis=1)

    # Phone segments as (start, end, phone) triples, read from the label file
    # associated with `wav_file`.
    phone_times = phone_util.get_phone_times(
        phone_util.get_phone_label_filename(wav_file))
    frontend = carl_frontend.CarlFrontend(**get_frontend_params_from_flags())
    examples = collections.defaultdict(list)
    # Number of zero samples prepended in the current draw; stays 0 for draw 0.
    translation = 0

    for draw_index in range(FLAGS.num_draws):
        # Work on a copy so every draw starts from the unmodified audio.
        samples = np.copy(samples_orig)

        # Resample from sample_rate_hz to AUDIO_SAMPLE_RATE_HZ, perturbed up to
        # +/-max_resample_percent to change pitch and compress/dilate time.
        # TODO(getreuer): For more data augmentation, consider changing pitch
        # and time stretching independently.
        dilation_factor = AUDIO_SAMPLE_RATE_HZ / sample_rate_hz
        if draw_index > 0:
            # Perturb uniformly in the log domain, so a stretch by some factor
            # and a compression by the same factor are equally likely.
            max_log_dilation = np.log(1.0 + FLAGS.max_resample_percent / 100.0)
            dilation_factor *= np.exp(
                np.random.uniform(-max_log_dilation, max_log_dilation))

        # Skip resampling when the factor is within 1e-4 of unity.
        if abs(dilation_factor - 1.0) >= 1e-4:
            resampler = rational_factor_resampler.Resampler(
                1.0, dilation_factor, max_denominator=2000)
            samples = resampler.process_samples(samples)

        if draw_index > 0:
            # Prepend a random fraction of a block of silence. This randomizes
            # the input phase with respect to the frontend's decimation by
            # block_size.
            translation = np.random.randint(FLAGS.block_size)
            samples = np.append(np.zeros(translation), samples)
            # Add white Gaussian noise.
            samples = np.random.normal(samples,
                                       FLAGS.noise_stddev).astype(np.float32)
            # Scale the samples to simulate the recording at a different
            # distance. The divisor is drawn log-uniformly between
            # FLAGS.min_simulated_distance and FLAGS.max_simulated_distance.
            samples /= np.exp(
                np.random.uniform(np.log(FLAGS.min_simulated_distance),
                                  np.log(FLAGS.max_simulated_distance)))

        observed = phone_util.run_frontend(frontend, samples)

        for start, end, phone in phone_times:
            # Map the label's sample indices into the resampled, shifted audio.
            start = int(round(dilation_factor * start)) + translation
            # NOTE(review): `end` is clamped to len(samples) before
            # `translation` is added, and len(samples) already includes the
            # prepended zeros, so `end` can exceed the sample count -- confirm
            # this is intended.
            end = min(int(round(dilation_factor * end)),
                      len(samples)) + translation
            # NOTE(review): `start` and `end` have been scaled by
            # dilation_factor, yet the divisor is the file's original
            # sample_rate_hz -- confirm this is the intended duration.
            phone_length_s = float(end - start) / sample_rate_hz

            # Skip short (quickly-spoken) phone segments. They are likely
            # influenced by preceding/following phones, making classification
            # less clear.
            if phone_length_s < FLAGS.min_phone_length_s:
                continue  # Skip short phone.

            # Replace the label by its coalesced form, if it has one.
            phone = COALESCE_SIMILAR_PHONES.get(phone, phone)

            if phone in PHONES_TO_EXCLUDE_FROM_DATASET:
                continue

            # There may be confusing transitions (or possible labeling
            # inaccuracy) near the segment endpoints, so trim a fraction from
            # each end.
            length = end - start
            start += int(round(length * FLAGS.phone_trim_left))
            end -= int(round(length * FLAGS.phone_trim_right))
            # Convert sample indices from audio sample rate to frame rate.
            start //= FLAGS.block_size
            end //= FLAGS.block_size

            left_context = FLAGS.num_frames_left_context
            # Extract a window every `hop` frames and append to examples. The
            # slice starts up to `left_context` frames before the segment so
            # the first window already has its left context.
            # NOTE(review): the hop divides by frontend.block_size while the
            # index conversion above uses FLAGS.block_size; presumably these
            # are equal -- verify.
            examples[phone].append(
                sliding_window(
                    observed[max(0, start - left_context):end],
                    window_size=left_context + 1,
                    hop=FLAGS.downsample_factor // frontend.block_size))

    return examples
def model_fun(x):
    """Runs `model` on sliding windows of `x` and returns its 'scores' output.

    `model` and `window_size` are read from the enclosing scope.
    """
    windows = sliding_window(x, window_size)
    outputs = model({'observed': windows})
    return outputs['scores']
def test_sliding_window_empty_output(self):
    """Checks that a window longer than the data produces zero windows."""
    data = np.arange(24).reshape(4, 3, 2)
    window_size = 5  # Longer than the 4 entries along the first axis.

    result_shape = sliding_window(data, window_size).shape

    # Zero windows, each nominally of shape (window_size, 3, 2).
    self.assertEqual(result_shape, (0, 5, 3, 2))