def modify_summary(self, summary):
    # compute precision, recall and fscore
    if 'predictions' in summary['buffers']:
        k = self.cnn.out_channels[-1]
        predictions = np.array(
            summary['buffers'].pop('predictions')).reshape((-1, k))
        targets = np.array(
            summary['buffers'].pop('targets')).reshape((-1, k))
        # sub-sample 100 candidate thresholds per class
        # (np.int is removed in recent NumPy versions; use the builtin int)
        candidate_thresholds = Collate()([
            np.array(th)[np.linspace(0, len(th) - 1, 100).astype(int)]
            for th in get_candidate_thresholds(targets, predictions)
        ])
        decisions = predictions > candidate_thresholds.T[:, None]
        f, p, r = fscore(targets, decisions, event_wise=True)
        # pick, for each class, the threshold maximizing the event-wise fscore
        best_idx = np.argmax(f, axis=0)
        best_f = f[best_idx, np.arange(k)]
        best_thresholds = candidate_thresholds[np.arange(k), best_idx]
        summary['scalars']['macro_fscore'] = best_f.mean()
        summary['scalars']['micro_fscore'] = fscore(
            targets, predictions > best_thresholds)[0]
    for key, scalar in summary['scalars'].items():
        summary['scalars'][key] = np.mean(scalar)
    for key, image in summary['images'].items():
        if image.dim() == 4 and image.shape[1] > 1:
            image = image[:, 0]
        if image.dim() == 3:
            image = image.unsqueeze(1)
        summary['images'][key] = make_grid(
            image.flip(2), normalize=True, scale_each=False, nrow=1)
    return summary
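# Minimal, self-contained sketch of the per-class threshold tuning performed
# in modify_summary above, assuming multi-hot targets and per-class scores.
# tune_thresholds is a hypothetical stand-in for the combination of
# get_candidate_thresholds and fscore used by the project.
import numpy as np

def tune_thresholds(targets, scores, num_thresholds=100):
    """Return, per class, the decision threshold maximizing the F1 score."""
    k = targets.shape[1]
    best = np.zeros(k)
    for c in range(k):
        candidates = np.linspace(
            scores[:, c].min(), scores[:, c].max(), num_thresholds)
        f1 = []
        for th in candidates:
            decisions = scores[:, c] > th
            tp = np.sum(decisions & (targets[:, c] > 0))
            precision = tp / max(decisions.sum(), 1)
            recall = tp / max((targets[:, c] > 0).sum(), 1)
            f1.append(2 * precision * recall / max(precision + recall, 1e-12))
        best[c] = candidates[int(np.argmax(f1))]
    return best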
def __call__(self, dataset, batched_input=False):
    if self.global_shuffle:
        dataset = dataset.shuffle(reshuffle=True)
    if self.prefetch_workers > 0:
        dataset = dataset.prefetch(
            self.prefetch_workers, 2 * self.prefetch_workers)
    if batched_input:
        dataset = dataset.unbatch()
    if (
            self.global_shuffle and not batched_input
            and self.local_shuffle_buffer_size > 0
    ):
        raise AssertionError(
            'Using local_shuffle_buffer_size > 0 has no effect when '
            'global_shuffle is True and batched_input is False, and is '
            'therefore inefficient.'
        )
    elif self.local_shuffle_buffer_size > 0:
        dataset = dataset.shuffle(
            reshuffle=True, buffer_size=self.local_shuffle_buffer_size)
    if self.batch_size is not None:
        dataset = dataset.batch_dynamic_bucket(
            bucket_cls=DynamicExtendedTimeSeriesBucket,
            batch_size=self.batch_size,
            max_padding_rate=self.max_padding_rate,
            len_key="seq_len",
            min_label_diversity=self.min_label_diversity_in_batch,
            label_key="weak_targets",
            min_dataset_examples=self.min_dataset_examples_in_batch,
            expiration=self.bucket_expiration,
            drop_incomplete=self.drop_incomplete,
            sort_key="seq_len",
            reverse_sort=True,
        ).map(Collate()).prefetch(num_workers=1, buffer_size=4)
    return dataset
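# Minimal sketch (with ad-hoc names) of the padding-rate criterion behind
# the dynamic bucketing above: an example only joins a bucket if the
# zero-padding share of the resulting batch stays below max_padding_rate.
def fits_bucket(bucket_seq_lens, new_seq_len, max_padding_rate):
    lens = list(bucket_seq_lens) + [new_seq_len]
    padding_rate = 1 - sum(lens) / (len(lens) * max(lens))
    return padding_rate <= max_padding_rate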
def prepare_iterable(dataset, drop_incomplete=False):
    return dataset.map(event_encoder).map(finalize).prefetch(
        num_workers=num_workers, buffer_size=prefetch_buffer,
        catch_filter_exception=True,
    ).batch_dynamic_bucket(
        bucket_cls=bucket_cls, batch_size=batch_size, len_key='seq_len',
        max_padding_rate=max_padding_rate, expiration=bucket_expiration,
        drop_incomplete=drop_incomplete, sort_key='seq_len',
        reverse_sort=True,
    ).map(Collate())
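# prepare_iterable closes over event_encoder, finalize, num_workers,
# prefetch_buffer, bucket_cls, batch_size, max_padding_rate and
# bucket_expiration from its enclosing scope. A hypothetical call; all
# values below are illustrative assumptions, not the original configuration.
num_workers, prefetch_buffer = 8, 32
batch_size, max_padding_rate, bucket_expiration = 16, 0.1, 16000
bucket_cls = DynamicTimeSeriesBucket  # e.g. a plain time-series bucket
train_iterable = prepare_iterable(train_set, drop_incomplete=True)
validation_iterable = prepare_iterable(validation_set)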
def prepare_dataset(dataset, audio_reader, stft, max_length_in_sec=1.,
                    batch_size=3, is_train_set=False, shuffle=False):

    def prepare_example(example):
        example['audio_path'] = example['audio_path']['observation']
        return example

    dataset = dataset.map(prepare_example)
    audio_reader = AudioReader(**audio_reader)
    dataset = dataset.map(audio_reader)
    anchor = 'random' if is_train_set else 'centered_cutout'
    if max_length_in_sec is None:
        dataset = dataset.map(lambda ex: [ex])
    else:
        segmenter = Segmenter(
            length=int(max_length_in_sec * audio_reader.target_sample_rate),
            include_keys=('audio_data',), mode='max', anchor=anchor)
        dataset = dataset.map(segmenter)
    stft = STFT(**stft)
    dataset = dataset.batch_map(stft)

    def finalize(example):
        return {
            'example_id': example['example_id'],
            'audio_data': example['audio_data'].astype(np.float32),
            'stft': example['stft'].astype(np.float32),
            'seq_len': example['stft'].shape[1],
        }

    dataset = dataset.batch_map(finalize)
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True)
    dataset = dataset.prefetch(
        num_workers=8, buffer_size=10 * batch_size).unbatch()
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True, buffer_size=10 * batch_size)
    return dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size, len_key='seq_len', max_padding_rate=0.05,
        expiration=1000 * batch_size, drop_incomplete=shuffle,
        sort_key='seq_len', reverse_sort=True
    ).map(Collate())
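# Hypothetical call of the prepare_dataset above; the reader/STFT settings
# are illustrative assumptions in the kwargs-dict format the function expects.
train_iterable = prepare_dataset(
    train_set,
    audio_reader={'source_sample_rate': 16000, 'target_sample_rate': 16000},
    stft={'shift': 160, 'window_length': 400, 'size': 512},
    max_length_in_sec=1.,
    batch_size=3,
    is_train_set=True,
    shuffle=True,
)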
def prepare_dataset(dataset, training=False):
    batch_size = 16
    speaker_encoder = LabelEncoder(
        label_key='speaker_id', storage_dir=storage_dir, to_array=True)
    speaker_encoder.initialize_labels(dataset=dataset, verbose=True)
    dataset = dataset.map(speaker_encoder)
    audio_reader = AudioReader(
        source_sample_rate=16000, target_sample_rate=16000)
    dataset = dataset.map(audio_reader)
    stft = STFT(shift=160, window_length=400, size=512, fading=None, pad=False)
    dataset = dataset.map(stft)
    mel_transform = MelTransform(
        sample_rate=16000, fft_length=512, n_mels=64, fmin=50)
    dataset = dataset.map(mel_transform)
    normalizer = Normalizer(
        key='mel_transform', center_axis=(1,), scale_axis=(1, 2),
        storage_dir=storage_dir)
    # estimate normalization moments on (up to) 10k shuffled examples
    normalizer.initialize_moments(
        dataset.shuffle()[:10000].prefetch(num_workers=8, buffer_size=16),
        verbose=True)
    dataset = dataset.map(normalizer)

    def finalize(example):
        return {
            'example_id': example['example_id'],
            'features': np.moveaxis(
                example['mel_transform'], 1, 2).astype(np.float32),
            'seq_len': example['mel_transform'].shape[-2],
            # np.int is removed in recent NumPy versions; use the builtin int
            'speaker_id': example['speaker_id'].astype(int),
        }

    dataset = dataset.map(finalize)
    if training:
        dataset = dataset.shuffle(reshuffle=True)
    return dataset.prefetch(
        num_workers=8, buffer_size=10 * batch_size
    ).batch_dynamic_time_series_bucket(
        batch_size=batch_size, len_key='seq_len', max_padding_rate=0.1,
        expiration=1000 * batch_size, drop_incomplete=training,
        sort_key='seq_len', reverse_sort=True
    ).map(Collate())
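# Minimal sketch of what a per-feature normalizer as above computes, assuming
# mel features of shape (channels, time, mel): the mean is taken over the
# time axis and the scale over time and mel bins. Names here are ad hoc
# stand-ins for the project's Normalizer.
import numpy as np

def estimate_moments(features, center_axis=(1,), scale_axis=(1, 2)):
    mean = features.mean(axis=center_axis, keepdims=True)
    scale = np.sqrt(((features - mean) ** 2).mean(axis=scale_axis, keepdims=True))
    return mean, scale

def normalize(features, mean, scale, eps=1e-6):
    return (features - mean) / (scale + eps)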
def prepare_dataset(dataset, training=False):
    dataset = dataset.filter(lambda ex: ex['audio_length'] > 1.3, lazy=False)
    batch_size = 24
    label_encoder = MultiHotLabelEncoder(
        label_key='events', storage_dir=storage_dir)
    label_encoder.initialize_labels(dataset, verbose=True)
    dataset = dataset.map(label_encoder)
    audio_reader = AudioReader(
        source_sample_rate=44100, target_sample_rate=44100)
    dataset = dataset.map(audio_reader)
    stft = STFT(
        shift=882, window_length=1764, size=2048, fading=None, pad=False)
    dataset = dataset.map(stft)
    mel_transform = MelTransform(
        sample_rate=44100, fft_length=2048, n_mels=128, fmin=50)
    dataset = dataset.map(mel_transform)
    # Mel normalization is currently disabled:
    # normalizer = Normalizer(
    #     key='mel_transform', center_axis=(1,), scale_axis=(1, 2),
    #     storage_dir=storage_dir)
    # normalizer.initialize_moments(
    #     dataset.shuffle()[:2000].prefetch(num_workers=8, buffer_size=16),
    #     verbose=True)
    # dataset = dataset.map(normalizer)

    def finalize(example):
        return {
            'example_id': example['example_id'],
            # average over channels and move time to the second-to-last axis
            'features': np.moveaxis(
                example['mel_transform'].mean(0, keepdims=True), 1, 2
            ).astype(np.float32),
            'seq_len': example['mel_transform'].shape[-2],
            'events': example['events'].astype(np.float32),
        }

    dataset = dataset.map(finalize)
    if training:
        dataset = dataset.shuffle(reshuffle=True)
    return dataset.prefetch(
        num_workers=8, buffer_size=10 * batch_size
    ).batch_dynamic_time_series_bucket(
        batch_size=batch_size, len_key='seq_len', max_padding_rate=0.1,
        expiration=1000 * batch_size, drop_incomplete=training,
        sort_key='seq_len', reverse_sort=True
    ).map(Collate())
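# Minimal sketch of the multi-hot label encoding assumed above: each example
# carries a list of event labels which is turned into a fixed-size 0/1
# vector. labels and encode are ad-hoc stand-ins for the project's
# MultiHotLabelEncoder.
import numpy as np

labels = ['dog_bark', 'gun_shot', 'siren']  # collected from the dataset
label_to_idx = {label: i for i, label in enumerate(labels)}

def encode(event_list):
    multi_hot = np.zeros(len(labels), dtype=np.float32)
    for event in event_list:
        multi_hot[label_to_idx[event]] = 1.
    return multi_hot

assert encode(['dog_bark', 'siren']).tolist() == [1., 0., 1.]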
def prepare_dataset(
        dataset, audio_reader, stft, num_workers, batch_size,
        max_padding_rate, training=False,
):
    dataset = dataset.filter(
        lambda ex: 10.1 > ex['audio_length'] > 1.3, lazy=False)
    if training:
        dataset = dataset.shuffle(reshuffle=True)
    audio_reader = AudioReader(**audio_reader)
    stft = STFT(**stft)
    dataset = dataset.map(audio_reader).map(stft)

    def finalize(example):
        return [
            {
                'example_id': example['example_id'],
                'stft': features[None].astype(np.float32),
                'seq_len': features.shape[0],
                'events': example['events'].astype(np.float32),
            }
            for features in example['stft']
        ]

    dataset = dataset.map(finalize).prefetch(
        num_workers, 10 * batch_size, catch_filter_exception=True
    ).unbatch()
    if training:
        dataset = dataset.shuffle(
            reshuffle=True, buffer_size=min(100 * batch_size, 1000))
    return dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size, len_key="seq_len",
        max_padding_rate=max_padding_rate, expiration=1000 * batch_size,
        drop_incomplete=training, sort_key="seq_len", reverse_sort=True
    ).map(Collate())
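# The map->prefetch->unbatch chain above implements a one-to-many mapping:
# finalize returns a list of examples and unbatch flattens those lists back
# into a stream of single examples. A minimal sketch with ad-hoc data,
# assuming the lazy_dataset package used throughout these pipelines:
import lazy_dataset

ds = lazy_dataset.new({'a': {'values': [1, 2]}, 'b': {'values': [3]}})
ds = ds.map(lambda ex: [{'value': v} for v in ex['values']]).unbatch()
print(list(ds))  # [{'value': 1}, {'value': 2}, {'value': 3}]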
def prepare_dataset(dataset, training=False):
    batch_size = 8

    def chunker(example):
        """Sample a single random 4 s chunk (32k samples at 8 kHz) per stream.

        (An earlier version split each stream into all consecutive chunks
        and shuffled them; now one random chunk is drawn per example.)
        """
        chunk_length = 32000
        start = max(0, np.random.randint(example['num_samples']) - chunk_length)
        stop = start + chunk_length
        example.update(num_samples=chunk_length)
        example.update(audio_start_samples=start)
        example.update(audio_stop_samples=stop)
        example.update(audio_path=example['audio_path'])
        example.update(activity=example['activity'][start:stop])
        return example

    def select_speech(example):
        """Cut out a section with speech for evaluation.

        The model is evaluated on 30 s audio segments which contain speech.
        """
        first_speech = example['activity'].intervals[0][0]
        max_time_buffer = 8000 * 15  # 15 s
        time_buffer = np.random.randint(max_time_buffer)
        length = 8000 * 30  # 30 s
        start = max(0, first_speech - time_buffer)
        stop = start + length
        example['audio_start_samples'] = start
        example['audio_stop_samples'] = stop
        example['activity'] = example['activity'][start:stop]
        return example

    if training:
        dataset = dataset.shuffle(reshuffle=True)
    dataset = dataset.prefetch(num_workers=8, buffer_size=10 * batch_size)
    if training:
        dataset = dataset.map(chunker)
    else:
        dataset = dataset.map(select_speech)
    audio_reader = AudioReader(source_sample_rate=8000, target_sample_rate=8000)
    dataset = dataset.map(audio_reader)

    STFT_SHIFT = 80
    STFT_WINDOW_LENGTH = 400
    STFT_SIZE = 512
    STFT_PAD = True
    stft = STFT(
        shift=STFT_SHIFT, size=STFT_SIZE, window_length=STFT_WINDOW_LENGTH,
        pad=STFT_PAD, fading='half'  # was None
    )

    def segment(array):
        # frame-wise activity: a frame is active if any sample in a window
        # around its center is active
        # (alternatively: segment_axis(array, length=STFT_WINDOW_LENGTH,
        #  shift=STFT_SHIFT, end='pad' if STFT_PAD else 'cut').any(axis=-1))
        frames = int(array.shape[0] / STFT_SHIFT)
        output = np.zeros(frames)
        for i in range(frames):
            middle = i * STFT_SHIFT
            start = max(0, middle - STFT_WINDOW_LENGTH)
            stop = min(middle + STFT_WINDOW_LENGTH, array.shape[0] - 1)
            output[i] = array[start:stop].any()
        return output

    def calculate_stft(example):
        complex_spectrum = stft(example['audio_data'].flatten())
        spectrum_magnitude = np.abs(complex_spectrum) ** 2
        real_magnitude = spectrum_magnitude.astype(np.float32)
        real_magnitude = real_magnitude[None, None, ...]
        example['features'] = rearrange(
            real_magnitude, 'b c f t -> b c t f', b=1, c=1)[:, :, :-1, :]
        example['activity'] = segment(example['activity'])
        return example

    dataset = dataset.map(calculate_stft)

    def finalize(example):
        return {
            'example_id': example['example_id'],
            # torch.autograd.Variable is obsolete since PyTorch 0.4;
            # plain tensors suffice
            'features': torch.from_numpy(example['features']),
            'seq_len': example['features'].shape[-1],
            'activity': example['activity'][:].astype(np.float32),
        }

    dataset = dataset.map(finalize)
    dataset = dataset.batch(batch_size).map(Collate(to_tensor=True))

    def unpack_tensor(batch):
        batch['features'] = torch.from_numpy(np.vstack(batch['features']))
        return batch

    dataset = dataset.map(unpack_tensor)
    return dataset
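# A vectorized equivalent of the segment() helper above (a hedged sketch,
# not the project's segment_axis): frame-wise activity as an "any" pooling
# of sample-wise activity over a window around each frame center, computed
# with a cumulative sum instead of a Python loop.
import numpy as np

def segment_vectorized(activity, shift=80, window=400):
    frames = int(activity.shape[0] / shift)
    centers = np.arange(frames) * shift
    starts = np.maximum(0, centers - window)
    stops = np.minimum(centers + window, activity.shape[0] - 1)
    cumulative = np.concatenate([[0], np.cumsum(activity > 0)])
    # a frame is active iff at least one active sample lies in [start, stop)
    return (cumulative[stops] - cumulative[starts] > 0).astype(np.float64)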
def prepare_dataset(dataset, max_length=1., batch_size=3, training=False):
    dataset = dataset.filter(lambda ex: ex['num_samples'] > 16000, lazy=False)
    stft_shift = 160
    window_length = 480
    target_sample_rate = 16000

    def prepare_example(example):
        example['audio_path'] = example['audio_path']['observation']
        example['speaker_id'] = example['speaker_id'].split('-')[0]
        return example

    dataset = dataset.map(prepare_example)
    audio_reader = AudioReader(
        source_sample_rate=16000, target_sample_rate=target_sample_rate)
    dataset = dataset.map(audio_reader)
    stft = STFT(
        shift=stft_shift, window_length=window_length, size=512,
        fading='full', pad=True)
    dataset = dataset.map(stft)

    def fragment(example):
        audio, features = example['audio_data'], example['stft']
        pad_width = window_length - stft_shift
        assert pad_width > 0, pad_width
        audio = np.pad(
            audio,
            (audio.ndim - 1) * [(0, 0)] + [(pad_width, window_length - 1)],
            mode='constant')
        fragment_step = int(max_length * target_sample_rate)
        fragment_length = fragment_step + 2 * pad_width
        # integer division keeps the step an int
        # (fragment_step / stft_shift would yield a float)
        stft_fragment_step = fragment_step // stft_shift
        stft_fragment_length = stft.samples_to_frames(fragment_step)
        fragments = []
        for audio, features in zip(*fragment_signal(
                audio, features, axis=1,
                step=[fragment_step, stft_fragment_step],
                max_length=[fragment_length, stft_fragment_length],
                min_length=[fragment_length, stft_fragment_length],
                random_start=training)):
            fragments.append({
                'example_id': example['example_id'],
                'audio_data': audio[..., pad_width:-pad_width].astype(np.float32),
                'stft': features.astype(np.float32),
                'seq_len': features.shape[1],
            })
        return fragments

    dataset = dataset.map(fragment)
    if training:
        dataset = dataset.shuffle(reshuffle=True)
    return dataset.prefetch(
        num_workers=8, buffer_size=10 * batch_size
    ).unbatch().shuffle(
        reshuffle=True, buffer_size=10 * batch_size
    ).batch(batch_size=batch_size).map(Collate())
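# Minimal sketch of the parallel fragmentation idea above, assuming two
# views of one signal (samples and STFT frames) that must be cut at
# corresponding positions; fragment_pair is an ad-hoc, simplified stand-in
# for the project's fragment_signal (time axis 0, no random onset).
import numpy as np

def fragment_pair(audio, frames, sample_step, frame_step):
    n = min(audio.shape[-1] // sample_step, frames.shape[0] // frame_step)
    audio_frags = [
        audio[..., i * sample_step:(i + 1) * sample_step] for i in range(n)]
    frame_frags = [
        frames[i * frame_step:(i + 1) * frame_step] for i in range(n)]
    return audio_frags, frame_frags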
def prepare_dataset(dataset, audio_reader, stft, max_length=1., batch_size=3,
                    shuffle=False):

    def prepare_example(example):
        example['audio_path'] = example['audio_path']['observation']
        return example

    dataset = dataset.map(prepare_example)
    audio_reader = AudioReader(**audio_reader)
    dataset = dataset.map(audio_reader)
    stft = STFT(**stft)
    dataset = dataset.map(stft)

    def fragment(example):
        num_samples, audio, features = (
            example['num_samples'], example['audio_data'], example['stft'])
        audio_len = num_samples / audio_reader.target_sample_rate
        pad_width = stft.window_length - stft.shift
        assert pad_width > 0, pad_width
        audio = np.pad(
            audio,
            (audio.ndim - 1) * [(0, 0)] + [(pad_width, stft.window_length - 1)],
            mode='constant')
        n = 1 if max_length is None else int(np.ceil(audio_len / max_length))
        fragment_len = audio_len / n
        sample_fragment_step = int(audio_reader.target_sample_rate * fragment_len)
        stft_fragment_step = sample_fragment_step // stft.shift
        sample_fragment_step = stft_fragment_step * stft.shift
        stft_fragment_len = stft.samples_to_frames(sample_fragment_step)
        sample_fragment_len = sample_fragment_step + 2 * pad_width
        fragments = []
        for audio, features in zip(*fragment_signal(
                audio, features, axis=1,
                step=[sample_fragment_step, stft_fragment_step],
                fragment_length=[sample_fragment_len, stft_fragment_len],
                onset_mode='random' if shuffle else 'center')):
            fragments.append({
                'example_id': example['example_id'],
                'audio_data': audio[..., pad_width:-pad_width].astype(np.float32),
                'stft': features.astype(np.float32),
                'seq_len': features.shape[1],
            })
        return fragments

    dataset = dataset.map(fragment)
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True)
    dataset = dataset.prefetch(
        num_workers=8, buffer_size=10 * batch_size).unbatch()
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True, buffer_size=10 * batch_size)
    return dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size, len_key='seq_len', max_padding_rate=0.05,
        expiration=1000 * batch_size, drop_incomplete=shuffle,
        sort_key='seq_len', reverse_sort=True
    ).map(Collate())
def prepare_dataset(dataset, storage_dir, training=False):
    dataset = dataset.filter(lambda ex: ex['num_samples'] > 16000, lazy=False)
    batch_size = 3
    stft_shift = 160
    window_length = 480
    target_sample_rate = 16000

    def prepare_example(example):
        example['audio_path'] = example['audio_path']['observation']
        example['speaker_id'] = example['speaker_id'].split('-')[0]
        return example

    dataset = dataset.map(prepare_example)
    audio_reader = AudioReader(
        source_sample_rate=16000, target_sample_rate=target_sample_rate)
    dataset = dataset.map(audio_reader)
    stft = STFT(
        shift=stft_shift, window_length=window_length, size=512,
        fading='full', pad=True)
    dataset = dataset.map(stft)
    mel_transform = MelTransform(
        sample_rate=target_sample_rate, fft_length=512, n_mels=64, fmin=50)
    dataset = dataset.map(mel_transform)
    normalizer = Normalizer(
        key='mel_transform', center_axis=(1,), scale_axis=(1, 2),
        storage_dir=storage_dir)
    normalizer.initialize_moments(
        dataset.shuffle()[:10000].prefetch(num_workers=8, buffer_size=16),
        verbose=True)
    dataset = dataset.map(normalizer)

    def fragment(example):
        audio, features = example['audio_data'], example['mel_transform']
        pad_width = window_length - stft_shift
        assert pad_width > 0, pad_width
        audio = np.pad(
            audio,
            (audio.ndim - 1) * [(0, 0)] + [(pad_width, window_length - 1)],
            mode='constant')
        fragment_step = 16000  # 1 s at 16 kHz
        fragment_length = fragment_step + 2 * pad_width
        # integer division keeps the step an int
        # (fragment_step / stft_shift would yield a float)
        mel_fragment_step = fragment_step // stft_shift
        mel_fragment_length = stft.samples_to_frames(fragment_step)
        fragments = []
        for audio, features in zip(*fragment_parallel_signals(
                signals=[audio, features], axis=1,
                step=[fragment_step, mel_fragment_step],
                max_length=[fragment_length, mel_fragment_length],
                min_length=[fragment_length, mel_fragment_length],
                random_start=training)):
            fragments.append({
                'example_id': example['example_id'],
                'audio_data': audio[..., pad_width:-pad_width].squeeze(0).astype(np.float32),
                'features': np.moveaxis(features.squeeze(0), 0, 1).astype(np.float32),
            })
        return fragments

    dataset = dataset.map(fragment)
    if training:
        dataset = dataset.shuffle(reshuffle=True)
    return dataset.prefetch(
        num_workers=8, buffer_size=10 * batch_size
    ).unbatch().shuffle(
        reshuffle=True, buffer_size=10 * batch_size
    ).batch(batch_size=batch_size).map(Collate())
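# Worked check of the padding arithmetic used in the fragment() helpers
# above, with this file's settings (window_length=480, shift=160): each
# frame overlaps pad_width = 480 - 160 = 320 samples with its neighbors,
# so the waveform is padded by pad_width before fragmentation and the pad
# is cut off again afterwards, keeping audio and feature fragments aligned.
window_length, stft_shift, fragment_step = 480, 160, 16000
pad_width = window_length - stft_shift           # 320 samples
fragment_length = fragment_step + 2 * pad_width  # 16640 padded samples
mel_fragment_step = fragment_step // stft_shift  # 100 frames per second
assert (pad_width, fragment_length, mel_fragment_step) == (320, 16640, 100)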