Beispiel #1
0
    def modify_summary(self, summary):
        """Post-process a collected summary before it is reported.

        If ``summary['buffers']`` holds ``'predictions'``, predictions and
        targets are popped from the buffers and reshaped to ``(-1, k)`` with
        ``k = self.cnn.out_channels[-1]``. For each of the ``k`` columns, 100
        candidate thresholds are scored with an event-wise f-score and the
        best one is kept. ``'macro_fscore'`` is the mean of the per-column
        best f-scores; ``'micro_fscore'`` is the first value returned by
        ``fscore`` (default arguments) for the decisions at the selected
        thresholds. Both are written to ``summary['scalars']``.

        Afterwards every scalar entry is replaced by its ``np.mean`` and every
        image entry is replaced by the result of ``make_grid(..., nrow=1)``.

        Args:
            summary: dict with the keys ``'buffers'``, ``'scalars'`` and
                ``'images'``; it is modified in place.

        Returns:
            The same ``summary`` object.
        """
        # compute precision, recall and fscore
        if 'predictions' in summary['buffers']:
            k = self.cnn.out_channels[-1]
            predictions = np.array(
                summary['buffers'].pop('predictions')).reshape((-1, k))
            targets = np.array(
                summary['buffers'].pop('targets')).reshape((-1, k))
            # Pick 100 thresholds per column at evenly spaced indices.
            # The builtin ``int`` is used because the ``np.int`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            candidate_thresholds = Collate()([
                np.array(th)[
                    np.linspace(0, len(th) - 1, 100).astype(int)]
                for th in get_candidate_thresholds(targets, predictions)
            ])
            # Broadcast so that every threshold is applied to all predictions.
            decisions = predictions > candidate_thresholds.T[:, None]
            f, p, r = fscore(targets, decisions, event_wise=True)
            # f is indexed as [threshold, column]; choose the best threshold
            # for each column independently.
            best_idx = np.argmax(f, axis=0)
            best_f = f[best_idx, np.arange(k)]
            best_thresholds = candidate_thresholds[np.arange(k), best_idx]
            summary['scalars']['macro_fscore'] = best_f.mean()
            summary['scalars']['micro_fscore'] = fscore(
                targets, predictions > best_thresholds)[0]

        # Only existing keys are reassigned, so iterating while writing is
        # safe (the dict size never changes).
        for key, scalar in summary['scalars'].items():
            summary['scalars'][key] = np.mean(scalar)

        for key, image in summary['images'].items():
            # 4-dim images with more than one entry on axis 1: keep index 0.
            if image.dim() == 4 and image.shape[1] > 1:
                image = image[:, 0]
            # 3-dim images get a singleton axis inserted at position 1.
            if image.dim() == 3:
                image = image.unsqueeze(1)
            summary['images'][key] = make_grid(image.flip(2),
                                               normalize=True,
                                               scale_each=False,
                                               nrow=1)
        return summary
Beispiel #2
0
    def __call__(self, dataset, batched_input=False):
        """Apply shuffling, prefetching, unbatching and batching to ``dataset``.

        The steps run in this order, each only if its setting is enabled:
        global shuffle, prefetch, unbatch (only for ``batched_input``), local
        buffer shuffle, and finally dynamic bucket batching followed by
        ``Collate`` and a one-worker prefetch.

        Args:
            dataset: dataset object offering ``shuffle``, ``prefetch``,
                ``unbatch`` and ``batch_dynamic_bucket``.
            batched_input: if True, ``dataset.unbatch()`` is applied after the
                optional prefetch.

        Returns:
            The transformed dataset.

        Raises:
            AssertionError: if a local shuffle buffer is requested although
                the global shuffle is on and the input is not batched.
        """
        if self.global_shuffle:
            dataset = dataset.shuffle(reshuffle=True)

        n_workers = self.prefetch_workers
        if n_workers > 0:
            dataset = dataset.prefetch(n_workers, 2 * n_workers)

        if batched_input:
            dataset = dataset.unbatch()

        if self.local_shuffle_buffer_size > 0:
            # A local shuffle on top of a global one is only useful when the
            # stream was unbatched in between.
            if self.global_shuffle and not batched_input:
                raise AssertionError(
                    'using local_shuffle_buffer_size > 0 when global_shuffle is True and batched_input is False has no effect and is therefore inefficient'
                )
            dataset = dataset.shuffle(
                reshuffle=True, buffer_size=self.local_shuffle_buffer_size)

        if self.batch_size is None:
            return dataset

        bucketed = dataset.batch_dynamic_bucket(
            bucket_cls=DynamicExtendedTimeSeriesBucket,
            batch_size=self.batch_size,
            max_padding_rate=self.max_padding_rate,
            len_key="seq_len",
            min_label_diversity=self.min_label_diversity_in_batch,
            label_key="weak_targets",
            min_dataset_examples=self.min_dataset_examples_in_batch,
            expiration=self.bucket_expiration,
            drop_incomplete=self.drop_incomplete,
            sort_key="seq_len",
            reverse_sort=True,
        )
        collated = bucketed.map(Collate())
        return collated.prefetch(num_workers=1, buffer_size=4)
Beispiel #3
0
 def prepare_iterable(dataset, drop_incomplete=False):
     """Encode, finalize, prefetch, bucket and collate ``dataset``.

     ``event_encoder``, ``finalize``, ``num_workers``, ``prefetch_buffer``,
     ``bucket_cls``, ``batch_size``, ``max_padding_rate`` and
     ``bucket_expiration`` are taken from the enclosing scope.

     Args:
         dataset: dataset object to transform.
         drop_incomplete: forwarded to ``batch_dynamic_bucket``.

     Returns:
         The dataset of collated batches.
     """
     processed = dataset.map(event_encoder).map(finalize)
     prefetched = processed.prefetch(
         num_workers=num_workers,
         buffer_size=prefetch_buffer,
         catch_filter_exception=True,
     )
     bucketed = prefetched.batch_dynamic_bucket(
         bucket_cls=bucket_cls,
         batch_size=batch_size,
         len_key='seq_len',
         max_padding_rate=max_padding_rate,
         expiration=bucket_expiration,
         drop_incomplete=drop_incomplete,
         sort_key='seq_len',
         reverse_sort=True,
     )
     return bucketed.map(Collate())
Beispiel #4
0
def prepare_dataset(dataset,
                    audio_reader,
                    stft,
                    max_length_in_sec=1.,
                    batch_size=3,
                    is_train_set=False,
                    shuffle=False):
    """Build a pipeline yielding collated batches of audio and STFT features.

    Args:
        dataset: dataset whose examples carry
            ``example['audio_path']['observation']`` and ``'example_id'``.
        audio_reader: keyword arguments for ``AudioReader``.
        stft: keyword arguments for ``STFT``.
        max_length_in_sec: segment length in seconds; ``None`` disables
            segmentation (each example is wrapped in a one-element list).
        batch_size: batch size for bucketing; also scales the prefetch and
            shuffle buffers.
        is_train_set: selects the segmenter anchor, ``'random'`` if True and
            ``'centered_cutout'`` otherwise.
        shuffle: if True, shuffle before prefetching and again (buffered)
            after unbatching; also drops incomplete batches.

    Returns:
        The dataset of collated batches.
    """
    def select_observation(example):
        # Replace the path mapping by its 'observation' entry (in place).
        example['audio_path'] = example['audio_path']['observation']
        return example

    dataset = dataset.map(select_observation)

    reader = AudioReader(**audio_reader)
    dataset = dataset.map(reader)

    if max_length_in_sec is not None:
        segment_length = int(max_length_in_sec * reader.target_sample_rate)
        segmenter = Segmenter(
            length=segment_length,
            include_keys=('audio_data', ),
            mode='max',
            anchor='random' if is_train_set else 'centered_cutout',
        )
        dataset = dataset.map(segmenter)
    else:
        # Keep the "list of examples" structure expected by batch_map below.
        dataset = dataset.map(lambda ex: [ex])

    stft_transform = STFT(**stft)
    dataset = dataset.batch_map(stft_transform)

    def to_model_input(example):
        # Keep only the fields needed downstream, cast to float32.
        stft_features = example['stft']
        return {
            'example_id': example['example_id'],
            'audio_data': example['audio_data'].astype(np.float32),
            'stft': stft_features.astype(np.float32),
            'seq_len': stft_features.shape[1],
        }

    dataset = dataset.batch_map(to_model_input)

    buffer_size = 10 * batch_size
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True)
    dataset = dataset.prefetch(num_workers=8, buffer_size=buffer_size)
    dataset = dataset.unbatch()
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True, buffer_size=buffer_size)

    bucketed = dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size,
        len_key='seq_len',
        max_padding_rate=0.05,
        expiration=1000 * batch_size,
        drop_incomplete=shuffle,
        sort_key='seq_len',
        reverse_sort=True,
    )
    return bucketed.map(Collate())
Beispiel #5
0
def prepare_dataset(dataset, training=False):
    """Build a pipeline yielding collated batches of normalized mel features.

    The pipeline encodes ``'speaker_id'`` labels, reads audio at 16 kHz,
    computes an STFT and a 64-band mel transform, normalizes the mel features
    with moments estimated on up to 10000 shuffled examples, and batches with
    a dynamic time-series bucket (batch size 16).

    ``storage_dir`` is read from the enclosing/module scope.

    Args:
        dataset: dataset whose examples carry ``'speaker_id'``,
            ``'example_id'`` and whatever ``AudioReader`` needs.
        training: if True, the dataset is reshuffled on every iteration and
            incomplete batches are dropped.

    Returns:
        The dataset of collated batches.
    """
    batch_size = 16
    speaker_encoder = LabelEncoder(label_key='speaker_id',
                                   storage_dir=storage_dir,
                                   to_array=True)
    speaker_encoder.initialize_labels(dataset=dataset, verbose=True)
    dataset = dataset.map(speaker_encoder)
    audio_reader = AudioReader(source_sample_rate=16000,
                               target_sample_rate=16000)
    dataset = dataset.map(audio_reader)
    stft = STFT(shift=160, window_length=400, size=512, fading=None, pad=False)
    dataset = dataset.map(stft)
    mel_transform = MelTransform(sample_rate=16000,
                                 fft_length=512,
                                 n_mels=64,
                                 fmin=50)
    dataset = dataset.map(mel_transform)
    normalizer = Normalizer(key='mel_transform',
                            center_axis=(1, ),
                            scale_axis=(1, 2),
                            storage_dir=storage_dir)
    # Moments are estimated on a shuffled subset of at most 10000 examples.
    moment_subset = dataset.shuffle()[:10000].prefetch(num_workers=8,
                                                       buffer_size=16)
    normalizer.initialize_moments(moment_subset, verbose=True)
    dataset = dataset.map(normalizer)

    def finalize(example):
        # Swap axes 1 and 2 of the mel features for the model input;
        # 'seq_len' is read from axis -2 before the swap.
        return {
            'example_id': example['example_id'],
            'features': np.moveaxis(
                example['mel_transform'], 1, 2).astype(np.float32),
            'seq_len': example['mel_transform'].shape[-2],
            # Builtin ``int`` instead of ``np.int``: the alias was deprecated
            # in NumPy 1.20 and removed in NumPy 1.24.
            'speaker_id': example['speaker_id'].astype(int),
        }

    dataset = dataset.map(finalize)

    if training:
        dataset = dataset.shuffle(reshuffle=True)
    dataset = dataset.prefetch(num_workers=8, buffer_size=10 * batch_size)
    return dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size,
        len_key='seq_len',
        max_padding_rate=0.1,
        expiration=1000 * batch_size,
        drop_incomplete=training,
        sort_key='seq_len',
        reverse_sort=True).map(Collate())
Beispiel #6
0
def prepare_dataset(dataset, training=False):
    """Build a pipeline yielding collated batches of mel features and events.

    Examples with ``'audio_length'`` of at most 1.3 are filtered out eagerly.
    The remaining examples get multi-hot encoded ``'events'``, are read at
    44.1 kHz, transformed by an STFT and a 128-band mel transform, and are
    batched with a dynamic time-series bucket (batch size 24).

    ``storage_dir`` is read from the enclosing/module scope.

    Args:
        dataset: dataset whose examples carry ``'audio_length'``,
            ``'events'`` and ``'example_id'``.
        training: if True, the dataset is reshuffled on every iteration and
            incomplete batches are dropped.

    Returns:
        The dataset of collated batches.
    """
    batch_size = 24
    dataset = dataset.filter(lambda ex: ex['audio_length'] > 1.3, lazy=False)

    event_encoder = MultiHotLabelEncoder(label_key='events',
                                         storage_dir=storage_dir)
    event_encoder.initialize_labels(dataset, verbose=True)
    dataset = dataset.map(event_encoder)

    reader = AudioReader(source_sample_rate=44100, target_sample_rate=44100)
    dataset = dataset.map(reader)

    stft_transform = STFT(shift=882,
                          window_length=1764,
                          size=2048,
                          fading=None,
                          pad=False)
    dataset = dataset.map(stft_transform)

    mel = MelTransform(sample_rate=44100,
                       fft_length=2048,
                       n_mels=128,
                       fmin=50)
    dataset = dataset.map(mel)

    # Normalization is disabled in this pipeline; the inactive step was:
    # normalizer = Normalizer(
    #     key='mel_transform', center_axis=(1,), scale_axis=(1, 2),
    #     storage_dir=storage_dir
    # )
    # normalizer.initialize_moments(
    #     dataset.shuffle()[:2000].prefetch(num_workers=8, buffer_size=16),
    #     verbose=True
    # )
    # dataset = dataset.map(normalizer)

    def to_model_input(example):
        # Average over axis 0 (kept as a singleton), then swap axes 1 and 2.
        averaged = example['mel_transform'].mean(0, keepdims=True)
        return {
            'example_id': example['example_id'],
            'features': np.moveaxis(averaged, 1, 2).astype(np.float32),
            'seq_len': example['mel_transform'].shape[-2],
            'events': example['events'].astype(np.float32),
        }

    dataset = dataset.map(to_model_input)

    if training:
        dataset = dataset.shuffle(reshuffle=True)

    prefetched = dataset.prefetch(num_workers=8, buffer_size=10 * batch_size)
    bucketed = prefetched.batch_dynamic_time_series_bucket(
        batch_size=batch_size,
        len_key='seq_len',
        max_padding_rate=0.1,
        expiration=1000 * batch_size,
        drop_incomplete=training,
        sort_key='seq_len',
        reverse_sort=True,
    )
    return bucketed.map(Collate())
Beispiel #7
0
def prepare_dataset(
        dataset,
        audio_reader, stft,
        num_workers, batch_size, max_padding_rate,
        training=False,
):
    """Build a pipeline yielding collated batches of per-channel STFT items.

    Only examples with ``1.3 < audio_length < 10.1`` are kept (filtered
    eagerly). Every entry along the first axis of ``example['stft']`` becomes
    its own item, so one example may contribute several items to the stream.

    Args:
        dataset: dataset whose examples carry ``'audio_length'``,
            ``'events'`` and ``'example_id'``.
        audio_reader: keyword arguments for ``AudioReader``.
        stft: keyword arguments for ``STFT``.
        num_workers: number of prefetch workers.
        batch_size: batch size; also scales prefetch and shuffle buffers.
        max_padding_rate: forwarded to the dynamic time-series bucket.
        training: if True, shuffle before reading, shuffle the unbatched
            items with a bounded buffer, and drop incomplete batches.

    Returns:
        The dataset of collated batches.
    """
    dataset = dataset.filter(
        lambda ex: 1.3 < ex['audio_length'] < 10.1, lazy=False
    )

    if training:
        dataset = dataset.shuffle(reshuffle=True)

    reader = AudioReader(**audio_reader)
    stft_transform = STFT(**stft)
    dataset = dataset.map(reader)
    dataset = dataset.map(stft_transform)

    def split_items(example):
        # One output item per entry along the first axis of the STFT; a
        # leading singleton axis is added back to each item's features.
        items = []
        for features in example['stft']:
            items.append({
                'example_id': example['example_id'],
                'stft': features[None].astype(np.float32),
                'seq_len': features.shape[0],
                'events': example['events'].astype(np.float32),
            })
        return items

    dataset = dataset.map(split_items)
    dataset = dataset.prefetch(
        num_workers, 10 * batch_size, catch_filter_exception=True
    )
    dataset = dataset.unbatch()

    if training:
        shuffle_buffer = min(100 * batch_size, 1000)
        dataset = dataset.shuffle(reshuffle=True, buffer_size=shuffle_buffer)

    bucketed = dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size,
        len_key="seq_len",
        max_padding_rate=max_padding_rate,
        expiration=1000 * batch_size,
        drop_incomplete=training,
        sort_key="seq_len",
        reverse_sort=True,
    )
    return bucketed.map(Collate())
Beispiel #8
0
def prepare_dataset(dataset, training=False):
    """Build a pipeline yielding batches of power spectra and frame activity.

    For training, one random 32000-sample chunk is selected per example; for
    evaluation, a 240000-sample section around the first speech interval is
    selected. The selected audio is read at 8 kHz, transformed to a power
    spectrogram, and the sample-wise activity is reduced to one value per
    STFT shift. Examples are batched with a fixed batch size of 8.

    Args:
        dataset: dataset whose examples carry ``'num_samples'``,
            ``'audio_path'``, ``'activity'`` and ``'example_id'``.
        training: selects random chunking (True) or speech-anchored
            selection (False) and enables reshuffling.

    Returns:
        The dataset of collated batches.
    """
    batch_size = 8  # 24

    def chunker(example):
        """Select one random 4s chunk of the stream (in place)."""
        # 4s at 8kHz -> 32k samples
        chunk_length = 32000
        # An earlier variant (removed here) split the whole stream into
        # consecutive chunks; now a single random chunk is drawn per call.
        onset = max(0,
                    np.random.randint(example['num_samples']) - chunk_length)
        offset = onset + chunk_length
        example.update(
            num_samples=chunk_length,
            audio_start_samples=onset,
            audio_stop_samples=offset,
        )
        # NOTE(review): this writes the value back unchanged; looks like a
        # leftover from the earlier chunk-building variant - confirm.
        example.update(audio_path=example['audio_path'])
        example.update(activity=example['activity'][onset:offset])
        return example

    def select_speech(example):
        """Cut out a section with speech for evaluation.

        We evaluate the model on 30s audio segments which contain speech."""
        speech_onset = example['activity'].intervals[0][0]
        # Start a random amount (below 15s at 8kHz) before the first speech.
        lead_in = np.random.randint(8000 * 15)
        onset = max(0, speech_onset - lead_in)
        offset = onset + 8000 * 30  # 30s
        example['audio_start_samples'] = onset
        example['audio_stop_samples'] = offset
        example['activity'] = example['activity'][onset:offset]
        return example

    if training:
        dataset = dataset.shuffle(reshuffle=True)

    dataset = dataset.prefetch(num_workers=8, buffer_size=10 * batch_size)

    cutter = chunker if training else select_speech
    dataset = dataset.map(cutter)

    reader = AudioReader(source_sample_rate=8000, target_sample_rate=8000)
    dataset = dataset.map(reader)

    stft_shift = 80
    stft_window_length = 400
    stft_size = 512
    stft_pad = True

    stft = STFT(
        shift=stft_shift,
        size=stft_size,
        window_length=stft_window_length,
        pad=stft_pad,
        fading='half'  # was None
    )

    def segment(array):
        # One output value per STFT shift: 1.0 if any sample within one
        # window length around the frame position is active, else 0.0.
        frames = array.shape[0] // stft_shift
        last_index = array.shape[0] - 1
        output = np.zeros(frames)
        for frame in range(frames):
            middle = frame * stft_shift
            lower = max(0, middle - stft_window_length)
            # NOTE(review): the upper bound is shape[0] - 1, so the final
            # sample is never inspected - confirm this is intended.
            upper = min(middle + stft_window_length, last_index)
            output[frame] = array[lower:upper].any()
        return output

    def calculate_stft(example):
        spectrum = stft(example['audio_data'].flatten())
        power = (np.abs(spectrum)**2).astype(np.float32)
        # Add two leading singleton axes, swap the last two axes and drop the
        # last entry along axis 2.
        power = power[None, None, ...]
        swapped = rearrange(power, 'b c f t -> b c t f', b=1, c=1)
        example['features'] = swapped[:, :, :-1, :]
        example['activity'] = segment(example['activity'])
        return example

    dataset = dataset.map(calculate_stft)

    def finalize(example):
        features = example['features']
        return {
            'example_id': example['example_id'],
            'features': Variable(torch.from_numpy(features)),
            # NOTE(review): this reads the LAST axis of the rearranged
            # features - verify that this is the intended length axis.
            'seq_len': features.shape[-1],
            'activity': example['activity'][:].astype(np.float32)
        }

    dataset = dataset.map(finalize)

    dataset = dataset.batch(batch_size)
    dataset = dataset.map(Collate(to_tensor=True))

    def unpack_tensor(batch):
        # Stack the per-example features vertically into one tensor.
        stacked = np.vstack(batch['features'])
        batch['features'] = Variable(torch.from_numpy(stacked))
        return batch

    return dataset.map(unpack_tensor)
Beispiel #9
0
def prepare_dataset(dataset, max_length=1., batch_size=3, training=False):
    """Build a pipeline yielding fixed-size batches of audio/STFT fragments.

    Examples with at most 16000 samples are filtered out eagerly. Each
    remaining example is read at 16 kHz, transformed by an STFT and split
    into parallel audio and STFT fragments.

    Args:
        dataset: dataset whose examples carry ``'num_samples'``,
            ``example['audio_path']['observation']``, ``'speaker_id'`` and
            ``'example_id'``.
        max_length: fragment step in seconds (multiplied by the 16000 Hz
            target sample rate).
        batch_size: batch size; also scales prefetch and shuffle buffers.
        training: if True, fragments start at random offsets and the examples
            are reshuffled before prefetching.

    Returns:
        The dataset of collated batches.
    """
    stft_shift = 160
    window_length = 480
    target_sample_rate = 16000

    dataset = dataset.filter(lambda ex: ex['num_samples'] > 16000, lazy=False)

    def prepare_example(example):
        # Use the 'observation' path and the part of the speaker id in front
        # of the first '-'.
        example['audio_path'] = example['audio_path']['observation']
        example['speaker_id'] = example['speaker_id'].split('-')[0]
        return example

    dataset = dataset.map(prepare_example)

    reader = AudioReader(source_sample_rate=16000,
                         target_sample_rate=target_sample_rate)
    dataset = dataset.map(reader)

    stft = STFT(shift=stft_shift,
                window_length=window_length,
                size=512,
                fading='full',
                pad=True)
    dataset = dataset.map(stft)

    def fragment(example):
        pad_width = window_length - stft_shift
        assert pad_width > 0, pad_width
        # Zero-pad the last audio axis only.
        padding = [(0, 0)] * (example['audio_data'].ndim - 1)
        padding.append((pad_width, window_length - 1))
        padded_audio = np.pad(example['audio_data'], padding, mode='constant')

        fragment_step = int(max_length * target_sample_rate)
        fragment_length = fragment_step + 2 * pad_width
        # NOTE(review): true division yields a float step - confirm that
        # fragment_signal accepts non-integer steps.
        stft_fragment_step = fragment_step / stft_shift
        stft_fragment_length = stft.samples_to_frames(fragment_step)

        audio_fragments, stft_fragments = fragment_signal(
            padded_audio,
            example['stft'],
            axis=1,
            step=[fragment_step, stft_fragment_step],
            max_length=[fragment_length, stft_fragment_length],
            min_length=[fragment_length, stft_fragment_length],
            random_start=training)
        return [
            {
                'example_id': example['example_id'],
                # Strip the padding again on both sides.
                'audio_data':
                audio_piece[..., pad_width:-pad_width].astype(np.float32),
                'stft': stft_piece.astype(np.float32),
                'seq_len': stft_piece.shape[1],
            }
            for audio_piece, stft_piece in zip(audio_fragments,
                                               stft_fragments)
        ]

    dataset = dataset.map(fragment)

    if training:
        dataset = dataset.shuffle(reshuffle=True)

    buffer_size = 10 * batch_size
    dataset = dataset.prefetch(num_workers=8, buffer_size=buffer_size)
    dataset = dataset.unbatch()
    # NOTE(review): this buffered shuffle also runs when training is False -
    # confirm that evaluation order may be randomized.
    dataset = dataset.shuffle(reshuffle=True, buffer_size=buffer_size)
    dataset = dataset.batch(batch_size=batch_size)
    return dataset.map(Collate())
Beispiel #10
0
def prepare_dataset(dataset,
                    audio_reader,
                    stft,
                    max_length=1.,
                    batch_size=3,
                    shuffle=False):
    """Build a pipeline yielding bucketed batches of audio/STFT fragments.

    Each example is read, transformed by an STFT and split into ``n`` parallel
    audio and STFT fragments, where ``n`` is the audio length in seconds
    divided by ``max_length``, rounded up (``n = 1`` if ``max_length`` is
    ``None``).

    Args:
        dataset: dataset whose examples carry
            ``example['audio_path']['observation']``, ``'num_samples'`` and
            ``'example_id'``.
        audio_reader: keyword arguments for ``AudioReader``.
        stft: keyword arguments for ``STFT``.
        max_length: upper bound for the fragment length in seconds, or
            ``None`` for a single fragment per example.
        batch_size: batch size; also scales prefetch and shuffle buffers.
        shuffle: if True, use random fragment onsets, shuffle before and
            after unbatching, and drop incomplete batches.

    Returns:
        The dataset of collated batches.
    """
    def select_observation(example):
        example['audio_path'] = example['audio_path']['observation']
        return example

    dataset = dataset.map(select_observation)

    reader = AudioReader(**audio_reader)
    dataset = dataset.map(reader)

    stft_transform = STFT(**stft)
    dataset = dataset.map(stft_transform)

    def fragment(example):
        num_samples = example['num_samples']
        raw_audio = example['audio_data']
        stft_features = example['stft']
        sample_rate = reader.target_sample_rate
        audio_len = num_samples / sample_rate

        pad_width = stft_transform.window_length - stft_transform.shift
        assert pad_width > 0, pad_width
        # Zero-pad the last audio axis only.
        padding = [(0, 0)] * (raw_audio.ndim - 1)
        padding.append((pad_width, stft_transform.window_length - 1))
        padded_audio = np.pad(raw_audio, padding, mode='constant')

        if max_length is None:
            n = 1
        else:
            n = int(np.ceil(audio_len / max_length))
        fragment_len = audio_len / n

        # Round the sample step down to a whole number of STFT shifts.
        stft_fragment_step = (
            int(sample_rate * fragment_len) // stft_transform.shift
        )
        sample_fragment_step = stft_fragment_step * stft_transform.shift
        stft_fragment_len = stft_transform.samples_to_frames(
            sample_fragment_step)
        sample_fragment_len = sample_fragment_step + 2 * pad_width

        audio_fragments, stft_fragments = fragment_signal(
            padded_audio,
            stft_features,
            axis=1,
            step=[sample_fragment_step, stft_fragment_step],
            fragment_length=[sample_fragment_len, stft_fragment_len],
            onset_mode='random' if shuffle else 'center')
        return [
            {
                'example_id': example['example_id'],
                # Strip the padding again on both sides.
                'audio_data':
                audio_piece[..., pad_width:-pad_width].astype(np.float32),
                'stft': stft_piece.astype(np.float32),
                'seq_len': stft_piece.shape[1],
            }
            for audio_piece, stft_piece in zip(audio_fragments,
                                               stft_fragments)
        ]

    dataset = dataset.map(fragment)

    buffer_size = 10 * batch_size
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True)
    dataset = dataset.prefetch(num_workers=8, buffer_size=buffer_size)
    dataset = dataset.unbatch()
    if shuffle:
        dataset = dataset.shuffle(reshuffle=True, buffer_size=buffer_size)

    bucketed = dataset.batch_dynamic_time_series_bucket(
        batch_size=batch_size,
        len_key='seq_len',
        max_padding_rate=0.05,
        expiration=1000 * batch_size,
        drop_incomplete=shuffle,
        sort_key='seq_len',
        reverse_sort=True,
    )
    return bucketed.map(Collate())
Beispiel #11
0
def prepare_dataset(dataset, storage_dir, training=False):
    """Build a pipeline yielding batches of audio and normalized mel fragments.

    Examples with at most 16000 samples are filtered out eagerly. Each
    remaining example is read at 16 kHz, transformed by an STFT and a 64-band
    mel transform, normalized, and split into parallel audio and mel
    fragments with a step of 16000 samples. Batches have size 3.

    Args:
        dataset: dataset whose examples carry ``'num_samples'``,
            ``example['audio_path']['observation']``, ``'speaker_id'`` and
            ``'example_id'``.
        storage_dir: forwarded to ``Normalizer``.
        training: if True, fragments start at random offsets and the examples
            are reshuffled before prefetching.

    Returns:
        The dataset of collated batches.
    """
    batch_size = 3
    stft_shift = 160
    window_length = 480
    target_sample_rate = 16000

    dataset = dataset.filter(lambda ex: ex['num_samples'] > 16000, lazy=False)

    def prepare_example(example):
        # Use the 'observation' path and the part of the speaker id in front
        # of the first '-'.
        example['audio_path'] = example['audio_path']['observation']
        example['speaker_id'] = example['speaker_id'].split('-')[0]
        return example

    dataset = dataset.map(prepare_example)

    reader = AudioReader(source_sample_rate=16000,
                         target_sample_rate=target_sample_rate)
    dataset = dataset.map(reader)

    stft = STFT(shift=stft_shift,
                window_length=window_length,
                size=512,
                fading='full',
                pad=True)
    dataset = dataset.map(stft)

    mel = MelTransform(sample_rate=target_sample_rate,
                       fft_length=512,
                       n_mels=64,
                       fmin=50)
    dataset = dataset.map(mel)

    normalizer = Normalizer(key='mel_transform',
                            center_axis=(1,),
                            scale_axis=(1, 2),
                            storage_dir=storage_dir)
    # Moments are estimated on a shuffled subset of at most 10000 examples.
    moment_subset = dataset.shuffle()[:10000].prefetch(num_workers=8,
                                                       buffer_size=16)
    normalizer.initialize_moments(moment_subset, verbose=True)
    dataset = dataset.map(normalizer)

    def fragment(example):
        pad_width = window_length - stft_shift
        assert pad_width > 0, pad_width
        # Zero-pad the last audio axis only.
        padding = [(0, 0)] * (example['audio_data'].ndim - 1)
        padding.append((pad_width, window_length - 1))
        padded_audio = np.pad(example['audio_data'], padding, mode='constant')

        fragment_step = 16000
        fragment_length = fragment_step + 2 * pad_width
        # NOTE(review): true division yields a float step - confirm that
        # fragment_parallel_signals accepts non-integer steps.
        mel_fragment_step = fragment_step / stft_shift
        mel_fragment_length = stft.samples_to_frames(fragment_step)

        audio_fragments, mel_fragments = fragment_parallel_signals(
            signals=[padded_audio, example['mel_transform']],
            axis=1,
            step=[fragment_step, mel_fragment_step],
            max_length=[fragment_length, mel_fragment_length],
            min_length=[fragment_length, mel_fragment_length],
            random_start=training,
        )
        return [
            {
                'example_id': example['example_id'],
                # Strip the padding on both sides and drop axis 0.
                'audio_data': audio_piece[..., pad_width:-pad_width].squeeze(0).astype(np.float32),
                # Drop axis 0, then swap the two remaining axes.
                'features': np.moveaxis(mel_piece.squeeze(0), 0, 1).astype(np.float32)
            }
            for audio_piece, mel_piece in zip(audio_fragments, mel_fragments)
        ]

    dataset = dataset.map(fragment)

    if training:
        dataset = dataset.shuffle(reshuffle=True)

    buffer_size = 10 * batch_size
    dataset = dataset.prefetch(num_workers=8, buffer_size=buffer_size)
    dataset = dataset.unbatch()
    # NOTE(review): this buffered shuffle also runs when training is False -
    # confirm that evaluation order may be randomized.
    dataset = dataset.shuffle(reshuffle=True, buffer_size=buffer_size)
    dataset = dataset.batch(batch_size=batch_size)
    return dataset.map(Collate())