Ejemplo n.º 1
0
 def __init__(self):
     """Create the external source and four audio decoder operators.

     The base pipeline is initialised with fixed settings: batch size 8,
     3 threads, device 0, asynchronous and pipelined execution.
     """
     pipeline_options = dict(batch_size=8,
                             num_threads=3,
                             device_id=0,
                             exec_async=True,
                             exec_pipelined=True)
     super(DecoderPipeline, self).__init__(**pipeline_options)

     self.file_source = ops.ExternalSource()

     # Three int16 decoders: default settings, sample_rate=rate1, and
     # downmix=True respectively.
     self.plain_decoder = ops.AudioDecoder(dtype=types.INT16)
     self.resampling_decoder = ops.AudioDecoder(
         sample_rate=rate1, dtype=types.INT16)
     self.downmixing_decoder = ops.AudioDecoder(
         downmix=True, dtype=types.INT16)

     # Float decoder combining sample_rate=rate2, downmix and quality=50.
     combined_options = dict(sample_rate=rate2,
                             downmix=True,
                             quality=50,
                             dtype=types.FLOAT)
     self.resampling_downmixing_decoder = ops.AudioDecoder(**combined_options)
Ejemplo n.º 2
0
    def __init__(self, batch_size, num_threads=1, exec_async=True, exec_pipelined=True):
        """Create the CPU file reader and audio decoder; leave ``nonsilence`` unset.

        The base pipeline always receives ``0`` as its third positional
        argument and ``seed=42``.

        Args:
            batch_size: first positional argument of the base constructor.
            num_threads: second positional argument of the base constructor.
            exec_async: forwarded as the ``exec_async`` keyword.
            exec_pipelined: forwarded as the ``exec_pipelined`` keyword.
        """
        execution_flags = {"exec_async": exec_async,
                           "exec_pipelined": exec_pipelined}
        super(NonsilencePipeline, self).__init__(
            batch_size, num_threads, 0, seed=42, **execution_flags)

        cpu = "cpu"
        self.input = ops.FileReader(file_root=audio_files, device=cpu)
        self.decode = ops.AudioDecoder(downmix=True, dtype=types.FLOAT, device=cpu)

        # Not created here; the attribute only exists with a None value.
        self.nonsilence = None
Ejemplo n.º 3
0
 def __init__(self, device, batch_size, nfft, window_length, window_step,
              num_threads=1, device_id=0):
     """Create the reader, decoder and spectrogram operators.

     Args:
         device: device string given to the spectrogram operator only.
         batch_size: forwarded to the base pipeline constructor.
         nfft: spectrogram ``nfft``.
         window_length: spectrogram ``window_length``.
         window_step: spectrogram ``window_step``.
         num_threads: forwarded to the base pipeline constructor.
         device_id: forwarded to the base pipeline constructor.
     """
     super(AudioSpectrogramPipeline, self).__init__(
         batch_size, num_threads, device_id)

     # Reader and decoder are always created with device="cpu".
     cpu = "cpu"
     self.input = ops.FileReader(file_root=audio_files, device=cpu)
     self.decode = ops.AudioDecoder(downmix=True, dtype=types.FLOAT, device=cpu)

     # The spectrogram is the only operator placed on the requested device.
     spectrogram_options = {
         "nfft": nfft,
         "window_length": window_length,
         "window_step": window_step,
         "power": 2,
     }
     self.fft = ops.Spectrogram(device=device, **spectrogram_options)
Ejemplo n.º 4
0
    def __init__(self, batch_size, nfft, window_length, window_step,
                 num_threads=1, device_id=0, spectrogram_func=spectrogram_func_librosa):
        """Create reader, decoder and a Python-function spectrogram operator.

        The base pipeline is initialised with ``seed=12345`` and with both
        ``exec_async`` and ``exec_pipelined`` disabled.

        Args:
            batch_size: forwarded to the base pipeline constructor.
            nfft: first argument bound to ``spectrogram_func``.
            window_length: second argument bound to ``spectrogram_func``.
            window_step: third argument bound to ``spectrogram_func``.
            num_threads: forwarded to the base pipeline constructor.
            device_id: forwarded to the base pipeline constructor.
            spectrogram_func: callable wrapped by the Python-function operator.
        """
        base_options = dict(seed=12345, exec_async=False, exec_pipelined=False)
        super(AudioSpectrogramPythonPipeline, self).__init__(
            batch_size, num_threads, device_id, **base_options)

        cpu = "cpu"
        self.input = ops.FileReader(file_root=audio_files, device=cpu)
        self.decode = ops.AudioDecoder(downmix=True, dtype=types.FLOAT, device=cpu)

        # Pre-bind the first four positional arguments; the fourth is None.
        bound_spectrogram = partial(
            spectrogram_func, nfft, window_length, window_step, None)
        self.spectrogram = ops.PythonFunction(function=bound_spectrogram)
Ejemplo n.º 5
0
    def __init__(self,
                 device_id,
                 n_devices,
                 file_root,
                 file_list,
                 batch_size,
                 sample_rate=16000,
                 window_size=.02,
                 window_stride=.01,
                 nfeatures=64,
                 nfft=512,
                 frame_splicing_factor=3,
                 silence_threshold=-80,
                 dither=.00001,
                 preemph_coeff=.97,
                 lowfreq=0.0,
                 highfreq=0.0,
                 num_threads=1):
        """Construct the operators of the audio feature pipeline.

        The base pipeline is initialised with ``seed=42``. ``dither`` and
        ``frame_splicing_factor`` are stored on the instance; all other
        arguments only parameterise the operators created here.

        Args:
            device_id: forwarded to the base pipeline; also the reader ``shard_id``.
            n_devices: reader ``num_shards``.
            file_root: reader ``file_root``.
            file_list: reader ``file_list``.
            batch_size: forwarded to the base pipeline.
            sample_rate: mel filter bank ``sample_rate``; also multiplies the
                window arguments below.
            window_size: times ``sample_rate`` gives the spectrogram ``window_length``.
            window_stride: times ``sample_rate`` gives the spectrogram ``window_step``.
            nfeatures: mel filter bank ``nfilter``.
            nfft: spectrogram ``nfft``.
            frame_splicing_factor: used by the splicing reshape and pad operators.
            silence_threshold: ``cutoff_db`` of the non-silent-region operator.
            dither: stored as ``self.dither``.
            preemph_coeff: ``preemph_coeff`` of the pre-emphasis filter.
            lowfreq: mel filter bank ``freq_low``.
            highfreq: mel filter bank ``freq_high``.
            num_threads: forwarded to the base pipeline.
        """
        super().__init__(batch_size, num_threads, device_id, seed=42)

        cpu = "cpu"
        self.dither, self.frame_splicing_factor = dither, frame_splicing_factor

        # --- input: sharded file reader and float, downmixing decoder ---
        self.read = ops.readers.File(device=cpu,
                                     file_root=file_root,
                                     file_list=file_list,
                                     num_shards=n_devices,
                                     shard_id=device_id)
        self.decode = ops.AudioDecoder(downmix=True, dtype=types.FLOAT, device=cpu)

        # --- feature extraction operators ---
        self.normal_distribution = ops.random.Normal(device=cpu)
        self.preemph = ops.PreemphasisFilter(preemph_coeff=preemph_coeff)

        # Window arguments are plain products and are passed on without rounding.
        window_length = window_size * sample_rate
        window_step = window_stride * sample_rate
        self.spectrogram = ops.Spectrogram(nfft=nfft,
                                           window_length=window_length,
                                           window_step=window_step,
                                           device=cpu)

        self.mel_fbank = ops.MelFilterBank(sample_rate=sample_rate,
                                           nfilter=nfeatures,
                                           freq_low=lowfreq,
                                           freq_high=highfreq,
                                           normalize=True,
                                           device=cpu)

        # The decibel cutoff is the literal -80, independent of silence_threshold.
        self.log_features = ops.ToDecibels(multiplier=np.log(10),
                                           reference=1.0,
                                           cutoff_db=-80,
                                           device=cpu)

        self.get_shape = ops.Shapes(device=cpu)
        self.normalize = ops.Normalize(device=cpu, axes=[0])

        # --- frame splicing operators ---
        self.splicing_transpose = ops.Transpose(perm=[1, 0], device=cpu)
        self.splicing_reshape = ops.Reshape(rel_shape=[-1, frame_splicing_factor],
                                            device=cpu)
        self.splicing_pad = ops.Pad(device=cpu,
                                    axes=[0],
                                    shape=[1],
                                    fill_value=0,
                                    align=frame_splicing_factor)

        # --- silence trimming operators ---
        self.get_nonsilent_region = ops.NonsilentRegion(cutoff_db=silence_threshold,
                                                        device=cpu)
        self.trim_silence = ops.Slice(axes=[0], device=cpu)
        self.to_float = ops.Cast(dtype=types.FLOAT)
Ejemplo n.º 6
0
    def __init__(self,
                 batch_size,
                 num_threads,
                 device_id,
                 data_dir,
                 dali_cpu=True):
        """Create the sharded file reader and audio decoder for training.

        Reads the module-level ``args`` namespace for sharding, caching and
        (when ``args.dist_mint`` is set) multi-node reader options.

        Args:
            batch_size: forwarded to the base pipeline constructor.
            num_threads: forwarded to the base pipeline constructor.
            device_id: forwarded to the base pipeline; also offsets the seed
                (``seed=12 + device_id``).
            data_dir: ``file_root`` of the file reader.
            dali_cpu: only selects the variant name printed at the end; the
                decoder is created with ``device="cpu"`` regardless.
        """
        super(AudioTrainPipe, self).__init__(batch_size,
                                             num_threads,
                                             device_id,
                                             seed=12 + device_id)
        shard = int(args.node_rank * args.world_size / args.nnodes +
                    args.local_rank)

        # Options shared by both reader configurations.
        reader_kwargs = dict(file_root=data_dir,
                             shard_id=shard,
                             num_shards=args.world_size,
                             shuffle_after_epoch=True,
                             cache_size=args.cache_size)
        if args.dist_mint:
            print(
                " DIST MINT : Shard id : {}, num_shards:{}, node_id :{}, num_nodes :{}"
                .format(shard, args.world_size, args.node_rank, args.nnodes))
            # Extra multi-node reader options, only passed in DIST MINT mode.
            reader_kwargs.update(num_nodes=args.nnodes,
                                 node_id=args.node_rank,
                                 node_port_list=args.node_port_list,
                                 node_ip_list=args.node_ip_list)
        self.input = ops.FileReader(**reader_kwargs)

        # NOTE(review): downsample_size is presumably provided by the DALI
        # build this project targets -- confirm against that build's operator docs.
        self.decode = ops.AudioDecoder(device="cpu",
                                       dtype=types.FLOAT,
                                       downmix=True,
                                       sample_rate=8192,
                                       downsample_size=160000)

        dali_device = 'cpu' if dali_cpu else 'gpu'
        print('DALI "{0}" variant'.format(dali_device))
Ejemplo n.º 7
0
 def __init__(self,
              batch_size,
              num_threads,
              device_id,
              data_dir,
              dali_cpu=True):
     """Create the sharded, non-shuffling file reader and audio decoder.

     Reads the module-level ``args`` namespace to compute the shard index
     and the number of shards.

     Args:
         batch_size: forwarded to the base pipeline constructor.
         num_threads: forwarded to the base pipeline constructor.
         device_id: forwarded to the base pipeline; also offsets the seed
             (``seed=12 + device_id``).
         data_dir: ``file_root`` of the file reader.
         dali_cpu: accepted for interface compatibility; it has no effect
             here because the decoder is always created with ``device="cpu"``.
     """
     super(AudioValPipe, self).__init__(batch_size,
                                        num_threads,
                                        device_id,
                                        seed=12 + device_id)
     shard = int(args.node_rank * args.world_size / args.nnodes +
                 args.local_rank)
     self.input = ops.FileReader(file_root=data_dir,
                                 shard_id=shard,
                                 num_shards=args.world_size,
                                 random_shuffle=False)
     # NOTE(review): downsample_size is presumably provided by the DALI
     # build this project targets -- confirm against that build's operator docs.
     self.decode = ops.AudioDecoder(device="cpu",
                                    dtype=types.FLOAT,
                                    downmix=True,
                                    sample_rate=8192,
                                    downsample_size=160000)
Ejemplo n.º 8
0
    def __init__(
            self,
            *,
            train_pipeline: bool,  # True if train pipeline, False if validation pipeline
            device_id,
            num_threads,
            batch_size,
            file_root: str,
            file_list: str,
            sample_rate,
            discrete_resample_range: bool,
            resample_range: list,
            window_size,
            window_stride,
            nfeatures,
            nfft,
            frame_splicing_factor,
            dither_coeff,
            silence_threshold,
            preemph_coeff,
            pad_align,
            max_duration,
            mask_time_num_regions,
            mask_time_min,
            mask_time_max,
            mask_freq_num_regions,
            mask_freq_min,
            mask_freq_max,
            mask_both_num_regions,
            mask_both_min_time,
            mask_both_max_time,
            mask_both_min_freq,
            mask_both_max_freq,
            preprocessing_device="gpu"):
        """Construct all operators of the audio preprocessing pipeline.

        Reading, decoding and silence trimming operators are created on the
        CPU; the feature operators are created on ``preprocessing_device``.
        The device string is lower-cased before validation, and the
        normalized value is what the operators receive.

        Args:
            train_pipeline: stored as ``self.train``; also enables
                ``shuffle_after_epoch`` on the reader.
            device_id, num_threads, batch_size: forwarded to the base pipeline.
            file_root, file_list: forwarded to the file reader.
            sample_rate: mel filter bank sample rate and window scale; also
                the decoder sample rate when ``resample_range`` is None.
            discrete_resample_range: with a non-None ``resample_range``,
                selects the external-source coefficient generator instead of
                the uniform random operator.
            resample_range: range for the uniform operator, or None to
                create no speed-perturbation operator.
            window_size, window_stride: multiplied by ``sample_rate`` to give
                the spectrogram ``window_length`` / ``window_step``.
            nfeatures: mel filter bank ``nfilter``.
            nfft: spectrogram ``nfft``.
            frame_splicing_factor: must equal 1.
            dither_coeff: stored as ``self.dither_coeff``.
            silence_threshold: ``cutoff_db`` of the non-silent-region
                operator; ``self.do_remove_silence`` is True when not None.
            preemph_coeff: pre-emphasis filter coefficient.
            pad_align: ``align`` of the pad operator.
            max_duration: stored as ``self.max_duration``.
            mask_*: collected into the ``self.mask_params`` dictionary.
            preprocessing_device: ``"cpu"`` or ``"gpu"``, case-insensitive.

        Raises:
            AssertionError: if the device is neither ``"cpu"`` nor ``"gpu"``
                or if ``frame_splicing_factor`` is not 1.
        """
        super().__init__(batch_size, num_threads, device_id)

        # locals() is captured here, before any further local name is bound.
        self._dali_init_log(locals())

        if torch.distributed.is_initialized():
            shard_id = torch.distributed.get_rank()
            n_shards = torch.distributed.get_world_size()
        else:
            shard_id = 0
            n_shards = 1

        self.preprocessing_device = preprocessing_device.lower()
        assert self.preprocessing_device == "cpu" or self.preprocessing_device == "gpu", \
            "Incorrect preprocessing device. Please choose either 'cpu' or 'gpu'"
        # Fix: operators below receive the validated, lower-cased device
        # instead of the raw argument, so e.g. "GPU" is handled consistently.
        device = self.preprocessing_device

        self.frame_splicing_factor = frame_splicing_factor
        assert frame_splicing_factor == 1, "DALI doesn't support frame splicing operation"

        self.resample_range = resample_range
        self.discrete_resample_range = discrete_resample_range

        self.train = train_pipeline
        self.sample_rate = sample_rate
        self.dither_coeff = dither_coeff
        self.nfeatures = nfeatures
        self.max_duration = max_duration
        self.mask_params = {
            'time_num_regions': mask_time_num_regions,
            'time_min': mask_time_min,
            'time_max': mask_time_max,
            'freq_num_regions': mask_freq_num_regions,
            'freq_min': mask_freq_min,
            'freq_max': mask_freq_max,
            'both_num_regions': mask_both_num_regions,
            'both_min_time': mask_both_min_time,
            'both_max_time': mask_both_max_time,
            'both_min_freq': mask_both_min_freq,
            'both_max_freq': mask_both_max_freq,
        }
        self.do_remove_silence = silence_threshold is not None

        self.read = ops.FileReader(device="cpu",
                                   file_root=file_root,
                                   file_list=file_list,
                                   shard_id=shard_id,
                                   num_shards=n_shards,
                                   shuffle_after_epoch=train_pipeline)

        # TODO change ExternalSource to Uniform for new DALI release
        if discrete_resample_range and resample_range is not None:
            self.speed_perturbation_coeffs = ops.ExternalSource(
                device="cpu",
                cycle=True,
                source=self._discrete_resample_coeffs_generator)
        elif resample_range is not None:
            self.speed_perturbation_coeffs = random.Uniform(
                device="cpu", range=resample_range)
        else:
            self.speed_perturbation_coeffs = None

        # The decoder gets a sample rate only when no resample range is given.
        self.decode = ops.AudioDecoder(
            device="cpu",
            sample_rate=self.sample_rate if resample_range is None else None,
            dtype=types.FLOAT,
            downmix=True)

        self.normal_distribution = random.Normal(device=device)

        self.preemph = ops.PreemphasisFilter(device=device,
                                             preemph_coeff=preemph_coeff)

        self.spectrogram = ops.Spectrogram(
            device=device,
            nfft=nfft,
            window_length=window_size * sample_rate,
            window_step=window_stride * sample_rate)

        self.mel_fbank = ops.MelFilterBank(device=device,
                                           sample_rate=sample_rate,
                                           nfilter=self.nfeatures,
                                           normalize=True)

        self.log_features = ops.ToDecibels(device=device,
                                           multiplier=np.log(10),
                                           reference=1.0,
                                           cutoff_db=math.log(1e-20))

        self.get_shape = ops.Shapes(device=device)

        self.normalize = ops.Normalize(device=device, axes=[1])

        self.pad = ops.Pad(device=device,
                           axes=[1],
                           fill_value=0,
                           align=pad_align)

        # Silence trimming
        self.get_nonsilent_region = ops.NonsilentRegion(
            device="cpu", cutoff_db=silence_threshold)
        self.trim_silence = ops.Slice(device="cpu",
                                      normalized_anchor=False,
                                      normalized_shape=False,
                                      axes=[0])
        self.to_float = ops.Cast(device="cpu", dtype=types.FLOAT)

        # Spectrogram masking
        self.spectrogram_cutouts = ops.ExternalSource(
            source=self._cutouts_generator, num_outputs=2, cycle=True)
        self.mask_spectrogram = ops.Erase(device=device,
                                          axes=[0, 1],
                                          fill_value=0,
                                          normalized_anchor=True)
Ejemplo n.º 9
0
    def __init__(self,
                 *,
                 pipeline_type,
                 device_id,
                 num_threads,
                 batch_size,
                 file_root: str,
                 sampler,
                 sample_rate,
                 resample_range: list,
                 window_size,
                 window_stride,
                 nfeatures,
                 nfft,
                 dither_coeff,
                 silence_threshold,
                 preemph_coeff,
                 max_duration,
                 preprocessing_device="gpu"):
        """Construct all operators of the sampler-driven audio pipeline.

        Reading, decoding and silence trimming operators are created on the
        CPU; the feature operators are created on ``preprocessing_device``.
        The device string is lower-cased before validation, and the
        normalized value is what the operators receive.

        Args:
            pipeline_type: ``'train'`` sets ``self.train``; ``'val'`` enables
                ``pad_last_batch`` on the reader.
            device_id, num_threads, batch_size: forwarded to the base pipeline.
            file_root: forwarded to the file reader.
            sampler: supplies the reader file list through
                ``get_file_list_path()``; the reader shuffles after each
                epoch only for a train pipeline whose sampler reports
                ``is_sampler_random()`` as false.
            sample_rate: mel filter bank sample rate and window scale; also
                the decoder sample rate when ``resample_range`` is None.
            resample_range: range for the uniform operator, or None to
                create no speed-perturbation operator.
            window_size, window_stride: multiplied by ``sample_rate`` to give
                the spectrogram ``window_length`` / ``window_step``.
            nfeatures: mel filter bank ``nfilter``.
            nfft: spectrogram ``nfft``.
            dither_coeff: stored as ``self.dither_coeff``.
            silence_threshold: ``cutoff_db`` of the non-silent-region
                operator; ``self.do_remove_silence`` is True when not None.
            preemph_coeff: pre-emphasis filter coefficient.
            max_duration: stored as ``self.max_duration``.
            preprocessing_device: ``"cpu"`` or ``"gpu"``, case-insensitive.

        Raises:
            AssertionError: if the device is neither ``"cpu"`` nor ``"gpu"``.
        """
        super().__init__(batch_size, num_threads, device_id)

        # locals() is captured here, before any further local name is bound.
        self._dali_init_log(locals())

        if torch.distributed.is_initialized():
            shard_id = torch.distributed.get_rank()
            n_shards = torch.distributed.get_world_size()
        else:
            shard_id = 0
            n_shards = 1

        self.preprocessing_device = preprocessing_device.lower()
        assert self.preprocessing_device == "cpu" or self.preprocessing_device == "gpu", \
            "Incorrect preprocessing device. Please choose either 'cpu' or 'gpu'"
        # Fix: operators below receive the validated, lower-cased device
        # instead of the raw argument, so e.g. "GPU" is handled consistently.
        device = self.preprocessing_device

        self.resample_range = resample_range

        train_pipeline = pipeline_type == 'train'
        self.train = train_pipeline
        self.sample_rate = sample_rate
        self.dither_coeff = dither_coeff
        self.nfeatures = nfeatures
        self.max_duration = max_duration
        self.do_remove_silence = silence_threshold is not None

        shuffle = train_pipeline and not sampler.is_sampler_random()
        self.read = ops.FileReader(name="Reader",
                                   pad_last_batch=(pipeline_type == 'val'),
                                   device="cpu",
                                   file_root=file_root,
                                   file_list=sampler.get_file_list_path(),
                                   shard_id=shard_id,
                                   num_shards=n_shards,
                                   shuffle_after_epoch=shuffle)

        # Speed perturbation coefficients are only created for a given range.
        if resample_range is not None:
            self.speed_perturbation_coeffs = ops.Uniform(device="cpu",
                                                         range=resample_range)
        else:
            self.speed_perturbation_coeffs = None

        # The decoder gets a sample rate only when no resample range is given.
        self.decode = ops.AudioDecoder(
            device="cpu",
            sample_rate=self.sample_rate if resample_range is None else None,
            dtype=types.FLOAT,
            downmix=True)

        self.normal_distribution = ops.NormalDistribution(device=device)

        self.preemph = ops.PreemphasisFilter(device=device,
                                             preemph_coeff=preemph_coeff)

        self.spectrogram = ops.Spectrogram(
            device=device,
            nfft=nfft,
            window_length=window_size * sample_rate,
            window_step=window_stride * sample_rate)

        self.mel_fbank = ops.MelFilterBank(device=device,
                                           sample_rate=sample_rate,
                                           nfilter=self.nfeatures,
                                           normalize=True)

        self.log_features = ops.ToDecibels(device=device,
                                           multiplier=np.log(10),
                                           reference=1.0,
                                           cutoff_db=math.log(1e-20))

        self.get_shape = ops.Shapes(device=device)

        self.normalize = ops.Normalize(device=device, axes=[1])

        self.pad = ops.Pad(device=device, fill_value=0)

        # Silence trimming
        self.get_nonsilent_region = ops.NonsilentRegion(
            device="cpu", cutoff_db=silence_threshold)
        self.trim_silence = ops.Slice(device="cpu",
                                      normalized_anchor=False,
                                      normalized_shape=False,
                                      axes=[0])
        self.to_float = ops.Cast(device="cpu", dtype=types.FLOAT)