def __init__(self):
    """Build a fixed-configuration pipeline holding four audio decoders.

    The base pipeline is created with batch size 8, 3 threads, device 0
    and both asynchronous and pipelined execution switched on.  Input is
    supplied through an ``ExternalSource``.  The decoders differ only in
    the resampling / downmixing options passed to ``AudioDecoder``; the
    target rates ``rate1`` and ``rate2`` are names resolved from outside
    this method.
    """
    pipeline_options = {
        "batch_size": 8,
        "num_threads": 3,
        "device_id": 0,
        "exec_async": True,
        "exec_pipelined": True,
    }
    super(DecoderPipeline, self).__init__(**pipeline_options)

    self.file_source = ops.ExternalSource()

    # The first three decoders all produce INT16 output.
    int16 = types.INT16
    self.plain_decoder = ops.AudioDecoder(dtype=int16)
    self.resampling_decoder = ops.AudioDecoder(dtype=int16, sample_rate=rate1)
    self.downmixing_decoder = ops.AudioDecoder(dtype=int16, downmix=True)

    # The combined decoder produces FLOAT output and passes quality=50.
    self.resampling_downmixing_decoder = ops.AudioDecoder(
        dtype=types.FLOAT,
        sample_rate=rate2,
        downmix=True,
        quality=50,
    )
def __init__(self, batch_size, num_threads=1, exec_async=True, exec_pipelined=True):
    """Create the reader and decoder used for non-silent-region tests.

    Args:
        batch_size: Forwarded to the base pipeline.
        num_threads: Forwarded to the base pipeline.
        exec_async: Forwarded to the base pipeline.
        exec_pipelined: Forwarded to the base pipeline.

    The base pipeline is always created on device 0 with seed 42.
    ``self.nonsilence`` is only initialised to ``None`` here; no operator
    is assigned to it in this method.
    """
    execution_flags = {
        "seed": 42,
        "exec_async": exec_async,
        "exec_pipelined": exec_pipelined,
    }
    super(NonsilencePipeline, self).__init__(
        batch_size, num_threads, 0, **execution_flags)

    # Files are read from the directory named by ``audio_files``
    # (defined outside this method).
    self.input = ops.FileReader(file_root=audio_files, device="cpu")
    self.decode = ops.AudioDecoder(
        device="cpu",
        downmix=True,
        dtype=types.FLOAT,
    )
    self.nonsilence = None
def __init__(self, device, batch_size, nfft, window_length, window_step,
             num_threads=1, device_id=0):
    """Create a read -> decode -> spectrogram operator chain.

    Args:
        device: Device string handed to the ``Spectrogram`` operator.
        batch_size: Forwarded to the base pipeline.
        nfft: Forwarded to ``Spectrogram``.
        window_length: Forwarded to ``Spectrogram``.
        window_step: Forwarded to ``Spectrogram``.
        num_threads: Forwarded to the base pipeline.
        device_id: Forwarded to the base pipeline.

    The reader and decoder are always created with ``device="cpu"``;
    only the spectrogram operator uses ``device``.
    """
    super(AudioSpectrogramPipeline, self).__init__(
        batch_size, num_threads, device_id)

    self.input = ops.FileReader(file_root=audio_files, device="cpu")
    self.decode = ops.AudioDecoder(
        device="cpu",
        downmix=True,
        dtype=types.FLOAT,
    )

    spectrogram_options = {
        "nfft": nfft,
        "window_length": window_length,
        "window_step": window_step,
        "power": 2,
    }
    self.fft = ops.Spectrogram(device=device, **spectrogram_options)
def __init__(self, batch_size, nfft, window_length, window_step,
             num_threads=1, device_id=0,
             spectrogram_func=spectrogram_func_librosa):
    """Create a pipeline that computes the spectrogram via a Python callable.

    Args:
        batch_size: Forwarded to the base pipeline.
        nfft: First positional argument bound to ``spectrogram_func``.
        window_length: Second positional argument bound to ``spectrogram_func``.
        window_step: Third positional argument bound to ``spectrogram_func``.
        num_threads: Forwarded to the base pipeline.
        device_id: Forwarded to the base pipeline.
        spectrogram_func: Callable wrapped in a ``PythonFunction`` operator.

    The base pipeline is created with seed 12345 and with asynchronous
    and pipelined execution both disabled.
    """
    super(AudioSpectrogramPythonPipeline, self).__init__(
        batch_size,
        num_threads,
        device_id,
        seed=12345,
        exec_async=False,
        exec_pipelined=False,
    )

    self.input = ops.FileReader(file_root=audio_files, device="cpu")
    self.decode = ops.AudioDecoder(
        device="cpu",
        downmix=True,
        dtype=types.FLOAT,
    )

    # Bind the spectrogram settings plus a trailing ``None`` as the
    # leading positional arguments; the operator supplies the rest.
    bound_spectrogram = partial(
        spectrogram_func, nfft, window_length, window_step, None)
    self.spectrogram = ops.PythonFunction(function=bound_spectrogram)
def __init__(self, device_id, n_devices, file_root, file_list, batch_size,
             sample_rate=16000, window_size=.02, window_stride=.01,
             nfeatures=64, nfft=512, frame_splicing_factor=3,
             silence_threshold=-80, dither=.00001, preemph_coeff=.97,
             lowfreq=0.0, highfreq=0.0, num_threads=1):
    """Instantiate the operators of a CPU mel-feature extraction pipeline.

    Args:
        device_id: Forwarded to the base pipeline and used as the
            reader's ``shard_id``.
        n_devices: Used as the reader's ``num_shards``.
        file_root: Forwarded to the file reader.
        file_list: Forwarded to the file reader.
        batch_size: Forwarded to the base pipeline.
        sample_rate: Passed to the mel filter bank and multiplied with
            the window settings for the spectrogram.
        window_size: Multiplied by ``sample_rate`` to give ``window_length``.
        window_stride: Multiplied by ``sample_rate`` to give ``window_step``.
        nfeatures: Passed to the mel filter bank as ``nfilter``.
        nfft: Passed to the spectrogram operator.
        frame_splicing_factor: Stored on the instance and used by the
            splicing reshape and pad operators.
        silence_threshold: Passed to ``NonsilentRegion`` as ``cutoff_db``.
        dither: Stored on the instance as ``self.dither``.
        preemph_coeff: Passed to the pre-emphasis filter.
        lowfreq: Passed to the mel filter bank as ``freq_low``.
        highfreq: Passed to the mel filter bank as ``freq_high``.
        num_threads: Forwarded to the base pipeline.

    The base pipeline is created with seed 42.
    """
    super().__init__(batch_size, num_threads, device_id, seed=42)

    self.dither = dither
    self.frame_splicing_factor = frame_splicing_factor

    # Reading and decoding.
    self.read = ops.readers.File(
        device="cpu",
        file_root=file_root,
        file_list=file_list,
        shard_id=device_id,
        num_shards=n_devices,
    )
    self.decode = ops.AudioDecoder(
        device="cpu", downmix=True, dtype=types.FLOAT)

    # Signal conditioning.
    self.normal_distribution = ops.random.Normal(device="cpu")
    self.preemph = ops.PreemphasisFilter(preemph_coeff=preemph_coeff)

    # Feature extraction; window sizes are the products given above.
    window_length = window_size * sample_rate
    window_step = window_stride * sample_rate
    self.spectrogram = ops.Spectrogram(
        device="cpu",
        nfft=nfft,
        window_length=window_length,
        window_step=window_step,
    )
    self.mel_fbank = ops.MelFilterBank(
        device="cpu",
        sample_rate=sample_rate,
        nfilter=nfeatures,
        normalize=True,
        freq_low=lowfreq,
        freq_high=highfreq,
    )
    self.log_features = ops.ToDecibels(
        device="cpu",
        multiplier=np.log(10),
        reference=1.0,
        cutoff_db=-80,
    )
    self.get_shape = ops.Shapes(device="cpu")
    self.normalize = ops.Normalize(device="cpu", axes=[0])

    # Frame splicing helpers.
    self.splicing_transpose = ops.Transpose(device="cpu", perm=[1, 0])
    self.splicing_reshape = ops.Reshape(
        device="cpu", rel_shape=[-1, frame_splicing_factor])
    self.splicing_pad = ops.Pad(
        device="cpu",
        axes=[0],
        fill_value=0,
        align=frame_splicing_factor,
        shape=[1],
    )

    # Silence trimming helpers.
    self.get_nonsilent_region = ops.NonsilentRegion(
        device="cpu", cutoff_db=silence_threshold)
    self.trim_silence = ops.Slice(device="cpu", axes=[0])
    self.to_float = ops.Cast(dtype=types.FLOAT)
def __init__(self, batch_size, num_threads, device_id, data_dir, dali_cpu=True):
    """Create the sharded file reader and audio decoder for training.

    Args:
        batch_size: Forwarded to the base pipeline.
        num_threads: Forwarded to the base pipeline.
        device_id: Forwarded to the base pipeline; also offsets the seed
            (``12 + device_id``).
        data_dir: Used as the reader's ``file_root``.
        dali_cpu: Only selects the variant name printed at the end
            ('cpu' or 'gpu').  The decoder itself is always constructed
            with ``device="cpu"``.

    Reads the module-level ``args`` for sharding and cache settings.
    """
    super(AudioTrainPipe, self).__init__(
        batch_size, num_threads, device_id, seed=12 + device_id)

    shard = int(args.node_rank * args.world_size / args.nnodes + args.local_rank)

    # Arguments shared by both reader configurations.
    reader_kwargs = dict(
        file_root=data_dir,
        shard_id=shard,
        num_shards=args.world_size,
        shuffle_after_epoch=True,
        cache_size=args.cache_size,
    )
    if args.dist_mint:
        print(
            " DIST MINT : Shard id : {}, num_shards:{}, node_id :{}, num_nodes :{}"
            .format(shard, args.world_size, args.node_rank, args.nnodes))
        # NOTE(review): num_nodes / node_id / node_*_list (and
        # cache_size, downsample_size below) look like arguments of a
        # customised DALI build -- confirm against the DALI in use.
        reader_kwargs.update(
            num_nodes=args.nnodes,
            node_id=args.node_rank,
            node_port_list=args.node_port_list,
            node_ip_list=args.node_ip_list,
        )
    self.input = ops.FileReader(**reader_kwargs)

    self.decode = ops.AudioDecoder(
        device="cpu",
        dtype=types.FLOAT,
        downmix=True,
        sample_rate=8192,
        downsample_size=160000,
    )

    dali_device = 'cpu' if dali_cpu else 'gpu'
    print('DALI "{0}" variant'.format(dali_device))
def __init__(self, batch_size, num_threads, device_id, data_dir, dali_cpu=True):
    """Create the sharded file reader and audio decoder for validation.

    Args:
        batch_size: Forwarded to the base pipeline.
        num_threads: Forwarded to the base pipeline.
        device_id: Forwarded to the base pipeline; also offsets the seed
            (``12 + device_id``).
        data_dir: Used as the reader's ``file_root``.
        dali_cpu: Accepted for interface compatibility; it does not
            affect any operator created here (the decoder is always
            constructed with ``device="cpu"``).

    Reads the module-level ``args`` to compute the shard id and count.
    """
    super(AudioValPipe, self).__init__(
        batch_size, num_threads, device_id, seed=12 + device_id)

    shard = int(args.node_rank * args.world_size / args.nnodes + args.local_rank)
    self.input = ops.FileReader(
        file_root=data_dir,
        shard_id=shard,
        num_shards=args.world_size,
        random_shuffle=False,
    )

    # NOTE(review): downsample_size is not a stock AudioDecoder argument
    # as far as can be told from here -- confirm against the DALI build.
    self.decode = ops.AudioDecoder(
        device="cpu",
        dtype=types.FLOAT,
        downmix=True,
        sample_rate=8192,
        downsample_size=160000,
    )
def __init__(
        self, *,
        train_pipeline: bool,  # True if train pipeline, False if validation pipeline
        device_id, num_threads, batch_size,
        file_root: str, file_list: str,
        sample_rate,
        discrete_resample_range: bool, resample_range: list,
        window_size, window_stride, nfeatures, nfft,
        frame_splicing_factor, dither_coeff, silence_threshold,
        preemph_coeff, pad_align, max_duration,
        mask_time_num_regions, mask_time_min, mask_time_max,
        mask_freq_num_regions, mask_freq_min, mask_freq_max,
        mask_both_num_regions, mask_both_min_time, mask_both_max_time,
        mask_both_min_freq, mask_both_max_freq,
        preprocessing_device="gpu"):
    """Instantiate all operators of the audio feature pipeline.

    All arguments are keyword-only.

    Args:
        train_pipeline: Stored as ``self.train`` and used as the
            reader's ``shuffle_after_epoch`` flag.
        device_id, num_threads, batch_size: Forwarded to the base
            pipeline.
        file_root, file_list: Forwarded to the file reader.
        sample_rate: Stored on the instance; used for the decoder (only
            when ``resample_range`` is ``None``), the spectrogram window
            sizes and the mel filter bank.
        discrete_resample_range: When true and ``resample_range`` is
            given, speed-perturbation coefficients come from an
            ``ExternalSource`` fed by
            ``self._discrete_resample_coeffs_generator``.
        resample_range: Range for the uniform speed-perturbation
            generator, or ``None`` to disable speed perturbation.
        window_size, window_stride: Multiplied by ``sample_rate`` to get
            the spectrogram ``window_length`` / ``window_step``.
        nfeatures: Number of mel filters (``nfilter``).
        nfft: Forwarded to the spectrogram operator.
        frame_splicing_factor: Must be 1.
        dither_coeff, max_duration: Stored on the instance.
        silence_threshold: ``cutoff_db`` for ``NonsilentRegion``;
            ``None`` sets ``self.do_remove_silence`` to ``False``.
        preemph_coeff: Forwarded to the pre-emphasis filter.
        pad_align: ``align`` argument of the pad operator.
        mask_*: Collected unchanged into ``self.mask_params``.
        preprocessing_device: ``"cpu"`` or ``"gpu"`` (case-insensitive).

    Raises:
        AssertionError: If ``preprocessing_device`` is neither "cpu" nor
            "gpu", or if ``frame_splicing_factor`` is not 1.
    """
    super().__init__(batch_size, num_threads, device_id)
    # Must stay directly after the base-class call so that locals()
    # contains only the constructor arguments.
    self._dali_init_log(locals())

    if torch.distributed.is_initialized():
        shard_id = torch.distributed.get_rank()
        n_shards = torch.distributed.get_world_size()
    else:
        shard_id = 0
        n_shards = 1

    self.preprocessing_device = preprocessing_device.lower()
    assert self.preprocessing_device in ("cpu", "gpu"), \
        "Incorrect preprocessing device. Please choose either 'cpu' or 'gpu'"
    # Hand the validated, lower-cased name to every operator below; the
    # raw argument may differ in case (e.g. "GPU") even though it passed
    # the check above.
    device = self.preprocessing_device

    self.frame_splicing_factor = frame_splicing_factor
    assert frame_splicing_factor == 1, "DALI doesn't support frame splicing operation"

    self.resample_range = resample_range
    self.discrete_resample_range = discrete_resample_range
    self.train = train_pipeline
    self.sample_rate = sample_rate
    self.dither_coeff = dither_coeff
    self.nfeatures = nfeatures
    self.max_duration = max_duration
    self.mask_params = {
        'time_num_regions': mask_time_num_regions,
        'time_min': mask_time_min,
        'time_max': mask_time_max,
        'freq_num_regions': mask_freq_num_regions,
        'freq_min': mask_freq_min,
        'freq_max': mask_freq_max,
        'both_num_regions': mask_both_num_regions,
        'both_min_time': mask_both_min_time,
        'both_max_time': mask_both_max_time,
        'both_min_freq': mask_both_min_freq,
        'both_max_freq': mask_both_max_freq,
    }
    self.do_remove_silence = silence_threshold is not None

    self.read = ops.FileReader(device="cpu", file_root=file_root,
                               file_list=file_list, shard_id=shard_id,
                               num_shards=n_shards,
                               shuffle_after_epoch=train_pipeline)

    # TODO change ExternalSource to Uniform for new DALI release
    if discrete_resample_range and resample_range is not None:
        self.speed_perturbation_coeffs = ops.ExternalSource(
            device="cpu", cycle=True,
            source=self._discrete_resample_coeffs_generator)
    elif resample_range is not None:
        self.speed_perturbation_coeffs = random.Uniform(
            device="cpu", range=resample_range)
    else:
        self.speed_perturbation_coeffs = None

    # With speed perturbation enabled the decoder is given
    # sample_rate=None instead of the target rate.
    self.decode = ops.AudioDecoder(
        device="cpu",
        sample_rate=self.sample_rate if resample_range is None else None,
        dtype=types.FLOAT, downmix=True)

    self.normal_distribution = random.Normal(device=device)
    self.preemph = ops.PreemphasisFilter(device=device,
                                         preemph_coeff=preemph_coeff)
    self.spectrogram = ops.Spectrogram(
        device=device, nfft=nfft,
        window_length=window_size * sample_rate,
        window_step=window_stride * sample_rate)
    self.mel_fbank = ops.MelFilterBank(device=device,
                                       sample_rate=sample_rate,
                                       nfilter=self.nfeatures,
                                       normalize=True)
    self.log_features = ops.ToDecibels(device=device,
                                       multiplier=np.log(10),
                                       reference=1.0,
                                       cutoff_db=math.log(1e-20))
    self.get_shape = ops.Shapes(device=device)
    self.normalize = ops.Normalize(device=device, axes=[1])
    self.pad = ops.Pad(device=device, axes=[1], fill_value=0,
                       align=pad_align)

    # Silence trimming
    self.get_nonsilent_region = ops.NonsilentRegion(
        device="cpu", cutoff_db=silence_threshold)
    self.trim_silence = ops.Slice(device="cpu", normalized_anchor=False,
                                  normalized_shape=False, axes=[0])
    self.to_float = ops.Cast(device="cpu", dtype=types.FLOAT)

    # Spectrogram masking
    self.spectrogram_cutouts = ops.ExternalSource(
        source=self._cutouts_generator, num_outputs=2, cycle=True)
    self.mask_spectrogram = ops.Erase(device=device, axes=[0, 1],
                                      fill_value=0, normalized_anchor=True)
def __init__(self, *,
             pipeline_type,
             device_id, num_threads, batch_size,
             file_root: str, sampler,
             sample_rate, resample_range: list,
             window_size, window_stride, nfeatures, nfft,
             dither_coeff, silence_threshold, preemph_coeff,
             max_duration,
             preprocessing_device="gpu"):
    """Instantiate all operators of the sampler-driven audio pipeline.

    All arguments are keyword-only.

    Args:
        pipeline_type: ``'train'`` marks a training pipeline; ``'val'``
            turns on ``pad_last_batch`` for the reader.
        device_id, num_threads, batch_size: Forwarded to the base
            pipeline.
        file_root: Forwarded to the file reader.
        sampler: Object providing ``get_file_list_path()`` (the reader's
            ``file_list``) and ``is_sampler_random()``.
        sample_rate: Stored on the instance; used for the decoder (only
            when ``resample_range`` is ``None``), the spectrogram window
            sizes and the mel filter bank.
        resample_range: Range for the uniform speed-perturbation
            generator, or ``None`` to disable speed perturbation.
        window_size, window_stride: Multiplied by ``sample_rate`` to get
            the spectrogram ``window_length`` / ``window_step``.
        nfeatures: Number of mel filters (``nfilter``).
        nfft: Forwarded to the spectrogram operator.
        dither_coeff, max_duration: Stored on the instance.
        silence_threshold: ``cutoff_db`` for ``NonsilentRegion``;
            ``None`` sets ``self.do_remove_silence`` to ``False``.
        preemph_coeff: Forwarded to the pre-emphasis filter.
        preprocessing_device: ``"cpu"`` or ``"gpu"`` (case-insensitive).

    Raises:
        AssertionError: If ``preprocessing_device`` is neither "cpu" nor
            "gpu".
    """
    super().__init__(batch_size, num_threads, device_id)
    # Must stay directly after the base-class call so that locals()
    # contains only the constructor arguments.
    self._dali_init_log(locals())

    if torch.distributed.is_initialized():
        shard_id = torch.distributed.get_rank()
        n_shards = torch.distributed.get_world_size()
    else:
        shard_id = 0
        n_shards = 1

    self.preprocessing_device = preprocessing_device.lower()
    assert self.preprocessing_device in ("cpu", "gpu"), \
        "Incorrect preprocessing device. Please choose either 'cpu' or 'gpu'"
    # Hand the validated, lower-cased name to every operator below; the
    # raw argument may differ in case (e.g. "GPU") even though it passed
    # the check above.
    device = self.preprocessing_device

    self.resample_range = resample_range
    train_pipeline = pipeline_type == 'train'
    self.train = train_pipeline
    self.sample_rate = sample_rate
    self.dither_coeff = dither_coeff
    self.nfeatures = nfeatures
    self.max_duration = max_duration
    self.do_remove_silence = silence_threshold is not None

    # Reader-side shuffling is requested only for training and only
    # when the sampler reports that it is not random itself.
    shuffle = train_pipeline and not sampler.is_sampler_random()
    self.read = ops.FileReader(name="Reader",
                               pad_last_batch=(pipeline_type == 'val'),
                               device="cpu", file_root=file_root,
                               file_list=sampler.get_file_list_path(),
                               shard_id=shard_id, num_shards=n_shards,
                               shuffle_after_epoch=shuffle)

    if resample_range is not None:
        self.speed_perturbation_coeffs = ops.Uniform(device="cpu",
                                                     range=resample_range)
    else:
        self.speed_perturbation_coeffs = None

    # With speed perturbation enabled the decoder is given
    # sample_rate=None instead of the target rate.
    self.decode = ops.AudioDecoder(
        device="cpu",
        sample_rate=self.sample_rate if resample_range is None else None,
        dtype=types.FLOAT, downmix=True)

    self.normal_distribution = ops.NormalDistribution(device=device)
    self.preemph = ops.PreemphasisFilter(device=device,
                                         preemph_coeff=preemph_coeff)
    self.spectrogram = ops.Spectrogram(
        device=device, nfft=nfft,
        window_length=window_size * sample_rate,
        window_step=window_stride * sample_rate)
    self.mel_fbank = ops.MelFilterBank(device=device,
                                       sample_rate=sample_rate,
                                       nfilter=self.nfeatures,
                                       normalize=True)
    self.log_features = ops.ToDecibels(device=device,
                                       multiplier=np.log(10),
                                       reference=1.0,
                                       cutoff_db=math.log(1e-20))
    self.get_shape = ops.Shapes(device=device)
    self.normalize = ops.Normalize(device=device, axes=[1])
    self.pad = ops.Pad(device=device, fill_value=0)

    # Silence trimming
    self.get_nonsilent_region = ops.NonsilentRegion(
        device="cpu", cutoff_db=silence_threshold)
    self.trim_silence = ops.Slice(device="cpu", normalized_anchor=False,
                                  normalized_shape=False, axes=[0])
    self.to_float = ops.Cast(device="cpu", dtype=types.FLOAT)