def ExternalSourcePipeline(params, num_threads, device_id, external_data, seed):
    pipe = Pipeline(params.batch_size, num_threads, device_id, seed=seed)
    with pipe:
        jpegs, labels = fn.external_source(source=external_data, num_outputs=2)
        images = fn.image_decoder(jpegs, device="mixed", output_type=types.RGB)
        images = fn.resize(images, resize_x=224, resize_y=224)
        images = fn.cast(images, dtype=types.UINT8) / 255
        images = fn.normalize(images, axes=[0, 1], mean=params.mean,
                              stddev=params.std, device='gpu', batch=False)
        output = fn.transpose(images, perm=[2, 0, 1], device='gpu')
        pipe.set_outputs(output, labels)
    return pipe
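# --- Hedged usage sketch (not part of the original) --------------------------
# A minimal example of how a pipeline like ExternalSourcePipeline might be built
# and consumed from PyTorch. The `params` object, the `external_data` iterable
# and `num_test_samples` are placeholders assumed to exist in the caller's
# scope; the names in `output_map` are illustrative only.
from nvidia.dali.plugin.pytorch import DALIGenericIterator

pipe = ExternalSourcePipeline(params, num_threads=4, device_id=0,
                              external_data=external_data, seed=42)
pipe.build()
loader = DALIGenericIterator(pipe, ["images", "labels"],
                             size=num_test_samples)  # placeholder sample count
for batch in loader:
    images = batch[0]["images"]  # NCHW, normalized, on GPU
    labels = batch[0]["labels"]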
def rnnt_train_pipe(files, sample_rate, pad_amount=0, preemph_coeff=.97,
                    window_size=.02, window_stride=.01, window="hann",
                    nfeatures=64, nfft=512, frame_splicing_stack=1,
                    frame_splicing_subsample=1, lowfreq=0.0, highfreq=None,
                    normalize_type='per_feature', speed_perturb=False,
                    silence_trim=False, device='cpu'):
    assert normalize_type == 'per_feature' or normalize_type == 'all_features'
    norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    window_fn_arg = (window_fn(win_len, periodic=False).numpy().tolist()
                     if window_fn else None)

    data, _ = fn.readers.file(files=files, device="cpu", random_shuffle=False,
                              shard_id=0, num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    # Splicing with subsampling doesn't work if audio_len is a GPU data node
    if device == 'gpu' and frame_splicing_subsample == 1:
        audio = audio.gpu()

    # Speed perturbation 0.85x - 1.15x
    if speed_perturb:
        target_sr_factor = fn.random.uniform(device="cpu",
                                             range=(1 / 1.15, 1 / 0.85))
        audio = fn.experimental.audio_resample(audio, scale=target_sr_factor)

    # Silence trimming
    if silence_trim:
        begin, length = fn.nonsilent_region(audio, cutoff_db=-80)
        audio = fn.slice(audio, begin, length, axes=[0])

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0,))

    # If we couldn't move to GPU earlier, do it now
    if device == 'gpu' and frame_splicing_subsample > 1:
        audio = audio.gpu()

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len, pad_amount)
    else:
        audio_len = orig_audio_len
        padded_audio = audio

    # Preemphasis filter
    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')

    # Spectrogram
    spec_len = audio_len // win_hop + 1
    spec = fn.spectrogram(preemph_audio, nfft=nfft, window_fn=window_fn_arg,
                          window_length=win_len, window_step=win_hop,
                          center_windows=True, reflect_padding=True)

    # Mel spectrogram
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, freq_low=lowfreq,
                                  freq_high=highfreq)

    # Log
    log_features = fn.to_decibels(mel_spec + 1e-20, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=-80)

    # Frame splicing
    if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
        log_features_spliced = dali_frame_splicing_graph(
            log_features, nfeatures, spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    # Normalization
    if normalize_type:
        norm_log_features = fn.normalize(log_features_spliced, axes=norm_axes,
                                         device=device, epsilon=4e-5, ddof=1)
    else:
        norm_log_features = log_features_spliced

    return (norm_log_features, log_features_spliced, log_features, mel_spec,
            spec, preemph_audio, padded_audio, audio)
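# --- Hedged usage sketch (not part of the original) --------------------------
# rnnt_train_pipe only defines a DALI graph and returns data nodes. One
# plausible way to run it is to wrap it with nvidia.dali.pipeline_def, as
# sketched below; `file_list` and the audio parameters are placeholders.
from nvidia.dali import pipeline_def

rnnt_pipe = pipeline_def(rnnt_train_pipe)(
    files=file_list, sample_rate=16000,
    speed_perturb=True, silence_trim=True, device='gpu',
    batch_size=16, num_threads=4, device_id=0)
rnnt_pipe.build()
norm_log_features, *debug_outputs = rnnt_pipe.run()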
                              bytes_per_sample_hint=ImageBytes,
                              sigma=fn.uniform(range=(0.1, 2)),
                              window_size=11)
    images = fn.color_twist(images, device='gpu',
                            bytes_per_sample_hint=ImageBytes,
                            brightness=fn.uniform(range=(0.5, 1.5)),
                            contrast=fn.uniform(range=(0.5, 2.5)),
                            saturation=fn.uniform(range=(0.1, 2)))
    images = fn.cast(images, device='gpu', bytes_per_sample_hint=ImageBytes,
                     dtype=DALIDataType.FLOAT)
    images = fn.normalize(
        images, device='gpu', bytes_per_sample_hint=ImageBytes,
        mean=Constant(numpy.array([[[190.6380, 207.2640, 202.5720]]])),
        stddev=Constant(numpy.array([[[85.2720, 68.6970, 81.4215]]])))
    images = fn.transpose(images, device='gpu',
                          bytes_per_sample_hint=ImageBytes, perm=[2, 0, 1])
    TestingPipe.set_outputs(images, labels)
TestingLoader = DALIClassificationIterator(TestingPipe, size=1000 * args.bs)

model_top = torch.load(args.top)
model_top.eval()
model_bottom = torch.load(args.bottom)
model_bottom.eval()
torch.backends.cudnn.benchmark = True
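# --- Hedged usage sketch (not part of the original) --------------------------
# One possible evaluation loop over TestingLoader. DALIClassificationIterator
# yields dicts with "data" and "label" keys; feeding model_bottom's output into
# model_top mirrors the two checkpoints loaded above but is an assumption about
# how the model was split.
correct, total = 0, 0
with torch.no_grad():
    for batch in TestingLoader:
        images = batch[0]["data"]
        labels = batch[0]["label"].squeeze(-1).long().cuda(non_blocking=True)
        logits = model_top(model_bottom(images))
        correct += (logits.argmax(dim=1) == labels).sum().item()
        total += labels.numel()
print(f"top-1 accuracy: {correct / total:.4f}")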
def dali_asr_pipeline(train_pipeline,  # True if training, False if validation
                      file_root, file_list, sample_rate, silence_threshold,
                      resample_range, discrete_resample_range, window_size,
                      window_stride, nfeatures, nfft, frame_splicing_factor,
                      dither_coeff, pad_align, preemph_coeff,
                      do_spectrogram_masking=False, cutouts_generator=None,
                      shard_id=0, n_shards=1, preprocessing_device="gpu"):
    do_remove_silence = silence_threshold is not None

    def _div_ceil(dividend, divisor):
        return (dividend + (divisor - 1)) // divisor

    encoded, label = fn.readers.file(
        device="cpu", name="file_reader", file_root=file_root,
        file_list=file_list, shard_id=shard_id, num_shards=n_shards,
        shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [resample_range[0], 1.0, resample_range[1]]
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          range=resample_range)

    if train_pipeline and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT, downmix=True)

    if do_remove_silence:
        begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if preprocessing_device == "gpu":
        audio = audio.gpu()

    if dither_coeff != 0.:
        audio = audio + fn.random.normal(audio) * dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio, nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, normalize=True)
    log_features = fn.to_decibels(mel_spec, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=math.log(1e-20))

    log_features_len = fn.shapes(log_features)
    if frame_splicing_factor != 1:
        log_features_len = _div_ceil(log_features_len, frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)

    if train_pipeline and do_spectrogram_masking:
        anchors, shapes = fn.external_source(source=cutouts_generator,
                                             num_outputs=2, cycle=True)
        log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
                                axes=[0, 1], fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()
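# --- Hedged usage sketch (not part of the original) --------------------------
# Per the comment above the return statement, output_map must mirror the tuple
# returned by the pipeline. Assuming dali_asr_pipeline has been wrapped into a
# built DALI Pipeline `pipe`, the wiring could look like this; the names in
# output_map are illustrative.
from nvidia.dali.plugin.pytorch import DALIGenericIterator, LastBatchPolicy

loader = DALIGenericIterator(pipe, ["audio", "label", "audio_len"],
                             reader_name="file_reader",
                             last_batch_policy=LastBatchPolicy.PARTIAL)
for batch in loader:
    feats = batch[0]["audio"]          # (N, nfeatures, padded_time)
    labels = batch[0]["label"]
    feat_lens = batch[0]["audio_len"]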
def rnnt_train_pipe(files, sample_rate, pad_amount=0, preemph_coeff=.97,
                    window_size=.02, window_stride=.01, window="hann",
                    nfeatures=64, nfft=512, frame_splicing_stack=1,
                    frame_splicing_subsample=1, lowfreq=0.0, highfreq=None,
                    normalize_type='per_feature', device='cpu'):
    assert normalize_type == 'per_feature' or normalize_type == 'all_features'
    norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    window_fn_arg = (window_fn(win_len, periodic=False).numpy().tolist()
                     if window_fn else None)

    data, _ = fn.readers.file(files=files, device="cpu", random_shuffle=False,
                              shard_id=0, num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0,))

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
    else:
        audio_len = orig_audio_len

    spec_len = audio_len // win_hop + 1

    if device == 'gpu':
        audio = audio.gpu()

    if pad_amount > 0:
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len, pad_amount)
    else:
        padded_audio = audio

    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')

    spec = fn.spectrogram(preemph_audio, nfft=nfft, window_fn=window_fn_arg,
                          window_length=win_len, window_step=win_hop,
                          center_windows=True, reflect_padding=True)
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, freq_low=lowfreq,
                                  freq_high=highfreq)
    log_features = fn.to_decibels(mel_spec + 1e-20, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=-80)

    if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
        log_features_spliced = dali_frame_splicing_graph(
            log_features, nfeatures, spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    if normalize_type:
        norm_log_features = fn.normalize(log_features_spliced, axes=norm_axes,
                                         device=device, epsilon=4e-5, ddof=1)
    else:
        norm_log_features = log_features_spliced

    return (norm_log_features, log_features_spliced, log_features, mel_spec,
            spec, preemph_audio, padded_audio, audio)
def dali_jasper_pipe():
    if is_triton_pipeline:
        assert not self.train, "Pipeline for Triton shall be a validation pipeline"
        if torch.distributed.is_initialized():
            raise RuntimeError(
                "You're creating Triton pipeline, using multi-process mode. "
                "Please use single-process mode.")
        encoded, label = fn.external_source(device="cpu", name="DALI_INPUT_0",
                                            no_copy=True)
    else:
        encoded, label = fn.readers.file(
            device="cpu", name="file_reader", file_root=file_root,
            file_list=file_list, shard_id=shard_id, num_shards=n_shards,
            shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [self.resample_range[0], 1.0, self.resample_range[1]]
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          range=resample_range)

    if self.train and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * self.sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = self.sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT, downmix=True)

    if self.do_remove_silence:
        begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if self.preprocessing_device == "gpu":
        audio = audio.gpu()

    if self.dither_coeff != 0.:
        audio = audio + fn.random.normal(
            device=preprocessing_device) * self.dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio, nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=self.nfeatures, normalize=True)
    log_features = fn.to_decibels(mel_spec, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=math.log(1e-20))

    log_features_len = fn.shapes(log_features)
    if self.frame_splicing_factor != 1:
        log_features_len = self._div_ceil(log_features_len,
                                          self.frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)

    if self.train and self._do_spectrogram_masking():
        anchors, shapes = fn.external_source(source=self._cutouts_generator,
                                             num_outputs=2, cycle=True)
        log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
                                axes=[0, 1], fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()