def run_decode(data_path, out_type):
    """Check ``fn.peek_image_shape`` against the shape of the decoded image.

    Builds a pipeline that reads files from ``data_path``, decodes them to RGB
    and peeks the shape from the encoded stream, then runs it until at least
    one epoch worth of samples has been processed. For every sample, all
    dimensions except the last one must agree between the decoded image, the
    output of ``fn.shapes`` and the peeked shape, and the peeked shape must
    have the NumPy dtype that corresponds to ``out_type``.

    Args:
        data_path: Directory passed to the file reader as ``file_root``.
        out_type: DALI type passed to ``fn.peek_image_shape`` as ``type``.

    Raises:
        AssertionError: If an extent or the peeked-shape dtype does not match.
    """
    batch_size = 4
    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=0)
    encoded, _ = fn.file_reader(file_root=data_path, shard_id=0, num_shards=1,
                                name="reader")
    decoded = fn.image_decoder(encoded, output_type=types.RGB)
    decoded_shape = fn.shapes(decoded)
    raw_shape = fn.peek_image_shape(encoded, type=out_type)
    pipe.set_outputs(decoded, decoded_shape, raw_shape)
    pipe.build()

    # The expected dtype depends only on out_type, so compute it once.
    shape_type = dali_types_to_np(out_type)

    samples = 0
    length = pipe.reader_meta(name="reader")['epoch_size']
    while samples < length:
        samples += batch_size
        images, decoded_shapes, raw_shapes = pipe.run()
        for i in range(batch_size):
            image = images.at(i)
            sample_decoded_shape = decoded_shapes.at(i)
            sample_raw_shape = raw_shapes.at(i)
            # as we are asking for a particular color space it may differ from
            # the source image, so the last dimension is not compared
            for d in range(len(image.shape) - 1):
                assert image.shape[d] == sample_decoded_shape[d], "{} vs {}".format(
                    image.shape[d], sample_decoded_shape[d])
                assert image.shape[d] == sample_raw_shape[d], "{} vs {}".format(
                    image.shape[d], sample_raw_shape[d])
                # The message previously read `.dtyp`, which turned a genuine
                # mismatch into an AttributeError instead of an AssertionError.
                assert sample_raw_shape[d].dtype == shape_type, "{} vs {}".format(
                    sample_raw_shape[d].dtype, shape_type)
def _test_rrc(device, max_frames, layout, aspect_ratio_range, area_range,
              output_size, input_type, output_type):
    """Run ``fn.random_resized_crop`` on generated data and validate the output.

    A pipeline is fed by ``generator(...)`` through an external source, the
    data is optionally moved to the GPU, cropped/resized, and returned together
    with the input shapes. Three iterations are executed; for each one the
    output layout must equal ``layout`` and every sample is handed to
    ``check_output``.

    Args:
        device: ``"gpu"`` moves the input to the GPU before the operator.
        max_frames: Forwarded to ``generator``.
        layout: Layout string of the input; the position of ``'C'`` selects
            the channel dimension.
        aspect_ratio_range: Passed as ``random_aspect_ratio`` and to
            ``check_output``.
        area_range: Passed as ``random_area`` and to ``check_output``.
        output_size: Passed as ``size``.
        input_type: DALI type of the generated input.
        output_type: Passed as ``dtype`` of the operator.
    """
    batch_size = 4
    pipe = dali.pipeline.Pipeline(batch_size, 4, 0)

    channel_pos = layout.find('C')
    value_range = type_range(test_utils.dali_type_to_np(input_type))
    # A channel axis in the last position is addressed as -1.
    channel_dim = -1 if channel_pos == len(layout) - 1 else channel_pos

    source = generator(batch_size, max_frames, channel_dim, input_type)
    data = fn.external_source(source=source, layout=layout)
    data_shape = fn.shapes(data)
    if device == "gpu":
        data = data.gpu()
    cropped = fn.random_resized_crop(
        data,
        random_aspect_ratio=aspect_ratio_range,
        random_area=area_range,
        size=output_size,
        interp_type=dali.types.INTERP_LINEAR,
        seed=12321,
        dtype=output_type,
    )
    pipe.set_outputs(cropped, data_shape)
    pipe.build()

    for _ in range(3):
        outputs, input_shapes = pipe.run()
        if device == "gpu":
            outputs = outputs.as_cpu()
        assert outputs.layout() == layout
        for sample_idx in range(batch_size):
            check_output(
                outputs.at(sample_idx),
                channel_dim,
                input_shapes.at(sample_idx).tolist(),
                aspect_ratio_range,
                area_range,
                value_range,
            )
def crop_fn(self, img, lbl):
    """Crop ``img`` and ``lbl`` on the GPU around a randomly chosen label pixel.

    A pixel is drawn with ``fn.segmentation.random_mask_pixel``; its
    ``foreground`` flag is a coin flip with probability ``self.oversampling``.
    The crop anchor is that pixel (after ``self.slice_fn``) minus half of
    ``self.crop_shape``, clamped to be no smaller than 0 and no larger than
    the label shape minus the crop shape. Both inputs are moved to the GPU and
    sliced with the same anchor and ``out_of_bounds_policy="pad"``.

    Args:
        img: Image data node.
        lbl: Label data node used to pick the crop centre.

    Returns:
        Tuple ``(img, lbl)`` of the cropped GPU data nodes.
    """
    pick_foreground = fn.coin_flip(probability=self.oversampling)
    center = fn.segmentation.random_mask_pixel(lbl, foreground=pick_foreground)

    anchor = self.slice_fn(center, 1, self.dim) - self.crop_shape // 2
    anchor = math.max(0, anchor)
    upper_bound = self.slice_fn(fn.shapes(lbl), 1, self.dim) - self.crop_shape
    anchor = math.min(anchor, upper_bound)

    slice_kwargs = {
        "axis_names": self.axis_name,
        "out_of_bounds_policy": "pad",
    }
    img = fn.slice(img.gpu(), anchor, self.crop_shape, **slice_kwargs)
    lbl = fn.slice(lbl.gpu(), anchor, self.crop_shape, **slice_kwargs)
    return img, lbl
def crop_fn(self, img, lbl):
    """Crop ``img`` and ``lbl`` around a randomly chosen label pixel.

    A pixel is drawn with ``fn.segmentation.random_mask_pixel``; its
    ``foreground`` flag is a coin flip with probability ``self.oversampling``.
    Both random operators receive ``self.aug_seed_kwargs``. The crop anchor is
    that pixel (after ``self.slice_fn``) minus half of ``self.crop_shape``,
    clamped to be no smaller than 0 and no larger than the label shape minus
    the crop shape. Both inputs are sliced along ``"DHW"`` with the same
    anchor and ``out_of_bounds_policy="pad"``.

    Args:
        img: Image data node.
        lbl: Label data node used to pick the crop centre.

    Returns:
        Tuple ``(img, lbl)`` of the cropped data nodes.
    """
    pick_foreground = fn.coin_flip(probability=self.oversampling,
                                   **self.aug_seed_kwargs)
    center = fn.segmentation.random_mask_pixel(lbl,
                                               foreground=pick_foreground,
                                               **self.aug_seed_kwargs)

    anchor = self.slice_fn(center) - self.crop_shape // 2
    anchor = math.max(0, anchor)
    upper_bound = self.slice_fn(fn.shapes(lbl)) - self.crop_shape
    anchor = math.min(anchor, upper_bound)

    slice_kwargs = {"axis_names": "DHW", "out_of_bounds_policy": "pad"}
    img = fn.slice(img, anchor, self.crop_shape, **slice_kwargs)
    lbl = fn.slice(lbl, anchor, self.crop_shape, **slice_kwargs)
    return img, lbl
def audio_decoder_pipe(device):
    """Define a graph comparing decoder resampling with ``audio_resample``.

    The files in ``names`` are decoded twice: once without resampling and once
    with ``sample_rate=15000``. The non-resampled audio (moved to the GPU when
    ``device == 'gpu'``) is then resampled in three ways: by input/output
    rate, by scale factor, and by the output length taken from the
    decoder-resampled audio.

    Args:
        device: ``'gpu'`` runs ``audio_resample`` on GPU input.

    Returns:
        Tuple of the decoder-resampled audio followed by the three
        ``audio_resample`` outputs (rates, scale, out_length).
    """
    resample = fn.experimental.audio_resample
    target_rate = 15000

    encoded, _ = fn.readers.file(files=names)
    source_audio, source_rate = fn.decoders.audio(encoded, dtype=types.FLOAT)
    reference, _ = fn.decoders.audio(encoded, dtype=types.FLOAT,
                                     sample_rate=target_rate)

    if device == 'gpu':
        source_audio = source_audio.gpu()

    by_rates = resample(source_audio, in_rate=source_rate, out_rate=target_rate)
    by_scale = resample(source_audio, scale=target_rate / source_rate)
    by_length = resample(source_audio, out_length=fn.shapes(reference)[0])
    return reference, by_rates, by_scale, by_length
def create_dali_pipe(channel_first, seq_len, interp, dtype, w, h, batch_size=2):
    """Create a pipeline that resizes sequences on both CPU and GPU.

    Sequences come from ``GetSequences`` through an external source with
    layout ``"FCHW"`` or ``"FHWC"``. The same resize (with ``save_attrs=True``)
    is applied on CPU and on GPU, and a two-element slice of the input shape
    starting at the height axis is produced for reference.

    Args:
        channel_first: Selects the ``"FCHW"`` layout instead of ``"FHWC"``.
        seq_len: Forwarded to ``GetSequences``.
        interp: Interpolation type passed to ``fn.resize``.
        dtype: Output type passed to ``fn.resize``.
        w: Target width (``resize_x``).
        h: Target height (``resize_y``).
        batch_size: Pipeline batch size.

    Returns:
        The pipeline with outputs (CPU result, GPU result, input HW size,
        CPU saved attrs, GPU saved attrs). It is not built here.
    """
    if channel_first:
        layout, hw_start = "FCHW", 2
    else:
        layout, hw_start = "FHWC", 1

    resize_args = dict(resize_x=w, resize_y=h, interp_type=interp, dtype=dtype,
                       save_attrs=True)

    pipe = dali.pipeline.Pipeline(batch_size, 1, 0, 0)
    with pipe:
        ext = fn.external_source(GetSequences(channel_first, seq_len, batch_size),
                                 layout=layout)
        dali_resized_cpu, size_cpu = fn.resize(ext, **resize_args)
        dali_resized_gpu, size_gpu = fn.resize(ext.gpu(), minibatch_size=4,
                                               **resize_args)
        # extract just HW part from the input shape
        input_shape = fn.cast(fn.shapes(ext), dtype=types.INT32)
        ext_size = fn.slice(input_shape, hw_start, 2, axes=[0])
        pipe.set_outputs(dali_resized_cpu, dali_resized_gpu, ext_size,
                         size_cpu, size_gpu)
    return pipe
def create_dali_pipe(channel_first, seq_len, interp, dtype, w, h, batch_size=2):
    """Create a pipeline that resizes sequences on both CPU and GPU.

    Sequences come from ``GetSequences`` through an external source with
    layout ``"FCHW"`` or ``"FHWC"``. The same resize (with ``save_attrs=True``)
    is applied on CPU and on GPU. The height/width part of the input shape is
    extracted with ``fn.slice`` using absolute (non-normalized) anchor and
    shape supplied as CPU constants.

    Args:
        channel_first: Selects the ``"FCHW"`` layout instead of ``"FHWC"``.
        seq_len: Forwarded to ``GetSequences``.
        interp: Interpolation type passed to ``fn.resize``.
        dtype: Output type passed to ``fn.resize``.
        w: Target width (``resize_x``).
        h: Target height (``resize_y``).
        batch_size: Pipeline batch size.

    Returns:
        The pipeline with outputs (CPU result, GPU result, input HW size,
        CPU saved attrs, GPU saved attrs). It is not built here.
    """
    if channel_first:
        layout, hw_start = "FCHW", 2
    else:
        layout, hw_start = "FHWC", 1

    resize_args = dict(resize_x=w, resize_y=h, interp_type=interp, dtype=dtype,
                       save_attrs=True)

    pipe = dali.pipeline.Pipeline(batch_size, 1, 0, 0)
    with pipe:
        ext = fn.external_source(GetSequences(channel_first, seq_len, batch_size),
                                 layout=layout)
        dali_resized_cpu, size_cpu = fn.resize(ext, **resize_args)
        dali_resized_gpu, size_gpu = fn.resize(ext.gpu(), minibatch_size=4,
                                               **resize_args)

        # extract just HW part from the input shape
        shape_anchor = np.array([hw_start], dtype=np.float32)
        shape_shape = np.array([2], dtype=np.float32)
        input_shape = fn.cast(fn.shapes(ext), dtype=types.INT32)
        anchor_node = types.Constant(shape_anchor, device="cpu")
        extent_node = types.Constant(shape_shape, device="cpu")
        ext_size = fn.slice(input_shape, anchor_node, extent_node,
                            normalized_anchor=False, normalized_shape=False,
                            axes=[0])

        pipe.set_outputs(dali_resized_cpu, dali_resized_gpu, ext_size,
                         size_cpu, size_gpu)
    return pipe
def check_pad_to_square(device='cpu', batch_size=3, ndim=2, num_iter=3):
    """Pad randomly shaped ``"HW"`` data to a square and verify the result.

    The pipeline generates random data whose shape is drawn from the range
    (10, 20), computes the larger of its two extents inside the graph and pads
    both axes to that size with ``fn.pad``. For every sample the test checks
    that all output extents equal the larger input extent, that the input is
    preserved in the top-left corner and that the remaining area is zero.

    Args:
        device: ``'gpu'`` runs the pad operator on GPU input.
        batch_size: Pipeline batch size.
        ndim: Number of elements of the generated random shape.
            NOTE(review): the layout is fixed to "HW", so presumably only
            ``ndim=2`` is meaningful - confirm.
        num_iter: Number of pipeline iterations to check.
    """
    pipe = Pipeline(batch_size=batch_size, num_threads=3, device_id=0, seed=1234)
    with pipe:
        random_shape = fn.cast(
            fn.random.uniform(range=(10, 20), shape=(ndim, )),
            dtype=types.INT32)
        data = fn.reshape(
            fn.random.uniform(range=(0., 1.), shape=random_shape), layout="HW")
        data_shape = fn.shapes(data, dtype=types.INT32)
        height = fn.slice(data_shape, 0, 1, axes=[0])
        width = fn.slice(data_shape, 1, 1, axes=[0])
        side = math.max(height, width)
        if device == 'gpu':
            data = data.gpu()
        padded = fn.pad(data, axis_names="HW", shape=fn.cat(side, side, axis=0))
        pipe.set_outputs(data, padded)
    pipe.build()

    for _ in range(num_iter):
        outs = []
        for out in pipe.run():
            outs.append(out.as_cpu() if isinstance(out, TensorListGPU) else out)

        for i in range(batch_size):
            in_data, out_data = (tensor_list.at(i) for tensor_list in outs)
            rows, cols = in_data.shape[0], in_data.shape[1]
            max_side = max(in_data.shape)
            assert all(extent == max_side for extent in out_data.shape)
            # Original content sits in the top-left corner, the rest is padding.
            np.testing.assert_equal(out_data[:rows, :cols], in_data)
            np.testing.assert_equal(out_data[rows:, :], 0)
            np.testing.assert_equal(out_data[:, cols:], 0)
def rnnt_train_pipe(files, sample_rate, pad_amount=0, preemph_coeff=.97,
                    window_size=.02, window_stride=.01, window="hann",
                    nfeatures=64, nfft=512, frame_splicing_stack=1,
                    frame_splicing_subsample=1, lowfreq=0.0, highfreq=None,
                    normalize_type='per_feature', speed_perturb=False,
                    silence_trim=False, device='cpu'):
    """Define the audio feature extraction graph and expose every stage.

    Stages, in order: file read, audio decode (downmixed float), optional
    speed perturbation, optional silence trimming, optional reflect padding,
    preemphasis, spectrogram, mel filter bank, log (decibels), optional frame
    splicing and normalization.

    Args:
        files: List of files passed to the reader.
        sample_rate: Sample rate used for the window sizes and mel filter bank.
        pad_amount: When positive, reflect-pad the audio by this amount.
        preemph_coeff: Preemphasis coefficient.
        window_size: Forwarded to ``win_args``.
        window_stride: Forwarded to ``win_args``.
        window: Key looked up in ``torch_windows``; an unknown key results in
            ``window_fn=None`` being passed to the spectrogram.
        nfeatures: Number of mel filters.
        nfft: FFT size.
        frame_splicing_stack: Stacking factor for frame splicing.
        frame_splicing_subsample: Subsampling factor for frame splicing.
        lowfreq: Lower frequency bound of the mel filter bank.
        highfreq: Upper frequency bound of the mel filter bank.
        normalize_type: ``'per_feature'`` (axes ``[1]``) or ``'all_features'``
            (axes ``[0, 1]``).
        speed_perturb: Resample by a random scale from (1/1.15, 1/0.85).
        silence_trim: Trim to the non-silent region (cutoff -80 dB).
        device: ``'gpu'`` moves the audio to the GPU.

    Returns:
        Tuple ``(norm_log_features, log_features_spliced, log_features,
        mel_spec, spec, preemph_audio, padded_audio, audio)``.
    """
    assert normalize_type in ('per_feature', 'all_features')
    if normalize_type == 'per_feature':
        norm_axes = [1]
    else:
        norm_axes = [0, 1]

    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    if window_fn:
        window_fn_arg = window_fn(win_len, periodic=False).numpy().tolist()
    else:
        window_fn_arg = None

    on_gpu = device == 'gpu'

    data, _ = fn.readers.file(files=files, device="cpu", random_shuffle=False,
                              shard_id=0, num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    # splicing with subsampling doesn't work if audio_len is a GPU data node
    if on_gpu and frame_splicing_subsample == 1:
        audio = audio.gpu()

    # Speed perturbation 0.85x - 1.15x
    if speed_perturb:
        target_sr_factor = fn.random.uniform(device="cpu",
                                             range=(1 / 1.15, 1 / 0.85))
        audio = fn.experimental.audio_resample(audio, scale=target_sr_factor)

    # Silence trimming
    if silence_trim:
        begin, length = fn.nonsilent_region(audio, cutoff_db=-80)
        audio = fn.slice(audio, begin, length, axes=[0])

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0, ))

    # If we couldn't move to GPU earlier, do it now
    if on_gpu and frame_splicing_subsample > 1:
        audio = audio.gpu()

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len, pad_amount)
    else:
        audio_len = orig_audio_len
        padded_audio = audio

    # Preemphasis filter
    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')

    # Spectrogram
    spec_len = audio_len // win_hop + 1
    spec = fn.spectrogram(preemph_audio, nfft=nfft, window_fn=window_fn_arg,
                          window_length=win_len, window_step=win_hop,
                          center_windows=True, reflect_padding=True)

    # Mel spectrogram
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, freq_low=lowfreq,
                                  freq_high=highfreq)

    # Log
    log_features = fn.to_decibels(mel_spec + 1e-20, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=-80)

    # Frame splicing
    needs_splicing = frame_splicing_stack > 1 or frame_splicing_subsample > 1
    if needs_splicing:
        log_features_spliced = dali_frame_splicing_graph(
            log_features, nfeatures, spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    # Normalization
    if not normalize_type:
        norm_log_features = log_features_spliced
    else:
        norm_log_features = fn.normalize(log_features_spliced, axes=norm_axes,
                                         device=device, epsilon=4e-5, ddof=1)

    return (norm_log_features, log_features_spliced, log_features, mel_spec,
            spec, preemph_audio, padded_audio, audio)
def check_normal_distribution(device, dtype, shape=None,
                              use_shape_like_input=False, variable_shape=False,
                              mean=0.0, stddev=1.0, variable_dist_params=False,
                              shape_gen_f=None, niter=3, batch_size=3,
                              device_id=0, num_threads=3):
    """Run ``fn.random.normal`` and sanity-check shape and distribution.

    The output shape is supplied either through the ``shape`` argument or
    through a shape-like input, and is either fixed or generated per sample.
    Mean and standard deviation are either the given constants or per-sample
    random values. For every non-scalar sample the shape is compared with the
    expected one; for float outputs with at least 100 elements the fractions
    of values within 1, 2 and 3 standard deviations and an Anderson-Darling
    result are checked against loose thresholds.

    Args:
        device: Device of ``fn.random.normal`` (and of the shape-like input).
        dtype: DALI output type.
        shape: Fixed shape, or the argument of ``random_shape`` when
            ``variable_shape`` is set and no ``shape_gen_f`` is given.
        use_shape_like_input: Provide the shape through a shape-like input.
        variable_shape: Generate a new shape for every sample.
        mean: Constant mean (ignored with ``variable_dist_params``).
        stddev: Constant standard deviation (ignored with
            ``variable_dist_params``).
        variable_dist_params: Draw mean from [-100, 100) and stddev from
            [1, 100) per sample.
        shape_gen_f: Optional shape generator; mutually exclusive with
            ``shape``.
        niter: Number of pipeline iterations.
        batch_size: Pipeline batch size.
        device_id: Pipeline device id.
        num_threads: Pipeline thread count.
    """
    pipe = Pipeline(batch_size=batch_size, device_id=device_id,
                    num_threads=num_threads, seed=123456)
    with pipe:
        shape_like_in = None
        shape_arg = None
        assert shape is None or shape_gen_f is None

        if not variable_shape:
            if use_shape_like_input:
                shape_like_in = np.zeros(shape)
            else:
                shape_arg = shape
            # Can't make an empty list constant
            has_dims = shape is not None and shape != ()
            shape_out = types.Constant(shape if has_dims else (1, ),
                                       dtype=types.INT32)
        else:
            if shape_gen_f is None:
                gen_shape = lambda: random_shape(shape)  # noqa: E731
            else:
                gen_shape = shape_gen_f
            if use_shape_like_input:
                shape_like_in = fn.external_source(
                    lambda: np.zeros(gen_shape()), device=device, batch=False)
                shape_out = fn.shapes(shape_like_in)
            else:
                shape_arg = fn.external_source(gen_shape, batch=False)
                shape_out = shape_arg

        if variable_dist_params:
            def random_mean():
                return np.array(np.random.uniform(low=-100.0, high=100.0),
                                dtype=np.float32)

            def random_stddev():
                return np.array(np.random.uniform(low=1.0, high=100.0),
                                dtype=np.float32)

            mean_arg = fn.external_source(random_mean, device='cpu', batch=False)
            stddev_arg = fn.external_source(random_stddev, device='cpu',
                                            batch=False)
        else:
            mean_arg = mean
            stddev_arg = stddev

        inputs = [] if shape_like_in is None else [shape_like_in]
        out = fn.random.normal(*inputs, device=device, shape=shape_arg,
                               mean=mean_arg, stddev=stddev_arg, dtype=dtype)
        pipe.set_outputs(out, shape_out, mean_arg, stddev_arg)
    pipe.build()

    is_float_output = dtype in [types.FLOAT, types.FLOAT64]

    for _ in range(niter):
        out, shapes, means, stddevs = tuple(
            o.as_cpu() if isinstance(o, TensorListGPU) else o
            for o in pipe.run())

        for sample_idx in range(batch_size):
            sample = np.array(out[sample_idx])
            if sample.shape == ():
                continue
            sample_shape = np.array(shapes[sample_idx])
            sample_mean = np.array(means[sample_idx])
            sample_stddev = np.array(stddevs[sample_idx])
            assert (sample.shape == sample_shape
                    ).all(), f"{sample.shape} != {sample_shape}"

            data = sample.flatten()
            data_len = len(data)

            # Checking sanity of the data
            if data_len < 100 or not is_float_output:
                continue

            def fraction_within(num_stddevs):
                """Fraction of ``data`` strictly within the given band."""
                low = sample_mean - num_stddevs * sample_stddev
                high = sample_mean + num_stddevs * sample_stddev
                return len(np.where((data > low) & (data < high))[0]) / data_len

            # Empirical rule:
            # ~68% of the observations within one standard deviation
            # ~95% of the observations within two standard deviations
            # ~99.7% of the observations within three standard deviations
            p1 = fraction_within(1)
            p2 = fraction_within(2)
            p3 = fraction_within(3)
            assert p3 > 0.9, f"{p3}"  # leave some room
            assert p2 > 0.8, f"{p2}"  # leave some room
            assert p1 > 0.5, f"{p1}"  # leave some room

            # It's not 100% mathematically correct, but makes do in case of
            # this test
            _, pvalues_anderson, _ = st.anderson(data, dist='norm')
            assert pvalues_anderson[2] > 0.5
def build_pipes(device, dim, batch_size, channel_first, mode, interp, dtype, w_input, h_input, d_input, use_size_arg, use_size_input, use_roi):
    """Build a DALI resize pipeline and a reference PIL-based pipeline.

    The DALI pipeline produces input images (2D from a Caffe reader, otherwise
    from ``random_3d_loader``), resizes them with ``resize_dali`` and outputs
    the images, the resized result and every resize parameter that was used
    (RoI, then d/h/w/size), so the reference pipeline can be fed with them.
    The PIL pipeline takes images, size and RoI through named external
    sources and resizes them with ``resize_PIL``.

    Args:
        device: ``"cpu"`` keeps the images on the CPU; anything else moves
            them to the GPU before resizing.
        dim: Number of spatial dimensions (2 selects the file reader path).
        batch_size: Batch size of both pipelines.
        channel_first: Transpose the images to a channel-first layout.
        mode: Forwarded to ``resize_dali``.
        interp: Interpolation type, forwarded to both resize helpers.
        dtype: Output type, forwarded to both resize helpers.
        w_input: Use a random per-sample width instead of a fixed one.
        h_input: Use a random per-sample height instead of a fixed one.
        d_input: Use a random per-sample depth instead of a fixed one
            (only consulted when ``dim >= 3``).
        use_size_arg: Specify the output size with a single ``size`` value
            instead of separate w/h/d.
        use_size_input: With ``use_size_arg``, make ``size`` a random
            per-sample data node instead of a fixed list.
        use_roi: Generate a random region of interest.

    Returns:
        Tuple ``(dali_pipe, pil_pipe)``; both pipelines are already built.
    """
    dali_pipe = Pipeline(batch_size=batch_size, num_threads=8, device_id=0, seed=1234)
    with dali_pipe:
        if dim == 2:
            files, labels = dali.fn.readers.caffe(path=db_2d_folder, random_shuffle=True)
            images_cpu = dali.fn.decoders.image(files, device="cpu")
        else:
            images_cpu = dali.fn.external_source(
                source=random_3d_loader(batch_size), layout="DHWC")

        images_hwc = images_cpu if device == "cpu" else images_cpu.gpu()

        if channel_first:
            # Move the last (channel) axis to the front.
            images = dali.fn.transpose(
                images_hwc, perm=[3, 0, 1, 2] if dim == 3 else [2, 0, 1], transpose_layout=True)
        else:
            images = images_hwc

        # Resize parameters; the ones that stay None are not used and are not
        # added to the pipeline outputs below.
        roi_start = None
        roi_end = None
        w = None
        h = None
        d = None
        size = None

        minibatch_size = 2 if dim == 3 else 8

        if use_roi:
            # Calculate absolute RoI
            # The first `dim` entries of the CPU image shape are taken as the
            # input size; the random relative RoI corners are scaled by it.
            in_size = fn.slice(fn.shapes(images_cpu),
                               types.Constant(0, dtype=types.FLOAT, device="cpu"),
                               types.Constant(dim, dtype=types.FLOAT, device="cpu"),
                               axes=[0],
                               normalized_shape=False)
            roi_start = fn.random.uniform(range=(0, 0.4), shape=[dim]) * in_size
            roi_end = fn.random.uniform(range=(0.6, 1.0), shape=[dim]) * in_size

        size_range = (10, 200) if dim == 3 else (10, 1000)

        if use_size_arg:
            if use_size_input:
                # NOTE(review): the integer cast of a value from (0.8, 1.9)
                # presumably yields a 0/1 mask that zeroes some extents of
                # `size` - confirm the rounding mode of fn.cast.
                mask = fn.cast(fn.random.uniform(range=(0.8, 1.9), shape=[dim]), dtype=types.INT32)
                size = fn.random.uniform(range=size_range, shape=[dim]) * mask
            else:
                size = [300, 400] if dim == 2 else [80, 100, 120]

            resized = resize_dali(images, channel_first, dtype, interp, mode, size, None, None, None, roi_start, roi_end, minibatch_size=minibatch_size, max_size=max_size(dim))
        else:
            # Separate extents: each one is either a fixed value or a random
            # value multiplied by a coin flip (so it is 0 in some samples).
            if w_input:
                has_w = fn.random.coin_flip(probability=0.8)
                w = fn.random.uniform(range=size_range) * has_w
            else:
                w = 320  # some fixed value
            if h_input:
                has_h = fn.random.coin_flip(probability=0.8)
                h = fn.random.uniform(range=size_range) * has_h
            else:
                h = 240  # some other fixed value
            if dim >= 3:
                if d_input:
                    has_d = fn.random.coin_flip(probability=0.8)
                    d = fn.random.uniform(range=size_range) * has_d
                else:
                    d = 31  # some other fixed value
            resized = resize_dali(images, channel_first, dtype, interp, mode, None, w, h, d, roi_start, roi_end, minibatch_size=minibatch_size, max_size=max_size(dim))

        outputs = [images, resized]
        if roi_start is not None and roi_end is not None:
            outputs += [roi_start, roi_end]

        # Export the size parameters that were used; plain Python values are
        # wrapped in float32 constants so they can be pipeline outputs.
        for x in (d, h, w, size):
            if x is not None:
                if isinstance(x, _DataNode):
                    outputs.append(x)
                else:
                    outputs.append(
                        types.Constant(np.array(x, dtype=np.float32)))

        dali_pipe.set_outputs(*outputs)

    # Reference pipeline: executed synchronously and fed through named
    # external sources.
    pil_pipe = Pipeline(batch_size=batch_size, num_threads=8, device_id=0, exec_async=False, exec_pipelined=False)
    with pil_pipe:
        images = fn.external_source(name="images", layout=layout_str(dim, channel_first))
        sizes = fn.external_source(name="size")
        roi_start = fn.external_source(name="roi_start")
        roi_end = fn.external_source(name="roi_end")
        resized = resize_PIL(dim, channel_first, dtype, interp, images, sizes, roi_start, roi_end)
        # Re-attach the layout to the output of resize_PIL.
        resized = fn.reshape(resized, layout=layout_str(dim, channel_first))
        pil_pipe.set_outputs(resized)

    dali_pipe.build()
    pil_pipe.build()
    return dali_pipe, pil_pipe
def define_graph(self):
    """Define the COCO loading and augmentation graph.

    Reads images, boxes, labels and polygon masks, picks a random square crop
    whose side is 30-100% of the shorter image side, crops boxes and selects
    the masks of the remaining boxes, decodes only the cropped region,
    converts RGB to BGR, transforms the mask vertices into the crop's
    coordinates, concatenates boxes with the reshaped vertices and resizes
    the image to ``self.input_dim`` with a randomly chosen interpolation.

    Returns:
        Tuple ``(images, targets, labels)``; ``targets`` and ``labels`` are
        moved to the GPU, ``images`` come from the ``'mixed'`` decoder.
    """
    inputs, bboxes, labels, polygons, vertices = fn.readers.coco(
        file_root=self.file_root,
        annotations_file=self.annotation_file,
        skip_empty=True,
        # NOTE(review): the attribute is spelled `share_id`; confirm that it
        # holds the intended shard index.
        shard_id=self.share_id,
        num_shards=self.num_gpus,
        ratio=True,
        ltrb=True,
        polygon_masks=True,
        random_shuffle=self.random_shuffle,
        shuffle_after_epoch=self.shuffle_after_epoch,
        name="Reader")

    # First two entries of the peeked image shape, used as (H, W) below.
    input_shape = fn.slice(fn.cast(fn.peek_image_shape(inputs), dtype=types.INT32), 0, 2, axes=[0])
    h = fn.slice(input_shape, 0, 1, axes=[0], dtype=types.FLOAT)
    w = fn.slice(input_shape, 1, 1, axes=[0], dtype=types.FLOAT)
    short_side = math.min(w, h)
    # Square crop with a side of 30% to 100% of the shorter image side.
    scale = fn.random.uniform(range=[0.3, 1.])
    crop_side = fn.cast(math.ceil(scale * short_side), dtype=types.INT32)
    crop_shape = fn.cat(crop_side, crop_side)
    anchor_rel, shape_rel, bboxes, labels, bbox_indices = fn.random_bbox_crop(
        bboxes,
        labels,
        input_shape=input_shape,
        crop_shape=crop_shape,
        shape_layout="HW",
        thresholds=[
            0.
        ],  # No minimum intersection-over-union, for demo purposes
        allow_no_crop=False,  # No-crop is disallowed, for demo purposes
        # NOTE(review): -1 is presumably DALI's "seed not provided" default,
        # i.e. NOT a fixed seed - confirm against the operator documentation.
        seed=-1,
        bbox_layout="xyXY",  # left, top, right, bottom
        output_bbox_indices=
        True,  # Output indices of the filtered bounding boxes
        total_num_attempts=1024,
    )
    # Keep only the masks that belong to the boxes that survived the crop.
    polygons, vertices = fn.segmentation.select_masks(
        bbox_indices, polygons, vertices)
    # Decode just the cropped region; anchor and shape are absolute values.
    images = fn.decoders.image_slice(inputs,
                                     anchor_rel,
                                     shape_rel,
                                     normalized_anchor=False,
                                     normalized_shape=False,
                                     device='mixed')
    images = fn.color_space_conversion(images,
                                       image_type=types.RGB,
                                       output_type=types.BGR)
    # Two chained coordinate transforms: the first maps to a (w, h) extent,
    # the second maps the crop window onto the (0, 0)-(1, 1) range.
    MT_1_vertices = fn.transforms.crop(to_start=(0.0, 0.0),
                                       to_end=fn.cat(w, h))
    MT_2_vertices = fn.transforms.crop(from_start=anchor_rel,
                                       from_end=(anchor_rel + shape_rel),
                                       to_start=(0.0, 0.0),
                                       to_end=(1., 1.))
    vertices = fn.coord_transform(fn.coord_transform(vertices,
                                                     MT=MT_1_vertices),
                                  MT=MT_2_vertices)
    # Reshape the vertices to (number of boxes, -1) so they can be
    # concatenated to the boxes along axis 1.
    box_like_shape = fn.cat(
        fn.slice(fn.shapes(bboxes, dtype=types.INT32), 0, 1, axes=[0]), -1)
    targets = fn.cat(bboxes,
                     fn.reshape(vertices, shape=box_like_shape),
                     axis=1)
    # Pick one interpolation method at random; the integer value is
    # reinterpreted as an interpolation type for fn.resize.
    interp_methods = [
        types.INTERP_LINEAR, types.INTERP_CUBIC, types.INTERP_LANCZOS3,
        types.INTERP_GAUSSIAN, types.INTERP_NN, types.INTERP_TRIANGULAR
    ]
    interp_method = fn.random.uniform(
        values=[int(x) for x in interp_methods], dtype=types.INT32)
    interp_method = fn.reinterpret(interp_method, dtype=types.INTERP_TYPE)
    images = fn.resize(images,
                       dtype=types.FLOAT,
                       size=self.input_dim,
                       interp_type=interp_method)
    labels = labels.gpu()
    targets = targets.gpu()
    return (images, targets, labels)
def dali_asr_pipeline(train_pipeline,  # True if training, False if validation
                      file_root,
                      file_list,
                      sample_rate,
                      silence_threshold,
                      resample_range,
                      discrete_resample_range,
                      window_size,
                      window_stride,
                      nfeatures,
                      nfft,
                      frame_splicing_factor,
                      dither_coeff,
                      pad_align,
                      preemph_coeff,
                      do_spectrogram_masking=False,
                      cutouts_generator=None,
                      shard_id=0,
                      n_shards=1,
                      preprocessing_device="gpu"):
    """Define the speech feature extraction graph.

    Stages, in order: file read, audio decode (optionally with a perturbed
    sample rate), optional silence removal, optional move to GPU, optional
    dither, preemphasis, spectrogram, mel filter bank, log (decibels),
    normalization, padding and optional spectrogram masking.

    Args:
        train_pipeline: True if training, False if validation.
        file_root: Root directory for the file reader.
        file_list: File list for the file reader.
        sample_rate: Target sample rate.
        silence_threshold: Cutoff (dB) for silence removal; ``None`` disables
            it.
        resample_range: Range of speed perturbation coefficients or ``None``.
        discrete_resample_range: Draw the coefficient from
            ``{range[0], 1.0, range[1]}`` instead of the continuous range.
        window_size: Window length in seconds (multiplied by ``sample_rate``).
        window_stride: Window step in seconds (multiplied by ``sample_rate``).
        nfeatures: Number of mel filters.
        nfft: FFT size.
        frame_splicing_factor: When not 1, the reported feature length is
            divided by it, rounding up.
        dither_coeff: Scale of the added normal noise; 0 disables dithering.
        pad_align: Alignment passed to ``fn.pad``.
        preemph_coeff: Preemphasis coefficient.
        do_spectrogram_masking: Erase regions given by ``cutouts_generator``
            (training only).
        cutouts_generator: Source of ``(anchors, shapes)`` for masking.
        shard_id: Shard index for the reader.
        n_shards: Number of shards for the reader.
        preprocessing_device: ``"gpu"`` moves the audio to the GPU.

    Returns:
        Tuple ``(log_features, label, log_features_len)``, all on the GPU.
    """
    encoded, label = fn.readers.file(device="cpu",
                                     name="file_reader",
                                     file_root=file_root,
                                     file_list=file_list,
                                     shard_id=shard_id,
                                     num_shards=n_shards,
                                     shuffle_after_epoch=train_pipeline)

    # Speed perturbation coefficient (only exists when a range is given).
    if resample_range is None:
        speed_perturbation_coeffs = None
    elif discrete_resample_range:
        choices = [resample_range[0], 1.0, resample_range[1]]
        speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                      values=choices)
    else:
        speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                      range=resample_range)

    # Decoder sample rate: fixed without a range, perturbed when training
    # with a range, and left unspecified otherwise.
    if resample_range is None:
        dec_sample_rate_arg = sample_rate
    elif train_pipeline:
        dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded,
                                 sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT,
                                 downmix=True)

    if silence_threshold is not None:
        begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if preprocessing_device == "gpu":
        audio = audio.gpu()

    if dither_coeff != 0.:
        audio = audio + fn.random.normal(audio) * dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio,
                          nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)
    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=nfeatures,
                                  normalize=True)
    log_features = fn.to_decibels(mel_spec,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=math.log(1e-20))

    log_features_len = fn.shapes(log_features)
    if frame_splicing_factor != 1:
        # Ceiling division of the length by the splicing factor.
        log_features_len = (
            log_features_len + (frame_splicing_factor - 1)
        ) // frame_splicing_factor

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0,
                          align=pad_align)

    if train_pipeline and do_spectrogram_masking:
        anchors, shapes = fn.external_source(source=cutouts_generator,
                                             num_outputs=2,
                                             cycle=True)
        log_features = fn.erase(log_features,
                                anchor=anchors,
                                shape=shapes,
                                axes=[0, 1],
                                fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()
def rnnt_train_pipe(files, sample_rate, pad_amount=0, preemph_coeff=.97,
                    window_size=.02, window_stride=.01, window="hann",
                    nfeatures=64, nfft=512, frame_splicing_stack=1,
                    frame_splicing_subsample=1, lowfreq=0.0, highfreq=None,
                    normalize_type='per_feature', device='cpu'):
    """Define the audio feature extraction graph and expose every stage.

    Stages, in order: file read, audio decode (downmixed float), optional
    reflect padding, preemphasis, spectrogram, mel filter bank, log
    (decibels), optional frame splicing and normalization. The audio length
    used for the spectrogram length is computed before the audio is moved to
    the GPU.

    Args:
        files: List of files passed to the reader.
        sample_rate: Sample rate used for the window sizes and mel filter bank.
        pad_amount: When positive, reflect-pad the audio by this amount.
        preemph_coeff: Preemphasis coefficient.
        window_size: Forwarded to ``win_args``.
        window_stride: Forwarded to ``win_args``.
        window: Key looked up in ``torch_windows``; an unknown key results in
            ``window_fn=None`` being passed to the spectrogram.
        nfeatures: Number of mel filters.
        nfft: FFT size.
        frame_splicing_stack: Stacking factor for frame splicing.
        frame_splicing_subsample: Subsampling factor for frame splicing.
        lowfreq: Lower frequency bound of the mel filter bank.
        highfreq: Upper frequency bound of the mel filter bank.
        normalize_type: ``'per_feature'`` (axes ``[1]``) or ``'all_features'``
            (axes ``[0, 1]``).
        device: ``'gpu'`` moves the audio to the GPU.

    Returns:
        Tuple ``(norm_log_features, log_features_spliced, log_features,
        mel_spec, spec, preemph_audio, padded_audio, audio)``.
    """
    assert normalize_type in ('per_feature', 'all_features')
    if normalize_type == 'per_feature':
        norm_axes = [1]
    else:
        norm_axes = [0, 1]

    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    if window_fn:
        window_fn_arg = window_fn(win_len, periodic=False).numpy().tolist()
    else:
        window_fn_arg = None

    data, _ = fn.readers.file(files=files, device="cpu", random_shuffle=False,
                              shard_id=0, num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    # Lengths are derived from the CPU audio, before any move to the GPU.
    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0, ))
    needs_padding = pad_amount > 0
    if needs_padding:
        audio_len = orig_audio_len + 2 * pad_amount
    else:
        audio_len = orig_audio_len
    spec_len = audio_len // win_hop + 1

    if device == 'gpu':
        audio = audio.gpu()

    if needs_padding:
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len, pad_amount)
    else:
        padded_audio = audio

    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')
    spec = fn.spectrogram(preemph_audio, nfft=nfft, window_fn=window_fn_arg,
                          window_length=win_len, window_step=win_hop,
                          center_windows=True, reflect_padding=True)
    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, freq_low=lowfreq,
                                  freq_high=highfreq)
    log_features = fn.to_decibels(mel_spec + 1e-20, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=-80)

    needs_splicing = frame_splicing_stack > 1 or frame_splicing_subsample > 1
    if needs_splicing:
        log_features_spliced = dali_frame_splicing_graph(
            log_features, nfeatures, spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    if not normalize_type:
        norm_log_features = log_features_spliced
    else:
        norm_log_features = fn.normalize(log_features_spliced, axes=norm_axes,
                                         device=device, epsilon=4e-5, ddof=1)

    return (norm_log_features, log_features_spliced, log_features, mel_spec,
            spec, preemph_audio, padded_audio, audio)
def dali_jasper_pipe():
    """Define the Jasper speech feature extraction graph.

    This function takes no arguments; every setting is read from the
    enclosing scope (``self`` and the surrounding local variables).

    Stages, in order: input (external source for Triton, file reader
    otherwise), audio decode (optionally with a perturbed sample rate),
    optional silence removal, optional move to GPU, optional dither,
    preemphasis, spectrogram, mel filter bank, log (decibels), normalization,
    padding and optional spectrogram masking.

    Returns:
        Tuple ``(log_features, label, log_features_len)``, all on the GPU.

    Raises:
        AssertionError: If a Triton pipeline is requested for training.
        RuntimeError: If a Triton pipeline is requested while
            ``torch.distributed`` is initialized.
    """
    if is_triton_pipeline:
        assert not self.train, "Pipeline for Triton shall be a validation pipeline"
        if torch.distributed.is_initialized():
            raise RuntimeError(
                "You're creating Triton pipeline, using multi-process mode. Please use single-process mode."
            )
        encoded, label = fn.external_source(device="cpu",
                                            name="DALI_INPUT_0",
                                            no_copy=True)
    else:
        encoded, label = fn.readers.file(device="cpu",
                                         name="file_reader",
                                         file_root=file_root,
                                         file_list=file_list,
                                         shard_id=shard_id,
                                         num_shards=n_shards,
                                         shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [
                self.resample_range[0], 1.0, self.resample_range[1]
            ]
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(
                device="cpu", range=resample_range)

    # Decoder sample rate: perturbed when training with a resample range,
    # fixed when no range is given, and left unspecified otherwise.
    if self.train and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * self.sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = self.sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded,
                                 sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT,
                                 downmix=True)

    if self.do_remove_silence:
        begin, length = fn.nonsilent_region(audio,
                                            cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if self.preprocessing_device == "gpu":
        audio = audio.gpu()

    if self.dither_coeff != 0.:
        # Use the audio as a shape-like input so that independent noise is
        # drawn for every element. Without a shape or shape-like input the
        # operator yields one scalar per sample, which only adds a constant
        # offset to the signal instead of dithering it. The noise is
        # generated on the same device as `audio`.
        audio = audio + fn.random.normal(audio) * self.dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio,
                          nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)
    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=self.nfeatures,
                                  normalize=True)
    log_features = fn.to_decibels(mel_spec,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=math.log(1e-20))

    log_features_len = fn.shapes(log_features)
    if self.frame_splicing_factor != 1:
        log_features_len = self._div_ceil(log_features_len,
                                          self.frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features,
                          axes=[1],
                          fill_value=0,
                          align=pad_align)

    if self.train and self._do_spectrogram_masking():
        anchors, shapes = fn.external_source(
            source=self._cutouts_generator, num_outputs=2, cycle=True)
        log_features = fn.erase(log_features,
                                anchor=anchors,
                                shape=shapes,
                                axes=[0, 1],
                                fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map` in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()