def test_pytorch_iterator_not_fill_last_batch_pad_last_batch():
    """Run two epochs with ``fill_last_batch=False`` and ``last_batch_padded=True``.

    For each epoch the ids returned by ``gather_ids`` must number exactly
    ``data_size``, all be unique, and the mirrored data must contain more
    than one distinct value.
    """
    from nvidia.dali.plugin.pytorch import DALIGenericIterator as PyTorchIterator

    gpu_count = 1
    samples_per_batch = 100

    def build_pipeline(gpu):
        return COCOReaderPipeline(batch_size=samples_per_batch,
                                  num_threads=4,
                                  shard_id=gpu,
                                  num_gpus=gpu_count,
                                  data_paths=data_sets[0],
                                  random_shuffle=False,
                                  stick_to_shard=False,
                                  shuffle_after_epoch=False,
                                  pad_last_batch=True)

    def read_ids(x):
        return x["data"].squeeze().numpy()

    def no_padding(x):
        return 0

    pipes, data_size = create_pipeline(build_pipeline, samples_per_batch, gpu_count)

    dali_train_iter = PyTorchIterator(pipes,
                                      output_map=["data"],
                                      size=pipes[0].epoch_size("Reader"),
                                      fill_last_batch=False,
                                      last_batch_padded=True)

    for epoch in range(2):
        # The iterator is reset only between epochs, never before the first.
        if epoch > 0:
            dali_train_iter.reset()

        ids, unique_ids, mirrored, _, _ = gather_ids(
            dali_train_iter, read_ids, no_padding, data_size)

        # there is no mirroring as data in the output is just cut off,
        # in the mirrored_data there is real data
        assert len(ids) == data_size
        assert len(unique_ids) == data_size
        assert len(set(mirrored)) != 1
class DALIIterableDataset(torchdata.IterableDataset):  # type: ignore
    """Expose a DALI pipeline as a PyTorch ``IterableDataset``.

    The pipeline is created in the constructor, but the DALI iterator is only
    created by :meth:`build`, which therefore has to be called before the
    dataset is iterated.

    Args:
        pipeline_closure: Callable forwarded to ``_DALIDataset``.
        metadata: Sized object; its length is used as the iterator size
            (number of samples per epoch).
        batch_size: Number of samples per batch.
        *args: Extra positional arguments forwarded to ``_DALIDataset``.
        **kwargs: Extra keyword arguments forwarded to ``_DALIDataset``.
    """

    def __init__(self, pipeline_closure: Callable, metadata: Sized, batch_size: int, *args, **kwargs):
        super().__init__()
        self.dali_pipeline = _DALIDataset(pipeline_closure, batch_size, *args, **kwargs)
        self.iterator = None  # created by build()
        self.iter = None  # created by __iter__()
        self.len_metadata = len(metadata)
        self.index = 0  # number of __next__ calls made so far
        self.batch_size = batch_size

    def build(self, variable_names=None):
        """Build the DALI pipeline and wrap it in a ``DALIGenericIterator``.

        Args:
            variable_names: Names given to the pipeline outputs. Defaults to
                ``['inputs', 'labels']``.
        """
        if variable_names is None:
            variable_names = ['inputs', 'labels']
        self.dali_pipeline.build()
        self.iterator = DALIGenericIterator(self.dali_pipeline, variable_names, self.len_metadata)

    def __iter__(self):
        """Reset pipeline and iterator, then return ``self`` as the iterator.

        Raises:
            RuntimeError: If :meth:`build` has not been called yet.
        """
        if self.iterator is None:
            # Without this guard the failure is an opaque AttributeError on None.
            raise RuntimeError(
                "DALIIterableDataset.build() must be called before iterating over the dataset."
            )
        self.dali_pipeline.reset()
        self.iterator.reset()
        self.iter = iter(self.iterator)
        return self

    # TODO: multiple workers, see https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
    def __next__(self):
        """Return the next batch (first pipeline's output dict).

        ``StopIteration`` bubbles up from the wrapped DALI iterator.
        """
        self.index += 1
        batch = next(self.iter)[0]  # default: {'inputs': inputs, 'labels': labels}
        return batch

    def __len__(self):
        """Return the number of batches per epoch.

        Ceiling division is used because the iterator is created with
        ``size=len_metadata`` and keeps producing batches until that many
        samples were served, so a trailing partial batch counts as a batch.
        """
        return (self.len_metadata + self.batch_size - 1) // self.batch_size
def test_pytorch_iterator_last_batch_pad_last_batch():
    """Run two epochs with ``fill_last_batch=True`` on a padded reader.

    For each epoch ``gather_ids`` must return more than ``data_size`` ids,
    exactly ``data_size`` unique ids, and mirrored data holding a single
    distinct value.
    """
    gpu_count = 1
    samples_per_batch = 100

    def build_pipeline(gpu):
        return COCOReaderPipeline(batch_size=samples_per_batch,
                                  num_threads=4,
                                  shard_id=gpu,
                                  num_gpus=gpu_count,
                                  data_paths=data_sets[0],
                                  random_shuffle=True,
                                  stick_to_shard=False,
                                  shuffle_after_epoch=False,
                                  pad_last_batch=True)

    def read_ids(x):
        return x["data"].squeeze().numpy()

    def no_padding(x):
        return 0

    pipes, data_size = create_pipeline(build_pipeline, samples_per_batch, gpu_count)

    dali_train_iter = PyTorchIterator(pipes,
                                      output_map=["data"],
                                      size=pipes[0].epoch_size("Reader"),
                                      fill_last_batch=True)

    for epoch in range(2):
        # The iterator is reset only between epochs, never before the first.
        if epoch > 0:
            dali_train_iter.reset()

        ids, unique_ids, mirrored, _, _ = gather_ids(
            dali_train_iter, read_ids, no_padding, data_size)

        assert len(ids) > data_size
        assert len(unique_ids) == data_size
        assert len(set(mirrored)) == 1
class AudioToCharDALIDataset(Iterator):
    """
    NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line contains a sample
    descriptor in JSON, including audio files, transcripts, and durations (in seconds).
    Here's an example:
    {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
    ...
    {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
    "utterance_id", "ctm_utt": "en_4156", "side": "A"}

    Args:
        manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths.
        device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'.
        batch_size (int): Number of samples in a batch.
        labels: String containing all the possible characters to map to.
        sample_rate (int): Sample rate to resample loaded audio to.
        num_threads (int): Number of CPU processing threads to be created by the DALI pipeline.
        max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files.
        min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files.
        blank_index (int): blank character index, default = -1
        unk_index (int): unk_character index, default = -1
        normalize (bool): whether to normalize transcript text (default): True
        bos_id (int): Id of beginning of sequence symbol to append if not None
        eos_id (int): Id of end of sequence symbol to append if not None
        trim (bool): If True, it will extract the nonsilent region of the loaded audio signal.
        shuffle (bool): If set to True, the dataset will be shuffled after loading.
        drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when
            the shard size is not divisible by the batch size. If set to False and the size of dataset is not
            divisible by the batch size, then the last batch will be smaller.
        parser (str, callable): Name (or callable) handed to ``parsers.make_parser`` to build the text parser.
        device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
        preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and
            AudioToMFCCPreprocessor.

    Raises:
        ModuleNotFoundError: If NVIDIA DALI is not installed.
        ValueError: If ``device``, ``labels`` or any preprocessor parameter has an unsupported value.
    """

    def __init__(
        self,
        manifest_filepath: str,
        device: str,
        batch_size: int,
        labels: Union[str, List[str]],
        sample_rate: int = 16000,
        num_threads: int = 4,
        max_duration: float = 0.0,
        min_duration: float = 0.0,
        blank_index: int = -1,
        unk_index: int = -1,
        normalize: bool = True,
        bos_id: Optional[int] = None,
        eos_id: Optional[int] = None,
        trim: bool = False,
        shuffle: bool = True,
        drop_last: bool = False,
        parser: Union[str, Callable] = 'en',
        device_id: int = 0,
        global_rank: int = 0,
        world_size: int = 1,
        preprocessor_cfg: DictConfig = None,
    ):
        if not HAVE_DALI:
            raise ModuleNotFoundError(
                f"{self} requires NVIDIA DALI to be installed. "
                f"See: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#id1"
            )

        if device not in ('cpu', 'gpu'):
            raise ValueError(
                f"{self} received an unexpected device argument {device}. Supported values are: 'cpu', 'gpu'"
            )

        self.batch_size = batch_size  # Used by NeMo

        self.device = device
        self.device_id = device_id

        # Sharding is only enabled when more than one process takes part.
        if world_size > 1:
            self.shard_id = global_rank
            self.num_shards = world_size
        else:
            self.shard_id = None
            self.num_shards = None

        self.labels = labels
        if self.labels is None or len(self.labels) == 0:
            raise ValueError(f"{self} expects non empty labels list")

        self.parser = parsers.make_parser(
            labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize,
        )

        self.eos_id = eos_id
        self.bos_id = bos_id
        self.sample_rate = sample_rate

        self.pipe = Pipeline(
            batch_size=batch_size,
            num_threads=num_threads,
            device_id=self.device_id,
            exec_async=True,
            exec_pipelined=True,
        )

        has_preprocessor = preprocessor_cfg is not None
        if has_preprocessor:
            if preprocessor_cfg.cls == "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor":
                feature_type = "mel_spectrogram"
            elif preprocessor_cfg.cls == "nemo.collections.asr.modules.AudioToMFCCPreprocessor":
                feature_type = "mfcc"
            else:
                raise ValueError(
                    f"{self} received an unexpected preprocessor configuration: {preprocessor_cfg.cls}."
                    f" Supported preprocessors are: AudioToMelSpectrogramPreprocessor, AudioToMFCCPreprocessor"
                )

            # Default values taken from AudioToMelSpectrogramPreprocessor
            params = preprocessor_cfg.params

            def param(key, default):
                # Same lookup as ``params[key] if key in params else default``.
                return params[key] if key in params else default

            self.dither = param('dither', 0.0)
            self.preemph = param('preemph', 0.97)
            self.window_size_sec = param('window_size', 0.02)
            self.window_stride_sec = param('window_stride', 0.01)
            self.sample_rate = param('sample_rate', sample_rate)
            self.window_size = int(self.window_size_sec * self.sample_rate)
            # The hop length comes from the stride, not from the window size.
            self.window_stride = int(self.window_stride_sec * self.sample_rate)

            normalize_type = param('normalize', 'per_feature')
            if normalize_type == 'per_feature':  # Each freq channel independently
                self.normalization_axes = (1,)
            elif normalize_type == 'all_features':
                self.normalization_axes = (0, 1)
            else:
                raise ValueError(
                    f"{self} received {normalize_type} for the normalize parameter."
                    f" It must be either 'per_feature' or 'all_features'."
                )

            self.window = None
            window_name = param('window', None)
            torch_windows = {
                'hamming': torch.hamming_window,
                'blackman': torch.blackman_window,
                'bartlett': torch.bartlett_window,
            }
            if window_name is None or window_name == 'hann':
                self.window = None  # Hann is DALI's default
            elif window_name == 'ones':
                self.window = torch.ones(self.window_size)
            else:
                try:
                    window_fn = torch_windows[window_name]
                except (KeyError, TypeError):
                    # Unknown (or unhashable) window name.
                    raise ValueError(
                        f"{self} received {window_name} for the window parameter."
                        f" It must be one of: ('hann', 'ones', 'hamming', 'blackman', 'bartlett', None)."
                        f" None is equivalent to 'hann'."
                    ) from None
                self.window = window_fn(self.window_size, periodic=False)
            # NOTE(review): self.window is stored but is not passed to the spectrogram
            # operator below — confirm whether a custom window is meant to be applied.

            self.n_fft = param('n_fft', None)  # None means default
            self.n_mels = param('n_mels', 64)
            self.n_mfcc = param('n_mfcc', 64)

            # 'features' overrides the filter/coefficient count of the selected feature type.
            features = param('features', 0)
            if features > 0:
                if feature_type == 'mel_spectrogram':
                    self.n_mels = features
                elif feature_type == 'mfcc':
                    self.n_mfcc = features

            # TODO Implement frame splicing
            if 'frame_splicing' in params:
                assert params['frame_splicing'] == 1, "Frame splicing is not implemented"

            self.freq_low = param('lowfreq', 0.0)
            self.freq_high = param('highfreq', self.sample_rate / 2.0)
            self.log_features = param('log', True)

            # We want to avoid taking the log of zero
            # There are two options: either adding or clamping to a small value

            self.log_zero_guard_type = param('log_zero_guard_type', 'add')
            if self.log_zero_guard_type not in ["add", "clamp"]:
                raise ValueError(
                    f"{self} received {self.log_zero_guard_type} for the "
                    f"log_zero_guard_type parameter. It must be either 'add' or "
                    f"'clamp'."
                )

            self.log_zero_guard_value = param('log_zero_guard_value', 1e-05)
            if isinstance(self.log_zero_guard_value, str):
                if self.log_zero_guard_value == "tiny":
                    self.log_zero_guard_value = torch.finfo(torch.float32).tiny
                elif self.log_zero_guard_value == "eps":
                    self.log_zero_guard_value = torch.finfo(torch.float32).eps
                else:
                    raise ValueError(
                        f"{self} received {self.log_zero_guard_value} for the log_zero_guard_value parameter."
                        f" It must be either a number, 'tiny', or 'eps'"
                    )

            self.mag_power = param('mag_power', 2)
            if self.mag_power != 1.0 and self.mag_power != 2.0:
                raise ValueError(
                    f"{self} received {self.mag_power} for the mag_power parameter." f" It must be either 1.0 or 2.0."
                )

            self.pad_to = param('pad_to', 16)
            self.pad_value = param('pad_value', 0.0)

        with self.pipe:
            audio, transcript = dali.fn.nemo_asr_reader(
                name="Reader",
                manifest_filepaths=manifest_filepath.split(','),
                dtype=dali.types.FLOAT,
                downmix=True,
                sample_rate=float(self.sample_rate),
                min_duration=min_duration,
                max_duration=max_duration,
                read_sample_rate=False,
                read_text=True,
                random_shuffle=shuffle,
                shard_id=self.shard_id,
                num_shards=self.num_shards,
                pad_last_batch=True,
            )

            # Record the per-sample transcript length before padding the batch.
            transcript_len = dali.fn.shapes(dali.fn.reshape(transcript, shape=[-1]))
            transcript = dali.fn.pad(transcript)

            # Extract nonsilent region, if necessary
            if trim:
                # Need to extract non-silent region before moving to the GPU
                roi_start, roi_len = dali.fn.nonsilent_region(audio, cutoff_db=-60)
                audio = audio.gpu() if self.device == 'gpu' else audio
                audio = dali.fn.slice(
                    audio, roi_start, roi_len, normalized_anchor=False, normalized_shape=False, axes=[0]
                )
            else:
                audio = audio.gpu() if self.device == 'gpu' else audio

            if not has_preprocessor:
                # No preprocessing, the output is the audio signal.
                # The length must be taken before padding; afterwards every sample
                # would report the padded length.
                audio_len = dali.fn.shapes(dali.fn.reshape(audio, shape=[-1]))
                audio = dali.fn.pad(audio)
                self.pipe.set_outputs(audio, audio_len, transcript, transcript_len)
            else:
                # Additive gaussian noise (dither)
                if self.dither > 0.0:
                    gaussian_noise = dali.fn.normal_distribution(device=self.device)
                    audio = audio + self.dither * gaussian_noise

                # Preemphasis filter
                if self.preemph > 0.0:
                    audio = dali.fn.preemphasis_filter(audio, preemph_coeff=self.preemph)

                # Power spectrogram
                spec = dali.fn.spectrogram(
                    audio, nfft=self.n_fft, window_length=self.window_size, window_step=self.window_stride
                )

                if feature_type == 'mel_spectrogram' or feature_type == 'mfcc':
                    # Spectrogram to Mel Spectrogram
                    spec = dali.fn.mel_filter_bank(
                        spec,
                        sample_rate=self.sample_rate,
                        nfilter=self.n_mels,
                        normalize=True,
                        freq_low=self.freq_low,
                        freq_high=self.freq_high,
                    )
                    # Mel Spectrogram to MFCC
                    if feature_type == 'mfcc':
                        spec = dali.fn.mfcc(spec, n_mfcc=self.n_mfcc)

                # Logarithm
                if self.log_zero_guard_type == 'add':
                    spec = spec + self.log_zero_guard_value

                spec = dali.fn.to_decibels(
                    spec, multiplier=math.log(10), reference=1.0, cutoff_db=math.log(self.log_zero_guard_value)
                )

                # Normalization
                spec = dali.fn.normalize(spec, axes=self.normalization_axes)

                # Extracting the length of the spectrogram
                shape_start = dali.types.Constant(np.array([1], dtype=np.float32), device='cpu')
                shape_len = dali.types.Constant(np.array([1], dtype=np.float32), device='cpu')
                spec_len = dali.fn.slice(
                    dali.fn.shapes(spec),
                    shape_start,
                    shape_len,
                    normalized_anchor=False,
                    normalized_shape=False,
                    axes=(0,),
                )

                # Pads feature dimension to be a multiple of `pad_to` and the temporal dimension to be as big as
                # the largest sample (shape -1)
                spec = dali.fn.pad(
                    spec, fill_value=self.pad_value, axes=(0, 1), align=(self.pad_to, 1), shape=(1, -1)
                )
                self.pipe.set_outputs(spec, spec_len, transcript, transcript_len)

        # Building DALI pipeline
        self.pipe.build()

        if has_preprocessor:
            output_names = ['processed_signal', 'processed_signal_len', 'transcript_raw', 'transcript_raw_len']
        else:
            output_names = ['audio', 'audio_len', 'transcript_raw', 'transcript_raw_len']

        last_batch_policy = LastBatchPolicy.DROP if drop_last else LastBatchPolicy.PARTIAL
        self._iter = DALIPytorchIterator(
            [self.pipe],
            output_map=output_names,
            reader_name="Reader",
            last_batch_policy=last_batch_policy,
            dynamic_shape=True,
            auto_reset=True,
        )

        # TODO come up with a better solution
        class DummyDataset:
            """Minimal stand-in whose length is the parent's ``size``."""

            def __init__(self, parent):
                self.parent = parent

            def __len__(self):
                return self.parent.size

        self.dataset = DummyDataset(self)  # Used by NeMo

    def reset(self):
        """Reset the wrapped DALI iterator."""
        self._iter.reset()

    def __iter__(self):
        return self

    def next(self):
        """Alias of ``__next__``."""
        return self.__next__()

    @property
    def size(self):
        """Size reported by the wrapped DALI iterator."""
        return self._iter.size

    def __len__(self):
        return len(self._iter)

    def __next__(self):
        """Fetch the next DALI batch and attach tokenized transcripts.

        The raw transcript bytes of every sample are decoded as UTF-8, run
        through ``self.parser``, optionally wrapped with ``bos_id``/``eos_id``
        and stored zero-padded under ``'transcript'`` with the token counts
        under ``'transcript_len'``.

        Returns:
            DALIOutputs: Wrapper around the updated output dictionary.
        """
        outputs = self._iter.next()
        assert len(outputs) == 1
        out = outputs[0]
        text_raw_len = out['transcript_raw_len'].numpy()
        text_raw = out['transcript_raw'].numpy()

        text_tokens = []
        text_tokens_len = []
        max_len = 0
        batch_size = text_raw.shape[0]
        for i, text in enumerate(text_raw):
            # Only the first n bytes are real text; the rest is batch padding.
            n = text_raw_len[i][0]
            tbytes = str(text[:n].tobytes(), encoding='utf8')
            ttokens = self.parser(tbytes)
            if self.bos_id is not None:
                ttokens = [self.bos_id] + ttokens
            if self.eos_id is not None:
                ttokens = ttokens + [self.eos_id]
            ttokens_len = len(ttokens)
            text_tokens_len.append(ttokens_len)
            text_tokens.append(ttokens)
            if ttokens_len > max_len:
                max_len = ttokens_len

        transcript_out = torch.zeros(batch_size, max_len, dtype=torch.long)
        for i, (tokens, n) in enumerate(zip(text_tokens, text_tokens_len)):
            transcript_out[i, :n] = torch.tensor(tokens, dtype=torch.long)
        transcript_len_out = torch.tensor(text_tokens_len, dtype=torch.long)

        out['transcript'] = transcript_out
        out['transcript_len'] = transcript_len_out
        return DALIOutputs(out)
def train(self):
    """Run the training loop from ``self.start_steps`` up to ``self.num_steps``.

    Each step pulls a batch from a ``DALIGenericIterator`` wrapped around
    ``self.training_loader``, normalizes it, runs the model, optimizes the sum
    of the MSE reconstruction error and the VQ loss, and updates the meters.
    A final checkpoint is written after the loop.
    """
    self.model.train()
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    meter_loss = AverageMeter('Loss', ':.4e')
    meter_loss_constr = AverageMeter('Constr', ':6.2f')
    meter_loss_perp = AverageMeter('Perplexity', ':6.2f')
    progress = ProgressMeter(
        self.training_loader.epoch_size()['__Video_0'], [
            batch_time, data_time, meter_loss, meter_loss_constr,
            meter_loss_perp
        ],
        prefix="Steps: [{}]".format(self.num_steps))

    data_iter = DALIGenericIterator(self.training_loader, ['data'],
                                    auto_reset=True)
    end = time.time()

    for i in range(self.start_steps, self.num_steps):
        # measure output loading time
        data_time.update(time.time() - end)

        # When the iterator is exhausted, reset it and take the first batch
        # of the next pass so the step count is not tied to epoch boundaries.
        try:
            images = next(data_iter)[0]['data']
        except StopIteration:
            data_iter.reset()
            images = next(data_iter)[0]['data']

        images = images.to('cuda')
        # The batch is treated as a 5-D tensor laid out 'b d h w c'
        # (see the rearrange pattern below).
        b, d, _, _, c = images.size()
        # Fold d into the batch axis and move channels first for normalization.
        images = rearrange(images, 'b d h w c -> (b d) c h w')
        images = self.normalize(images.float() / 255.)
        # Unfold again, this time stacking d and c into a single channel axis.
        images = rearrange(images,
                           '(b d) c h w -> b (d c) h w',
                           b=b,
                           d=d,
                           c=c)

        self.optimizer.zero_grad()

        vq_loss, images_recon, perplexity = self.model(images)
        recon_error = F.mse_loss(images_recon, images)
        loss = recon_error + vq_loss
        loss.backward()

        self.optimizer.step()

        meter_loss_constr.update(recon_error.item(), 1)
        meter_loss_perp.update(perplexity.item(), 1)
        meter_loss.update(loss.item(), 1)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 20 == 0:
            progress.display(i)

        # NOTE(review): the nesting of the statements below under the
        # `i % 1000` check was reconstructed from a whitespace-collapsed
        # listing — confirm against the original layout (in particular
        # whether the scheduler step and the logging run every step).
        if i % 1000 == 0:
            print('saving ...')
            save_checkpoint(
                self.folder_name, {
                    'steps': i,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'scheduler': self.scheduler.state_dict()
                }, 'checkpoint%s.pth.tar' % i)

            self.scheduler.step()

            # Split the stacked channel axis back into (d, c) for visualization.
            images, images_recon = map(
                lambda t: rearrange(
                    t, 'b (d c) h w -> b d c h w', b=b, d=d, c=c),
                [images, images_recon])

            # Only the first sample of the batch is visualized.
            images_orig, images_recs = train_visualize(
                unnormalize=self.unnormalize,
                images=images[0, :self.n_images_save],
                n_images=self.n_images_save,
                image_recs=images_recon[0, :self.n_images_save])

            save_images(file_name=os.path.join(self.path_img_orig,
                                               f'image_{i}.png'),
                        image=images_orig)
            save_images(file_name=os.path.join(self.path_img_recs,
                                               f'image_{i}.png'),
                        image=images_recs)

            if self.run_wandb:
                logs = {
                    'iter': i,
                    'loss_recs': meter_loss_constr.val,
                    'loss': meter_loss.val,
                    'lr': self.scheduler.get_last_lr()[0]
                }
                self.run_wandb.log(logs)

    # Final checkpoint once all steps are done.
    print('saving ...')
    save_checkpoint(
        self.folder_name, {
            'steps': self.num_steps,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
        }, 'checkpoint%s.pth.tar' % self.num_steps)
errG = GAN_factor * errG_GAN + L1_factor * errG_L1 + L2_factor * errG_L2 loss_L1 += errG_L1.item() loss_L2 += errG_L2.item() loss_gan += errG_GAN.item() errG.backward() # Update G optimizerG.step() if epoch == 0: print('First training epoch completed in ', (time.time() - start_time), ' seconds') # reset the DALI iterator train_pipe_loader.reset() # Print the absolute values of three losses to screen: print('[%d/30] Training absolute losses: L1 %.7f ; L2 %.7f BCE %.7f' % ( (epoch + 1), loss_L1 / m_train, loss_L2 / m_train, loss_gan / m_train, )) # Save the inputs, outputs, and ground truth frontals to files: vutils.save_image(profile.data, 'output/%03d_input.jpg' % epoch, normalize=True) vutils.save_image(real.data, 'output/%03d_real.jpg' % epoch,
class _AudioTextDALIDataset(Iterator):
    """
    NVIDIA DALI pipeline that loads tensors via one or more manifest files where each line contains a sample
    descriptor in JSON, including audio files, transcripts, and durations (in seconds).
    Here's an example:
    {"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
    ...
    {"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
    "utterance_id", "ctm_utt": "en_4156", "side": "A"}

    Args:
        manifest_filepath: Path to manifest file with the format described above. Can be comma-separated paths.
        device (str): Determines the device type to be used for preprocessing. Allowed values are: 'cpu', 'gpu'.
        batch_size (int): Number of samples in a batch.
        parser (str, callable): A str for an inbuilt parser, or a callable with signature f(str) -> List[int].
        audio_tar_filepaths: Optional path(s) of tarred audio archives. Must be given together with
            ``audio_tar_index_filepaths``; when both are None the manifest-based reader is used.
        audio_tar_index_filepaths: Optional path(s) of the index files matching ``audio_tar_filepaths``.
        sample_rate (int): Sample rate to resample loaded audio to.
        num_threads (int): Number of CPU processing threads to be created by the DALI pipeline.
        max_duration (float): Determines the maximum allowed duration, in seconds, of the loaded audio files.
        min_duration (float): Determines the minimum allowed duration, in seconds, of the loaded audio files.
        bos_id (int): Id of beginning of sequence symbol to append if not None
        eos_id (int): Id of end of sequence symbol to append if not None
        pad_id (int): Id used to pad the input. Defaults to 0 if not provided.
        trim (bool): If True, it will extract the nonsilent region of the loaded audio signal.
        shuffle (bool): If set to True, the dataset will be shuffled after loading.
        drop_last (bool): If set to True, the last batch will be dropped if incomplete. This will be the case when
            the shard size is not divisible by the batch size. If set to False and the size of dataset is not
            divisible by the batch size, then the last batch will be smaller.
        shard_strategy (str): Forwarded to ``expand_audio_filepaths`` for tarred datasets. Defaults to "scatter".
        device_id (int): Index of the GPU to be used (local_rank). Only applicable when device == 'gpu'. Defaults to 0.
        global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
        preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and
            AudioToMFCCPreprocessor.
        return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).

    Raises:
        ModuleNotFoundError: If NVIDIA DALI is not installed.
        ValueError: If ``return_sample_id`` is set, or ``device`` or a preprocessor parameter is unsupported.
        RuntimeError: If only one of ``audio_tar_filepaths`` / ``audio_tar_index_filepaths`` is provided.
    """

    def __init__(
        self,
        manifest_filepath: str,
        device: str,
        batch_size: int,
        parser: Union[str, Callable],
        audio_tar_filepaths: Optional[Union[str, List[str]]] = None,
        audio_tar_index_filepaths: Optional[Union[str, List[str]]] = None,
        sample_rate: int = 16000,
        num_threads: int = 4,
        max_duration: float = 0.0,
        min_duration: float = 0.0,
        bos_id: Optional[int] = None,
        eos_id: Optional[int] = None,
        pad_id: int = 0,
        trim: bool = False,
        shuffle: bool = False,
        drop_last: bool = False,
        shard_strategy: str = "scatter",
        device_id: int = 0,
        global_rank: int = 0,
        world_size: int = 1,
        preprocessor_cfg: DictConfig = None,
        return_sample_id: bool = False,
    ):
        self.drop_last = drop_last  # used by lr_scheduler
        if return_sample_id:
            raise ValueError(
                "Currently DALI data layers don't support returning the sample_id and return_sample_id can not be enabled."
            )
        self.return_sample_id = return_sample_id

        if not HAVE_DALI:
            raise ModuleNotFoundError(
                f"{self} requires NVIDIA DALI to be installed. "
                f"See: https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#id1"
            )

        if device not in ('cpu', 'gpu'):
            raise ValueError(
                f"{self} received an unexpected device argument {device}. Supported values are: 'cpu', 'gpu'"
            )

        # A device id is only handed to the pipeline for GPU processing.
        device_id = device_id if device == 'gpu' else None

        self.batch_size = batch_size  # Used by NeMo

        self.device = device
        self.device_id = device_id

        # Sharding is only enabled when more than one process takes part.
        if world_size > 1:
            self.shard_id = global_rank
            self.num_shards = world_size
        else:
            self.shard_id = None
            self.num_shards = None

        self.eos_id = eos_id
        self.bos_id = bos_id
        self.sample_rate = sample_rate

        self.pipe = Pipeline(
            batch_size=batch_size,
            num_threads=num_threads,
            device_id=self.device_id,
            exec_async=True,
            exec_pipelined=True,
        )

        has_preprocessor = preprocessor_cfg is not None
        if has_preprocessor:
            if preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor":
                feature_type = "mel_spectrogram"
            elif preprocessor_cfg._target_ == "nemo.collections.asr.modules.AudioToMFCCPreprocessor":
                feature_type = "mfcc"
            else:
                raise ValueError(
                    f"{self} received an unexpected preprocessor configuration: {preprocessor_cfg._target_}."
                    f" Supported preprocessors are: AudioToMelSpectrogramPreprocessor, AudioToMFCCPreprocessor"
                )

            # Default values taken from AudioToMelSpectrogramPreprocessor
            params = preprocessor_cfg

            def param(key, default):
                # Same lookup as ``params[key] if key in params else default``.
                return params[key] if key in params else default

            self.dither = param('dither', 0.0)
            self.preemph = param('preemph', 0.97)
            self.window_size_sec = param('window_size', 0.02)
            self.window_stride_sec = param('window_stride', 0.01)
            self.sample_rate = param('sample_rate', sample_rate)
            self.window_size = int(self.window_size_sec * self.sample_rate)
            self.window_stride = int(self.window_stride_sec * self.sample_rate)

            normalize = param('normalize', 'per_feature')
            if normalize == 'per_feature':  # Each freq channel independently
                self.normalization_axes = (1,)
            elif normalize == 'all_features':
                self.normalization_axes = (0, 1)
            else:
                raise ValueError(
                    f"{self} received {normalize} for the normalize parameter."
                    f" It must be either 'per_feature' or 'all_features'."
                )

            self.window = None
            window_name = param('window', 'hann')
            torch_windows = {
                'hann': torch.hann_window,
                'hamming': torch.hamming_window,
                'blackman': torch.blackman_window,
                'bartlett': torch.bartlett_window,
                'none': None,
            }

            if window_name == 'ones':
                window_tensor = torch.ones(self.window_size)
            elif window_name is None:
                # An explicit None keeps the operator's default window.
                window_tensor = None
            else:
                # dict.get() never raises, so unknown names used to slip through
                # silently; index the table so they are rejected.
                try:
                    window_fn = torch_windows[window_name]
                except (KeyError, TypeError):
                    raise ValueError(
                        f"{self} received '{window_name}' for the window parameter."
                        f" It must be one of: ('hann', 'ones', 'hamming', 'blackman', 'bartlett', None)."
                        f" None is equivalent to 'hann'."
                    ) from None
                window_tensor = window_fn(self.window_size, periodic=False) if window_fn else None
            # The window is handed to the spectrogram operator as a plain list of floats.
            self.window = window_tensor.numpy().tolist() if window_tensor is not None else None

            # Default FFT size: next power of two >= window size (computed lazily,
            # only when 'n_fft' is not configured).
            self.n_fft = params['n_fft'] if 'n_fft' in params else 2 ** math.ceil(math.log2(self.window_size))
            self.n_mels = param('n_mels', 64)
            self.n_mfcc = param('n_mfcc', 64)

            # 'features' overrides the filter/coefficient count of the selected feature type.
            features = param('features', 0)
            if features > 0:
                if feature_type == 'mel_spectrogram':
                    self.n_mels = features
                elif feature_type == 'mfcc':
                    self.n_mfcc = features

            # TODO Implement frame splicing
            if 'frame_splicing' in params:
                assert params['frame_splicing'] == 1, "Frame splicing is not implemented"

            self.freq_low = param('lowfreq', 0.0)
            self.freq_high = param('highfreq', self.sample_rate / 2.0)
            self.log_features = param('log', True)

            # We want to avoid taking the log of zero
            # There are two options: either adding or clamping to a small value

            self.log_zero_guard_type = param('log_zero_guard_type', 'add')
            if self.log_zero_guard_type not in ["add", "clamp"]:
                raise ValueError(
                    f"{self} received {self.log_zero_guard_type} for the "
                    f"log_zero_guard_type parameter. It must be either 'add' or "
                    f"'clamp'."
                )

            self.log_zero_guard_value = param('log_zero_guard_value', 2 ** -24)
            if isinstance(self.log_zero_guard_value, str):
                if self.log_zero_guard_value == "tiny":
                    self.log_zero_guard_value = torch.finfo(torch.float32).tiny
                elif self.log_zero_guard_value == "eps":
                    self.log_zero_guard_value = torch.finfo(torch.float32).eps
                else:
                    raise ValueError(
                        f"{self} received {self.log_zero_guard_value} for the log_zero_guard_value parameter."
                        f" It must be either a number, 'tiny', or 'eps'"
                    )

            self.mag_power = param('mag_power', 2)
            if self.mag_power != 1.0 and self.mag_power != 2.0:
                raise ValueError(
                    f"{self} received {self.mag_power} for the mag_power parameter." f" It must be either 1.0 or 2.0."
                )

            self.pad_to = max(params['pad_to'], 1) if 'pad_to' in params else 16
            self.pad_value = param('pad_value', 0.0)

        with self.pipe:
            if audio_tar_filepaths is None and audio_tar_index_filepaths is None:
                # Loose-file dataset: read audio and manifest indices from the manifest.
                audio, indices = dali.fn.readers.nemo_asr(
                    name="Reader",
                    manifest_filepaths=manifest_filepath.split(','),
                    dtype=dali.types.FLOAT,
                    downmix=True,
                    sample_rate=float(self.sample_rate),
                    min_duration=min_duration,
                    max_duration=max_duration,
                    read_sample_rate=False,
                    read_text=False,
                    read_idxs=True,
                    random_shuffle=shuffle,
                    shard_id=self.shard_id,
                    num_shards=self.num_shards,
                    pad_last_batch=True,
                )

                self.is_tarred_dataset = False

            elif audio_tar_filepaths is not None and audio_tar_index_filepaths is not None:
                # Tarred dataset: read wav entries from the archives.
                audio_tar_filepaths = expand_audio_filepaths(
                    audio_tar_filepaths, shard_strategy=shard_strategy, world_size=world_size, global_rank=global_rank
                )
                audio_tar_index_filepaths = expand_audio_filepaths(
                    audio_tar_index_filepaths,
                    shard_strategy=shard_strategy,
                    world_size=world_size,
                    global_rank=global_rank,
                )

                if len(audio_tar_filepaths) != len(audio_tar_index_filepaths) and len(audio_tar_index_filepaths) != 0:
                    raise ValueError(
                        f"Number of filepaths provided for `audio_tar_filepaths` must match "
                        f"`audio_tar_index_filepaths`. Got {len(audio_tar_filepaths)} audio_tar_filepaths and "
                        f"{len(audio_tar_index_filepaths)} audio_tar_index_filepaths."
                    )

                tar_file = dali.fn.readers.webdataset(
                    paths=audio_tar_filepaths,
                    index_paths=audio_tar_index_filepaths,
                    name="Reader",
                    ext=["wav"],
                    missing_component_behavior="error",
                    random_shuffle=shuffle,
                    shard_id=self.shard_id,
                    num_shards=self.num_shards,
                    pad_last_batch=True,
                )
                audio, _ = dali.fn.decoders.audio(
                    tar_file, dtype=dali.types.FLOAT, downmix=True, sample_rate=float(self.sample_rate),
                )
                # The "source_info" property is used as the per-sample index; __next__
                # extracts the file name from it.
                indices = dali.fn.get_property(tar_file, key="source_info")
                indices = dali.fn.pad(indices)

                self.is_tarred_dataset = True

            else:
                raise RuntimeError(
                    "When using DALI datasets, either `audio_tar_filepaths` "
                    "and `audio_tar_index_filepaths` should either both be None (sequential dataset)"
                    "or provided (tarred dataset)."
                )

            # Extract nonsilent region, if necessary
            if trim:
                # Need to extract non-silent region before moving to the GPU
                roi_start, roi_len = dali.fn.nonsilent_region(audio, cutoff_db=-60)
                audio = audio.gpu() if self.device == 'gpu' else audio
                audio = dali.fn.slice(
                    audio, roi_start, roi_len, normalized_anchor=False, normalized_shape=False, axes=[0]
                )
            else:
                audio = audio.gpu() if self.device == 'gpu' else audio

            if not has_preprocessor:
                # No preprocessing, the output is the audio signal
                # (length is taken before the batch is padded).
                audio_len = dali.fn.shapes(dali.fn.reshape(audio, shape=[-1]))
                audio = dali.fn.pad(audio)
                self.pipe.set_outputs(audio, audio_len, indices)
            else:
                # Additive gaussian noise (dither)
                if self.dither > 0.0:
                    gaussian_noise = dali.fn.random.normal(audio)
                    audio = audio + self.dither * gaussian_noise

                # Preemphasis filter
                if self.preemph > 0.0:
                    audio = dali.fn.preemphasis_filter(audio, preemph_coeff=self.preemph, border='zero')

                # Power spectrogram
                spec = dali.fn.spectrogram(
                    audio,
                    nfft=self.n_fft,
                    window_length=self.window_size,
                    window_step=self.window_stride,
                    window_fn=self.window,
                )

                if feature_type == 'mel_spectrogram' or feature_type == 'mfcc':
                    # Spectrogram to Mel Spectrogram
                    spec = dali.fn.mel_filter_bank(
                        spec,
                        sample_rate=self.sample_rate,
                        nfilter=self.n_mels,
                        normalize=True,
                        freq_low=self.freq_low,
                        freq_high=self.freq_high,
                    )
                    # Mel Spectrogram to MFCC
                    if feature_type == 'mfcc':
                        spec = dali.fn.mfcc(spec, n_mfcc=self.n_mfcc)

                # Logarithm
                if self.log_zero_guard_type == 'add':
                    spec = spec + self.log_zero_guard_value

                spec = dali.fn.to_decibels(
                    spec, multiplier=math.log(10), reference=1.0, cutoff_db=math.log(self.log_zero_guard_value)
                )

                # Normalization
                spec = dali.fn.normalize(spec, axes=self.normalization_axes, epsilon=1e-5 ** 2, ddof=1)

                # Extracting the length of the spectrogram
                spec_len = dali.fn.slice(dali.fn.shapes(spec), 1, 1, axes=(0,))

                # Pads feature dimension to be a multiple of `pad_to` and the temporal dimension to be as big as
                # the largest sample (shape -1)
                spec = dali.fn.pad(
                    spec, fill_value=self.pad_value, axes=(0, 1), align=(self.pad_to, 1), shape=(1, -1)
                )
                self.pipe.set_outputs(spec, spec_len, indices)

        x = time.time()
        # Building DALI pipeline
        self.pipe.build()
        y = time.time()

        logging.info(f"Time for pipe.build() : {(y - x)} seconds")

        if has_preprocessor:
            output_names = ['processed_signal', 'processed_signal_len', 'manifest_indices']
        else:
            output_names = ['audio', 'audio_len', 'manifest_indices']

        x = time.time()
        last_batch_policy = LastBatchPolicy.DROP if drop_last else LastBatchPolicy.PARTIAL
        self._iter = DALIPytorchIterator(
            [self.pipe],
            output_map=output_names,
            reader_name="Reader",
            last_batch_policy=last_batch_policy,
            dynamic_shape=True,
            auto_reset=True,
        )
        y = time.time()
        logging.info(f"Time for DALIPytorchIterator to initialize : {(y - x)} seconds")

        # TODO come up with a better solution
        class DummyDataset:
            """Minimal stand-in whose length is the parent's ``size``."""

            def __init__(self, parent):
                self.parent = parent

            def __len__(self):
                return self.parent.size

        self.dataset = DummyDataset(self)  # Used by NeMo

        x = time.time()
        self.manifest_processor = ASRManifestProcessor(
            manifest_filepath=manifest_filepath,
            parser=parser,
            max_duration=max_duration,
            min_duration=min_duration,
            max_utts=0,
            bos_id=bos_id,
            eos_id=eos_id,
            pad_id=pad_id,
            index_by_file_id=self.is_tarred_dataset,
        )
        y = time.time()
        logging.info(f"Time to build nemo manifest processor - {(y - x)} seconds")

    def reset(self):
        """Reset the wrapped DALI iterator."""
        self._iter.reset()

    def __iter__(self):
        return self

    def next(self):
        """Alias of ``__next__``."""
        return self.__next__()

    @property
    def size(self):
        """Size reported by the wrapped DALI iterator."""
        return self._iter.size

    def __len__(self):
        return len(self._iter)

    def __next__(self):
        """Fetch the next DALI batch and attach the matching transcripts.

        The signal outputs present in the batch are cloned, and each sample's
        manifest index is resolved to its tokens through
        ``self.manifest_processor``. The tokens are stored under
        ``'transcript'`` (padded with the processor's ``pad_id``) and their
        lengths under ``'transcript_len'``.

        Returns:
            DALIOutputs: Wrapper around the assembled output dictionary.
        """
        outputs = self._iter.next()
        assert len(outputs) == 1
        dali_out = outputs[0]
        manifest_indices = dali_out['manifest_indices'].numpy()

        out = {}
        out_names = ['processed_signal', 'processed_signal_len', 'audio', 'audio_len']
        for out_name in out_names:
            if out_name in dali_out:
                out[out_name] = dali_out[out_name].detach().clone()

        text_tokens = []
        text_tokens_len = []
        max_len = 0
        batch_size = manifest_indices.shape[0]
        for manifest_index in manifest_indices:

            if not self.is_tarred_dataset:
                # Loose-file dataset. Index is integer based.
                manifest_index = manifest_index[0]
                text, text_length = self.manifest_processor.process_text_by_id(manifest_index)
            else:
                # Tarred-file dataset. Index is filename based.
                resolved_manifest_indices = manifest_index.tobytes().decode().split(":")
                resolved_manifest_index = resolved_manifest_indices[2]  # we require just the filename segment
                resolved_manifest_index = os.path.splitext(resolved_manifest_index)[0]  # we dont need file extension
                text, text_length = self.manifest_processor.process_text_by_file_id(resolved_manifest_index)

            text_tokens_len.append(text_length)
            text_tokens.append(text)
            if text_length > max_len:
                max_len = text_length

        transcript_out = torch.full(
            [batch_size, max_len], fill_value=self.manifest_processor.pad_id, dtype=torch.long
        )
        for i, (tokens, n) in enumerate(zip(text_tokens, text_tokens_len)):
            transcript_out[i, :n] = torch.tensor(tokens, dtype=torch.long)
        transcript_len_out = torch.tensor(text_tokens_len, dtype=torch.long)

        out['transcript'] = transcript_out
        out['transcript_len'] = transcript_len_out
        return DALIOutputs(out)
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 using a DALI input pipeline.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``data``, ``batch_size``, ``seed``, ``checkpoint``, ``iteration``,
            ``epochs``, ``evaluation``, ``no_save`` and ``threshold``.

    Returns:
        ``True`` as soon as ``coco_eval`` returns a truthy value at one of the
        iterations listed in ``args.evaluation``; ``False`` when no GPU is
        available or when every epoch finishes without that happening.
    """
    from coco import COCO

    # Check that GPUs are actually available
    if not torch.cuda.is_available():
        print("Error. No GPU available.")
        return False

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_pipe = COCOPipeline(args.batch_size, train_coco_root,
                              train_annotate, dboxes, args.seed)
    train_pipe.build()
    train_loader = DALIGenericIterator(train_pipe,
                                       ["images", "boxes", "labels"],
                                       train_pipe.epoch_size("Reader"))

    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # NOTE(review): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    ssd300.cuda()
    loss_func = Loss(dboxes)
    loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    # Iteration number -> (new learning rate, ordinal shown in the log line).
    lr_decay_steps = {160000: (1e-4, 1), 200000: (1e-5, 2)}

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    # NOTE(review): mean/std are not used further down in this function.
    mean, std = generate_mean_std()

    data_perf = AverageMeter()
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        for data in train_loader:
            batch = data[0]
            img = batch["images"]
            bbox = batch["boxes"]
            label = batch["labels"]

            boxes_in_batch = len(label.nonzero())
            if boxes_in_batch == 0:
                print("No labels in batch")
                continue

            label = label.type(torch.cuda.LongTensor)
            img = Variable(img, requires_grad=True)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)

            data_perf.update(time.time() - end)

            decay = lr_decay_steps.get(iter_num)
            if decay is not None:
                current_lr, decay_no = decay
                print("")
                print("lr decay step #{}".format(decay_no))
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            ploc, plabel = ssd300(img)
            loss = loss_func(ploc, plabel, gloc, glabel)
            loss_value = loss.item()

            # Keep an infinite loss out of the running average.
            if not np.isinf(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            optim.zero_grad()
            loss.backward()
            optim.step()

            batch_perf.update(time.time() - end)

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # torch.save does not create missing directories.
                    os.makedirs("./models", exist_ok=True)
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                try:
                    if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                                 args.threshold, epoch, iter_num):
                        return True
                except Exception as exc:
                    # Evaluation stays best-effort, but Ctrl-C / SystemExit
                    # now propagate and the failure cause is reported.
                    print("Eval error on iteration {0}: {1!r}".format(
                        iter_num, exc))

            print(("Iteration: {:6d}, Loss function: {:5.3f}, "
                   "Average Loss: {:.3f}, Data perf: {:3f} img/sec, "
                   "Batch perf: {:3f} img/sec, "
                   "Avg Data perf: {:3f} img/sec, "
                   "Avg Batch perf: {:3f} img/sec").format(
                       iter_num, loss_value, avg_loss,
                       args.batch_size / data_perf.val,
                       args.batch_size / batch_perf.val,
                       args.batch_size / data_perf.avg,
                       args.batch_size / batch_perf.avg),
                  end="\r")

            end = time.time()
            iter_num += 1
            # Reset both meters once iter_num reaches 10 in the first epoch.
            if iter_num == 10 and epoch == 0:
                data_perf.reset()
                batch_perf.reset()

        train_loader.reset()

    print("\n\n")
    print(("Training end: Data perf: {:3f} img/sec, "
           "Batch perf: {:3f} img/sec, Total time: {:3f} sec").format(
               args.batch_size / data_perf.avg,
               args.batch_size / batch_perf.avg,
               time.time() - train_start))
    return False
seq_len=len(mp4_ims), step=1) else: pipe1 = OFPipeline(batch_size=1, num_threads=4, device_id=gpu_id, data=tf.name, seq_len=3, step=2) pipe1.build() dali_loader = DALIGenericIterator(pipe1, ["rgb", "optic_flow", "labels"], pipe1.epoch_size()['Reader'], fill_last_batch=True, last_batch_padded=True) dali_loader.reset() # Reset the loader if load_full: for out in dali_loader: print('loaded %s' % mp4_fn) else: inputs_list_of = [] inputs_list_rgb = [] for out in dali_loader: curr_rgb = out[0]['rgb'][:, 1:, :, :, :].clone().detach() curr_of = out[0]['optic_flow'][:, :, :, :, :].clone().detach() inputs_list_rgb.append(curr_rgb) # Middle frame inputs_list_of.append(curr_of) # Last OF frame of the 2 if load_full: flow_out = 1 - flow2rgb_torch(out[0]['optic_flow'].cpu())
type=int, help='epochs (default: 3)') args = parser.parse_args() csvii = CSVInputIterator(args.batch_size, args.images_folder, args.mos_file, shuffle=False) pipe = ExternalSourcePipeline(batch_size=args.batch_size, num_threads=2, device_id=0, external_data=csvii) pii = DALIGenericIterator(pipe, output_map=['data', 'label1', 'label2'], size=csvii.size, last_batch_padded=True, fill_last_batch=False) if not os.path.exists('res'): os.mkdir('res') for e in range(args.epochs): for i, data in enumerate(pii): print("epoch: {}, iter {}, real batch size: {}".format( e, i, len(data[0]["data"]))) show_images( data[0], args.batch_size, ) plt.savefig('res/csv_epoch{}_iter{}.jpg'.format(e, i)) pii.reset()
def run(model, net_size, root_dir, save_dir, input_size, batch_size,
        learning_rate, min_lr, epochs, device, patience):
    """Train a three-frame network with DALI loaders and checkpoint it.

    Args:
        model: Callable invoked as ``model(input_ch=3, output_ch=3,
            net_size=net_size)`` to build the network.
        net_size: Forwarded to ``model`` and written to the start-up log.
        root_dir: Directory containing ``train``, ``valid`` and ``test``.
        save_dir: Where checkpoints, the log file and the summary-writer
            output are written; also passed to ``load_model``.
        input_size: Passed as ``size`` to every ``ExternalSourcePipeline``.
        batch_size: Batch size of the training pipeline (validation and
            test pipelines use 1).
        learning_rate: Initial learning rate for Adam.
        min_lr: Training stops once ``get_lr(optimizer)`` is at or below it.
        epochs: Upper bound (exclusive) of the epoch range.
        device: Target of ``model.to``; also handed to ``eval_net``.
        patience: Patience of the ``ReduceLROnPlateau`` scheduler.
    """
    model = model(input_ch=3, output_ch=3, net_size=net_size)
    model, first_epoch = load_model(model, save_dir)
    epoch = first_epoch

    # Load Datasets
    elements = ['im1', 'im2', 'im3', 'targets']

    train_dir = os.path.join(root_dir, 'train')
    train_iter = ExternalInputIterator(batch_size=batch_size,
                                       data_dir=train_dir)
    train_pipe = ExternalSourcePipeline(data_iterator=iter(train_iter),
                                        batch_size=batch_size,
                                        num_threads=8,
                                        device_id=0,
                                        size=input_size)
    train_pipe.build()
    train_loader = DALIGenericIterator([train_pipe], elements, train_iter.n)

    valid_dir = os.path.join(root_dir, 'valid')
    valid_iter = ExternalInputIterator(batch_size=1, data_dir=valid_dir)
    valid_pipe = ExternalSourcePipeline(data_iterator=iter(valid_iter),
                                        batch_size=1,
                                        num_threads=4,
                                        device_id=0,
                                        size=input_size,
                                        eval_enabled=True)
    valid_pipe.build()
    valid_loader = DALIGenericIterator([valid_pipe], elements, valid_iter.n)

    test_dir = os.path.join(root_dir, 'test')
    test_iter = ExternalInputIterator(batch_size=1, data_dir=test_dir)
    test_pipe = ExternalSourcePipeline(data_iterator=iter(test_iter),
                                       batch_size=1,
                                       num_threads=1,
                                       device_id=0,
                                       size=input_size,
                                       eval_enabled=True)
    test_pipe.build()
    test_loader = DALIGenericIterator([test_pipe], elements, 1)

    # Build the Network
    model = model.to(device)
    global_step = first_epoch * train_iter.n
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'min',
                                                           verbose=True,
                                                           patience=patience)
    criterion = torch.nn.MSELoss()

    # Logging Settings
    logger = setup_logger(os.path.join(save_dir, "model_run.log"))
    writer = SummaryWriter(log_dir=save_dir, purge_step=global_step)

    info = f''' Starting training: Epochs: {epochs} Batch size: {batch_size} Learning Rate: {learning_rate} Minimum LR: {min_lr} Patience: {patience} Training size: {train_iter.n} Validation size: {valid_iter.n} Save Directory: {save_dir} Model Directory: {root_dir} Device: {device.type} Network Size: {net_size} 
'''
    logger.info(info)
    print(info)

    # Run the Model
    for epoch in range(first_epoch, epochs):
        model.train()
        train_loss = AverageMeter()

        with tqdm(total=train_iter.n,
                  desc=f'Epoch {epoch + 1}/{epochs}',
                  unit='img') as pbar:
            for dali_batch in train_loader:
                batch_data = dali_batch[0]
                im1, im2, im3, targets = (batch_data[key] for key in elements)

                output = model(im1, im2, im3)
                loss = criterion(output, targets)
                loss_value = loss.item()
                train_loss.update(loss_value, im1.size(0))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                writer.add_scalar('Training Loss', loss_value, global_step)
                pbar.set_postfix(loss_l2=train_loss.avg)
                pbar.update(im1.shape[0])

                # Every 256 steps, log sample images and a test prediction.
                if global_step % 256 == 0:
                    writer.add_images('training_inputs', im2[:1, :, :, :],
                                      global_step)
                    writer.add_images('training_output', output[:1, :, :, :],
                                      global_step)
                    writer.add_images('training_target', targets[:1, :, :, :],
                                      global_step)
                    test_in, test_out = test_net(model, test_loader)
                    test_loader.reset()
                    writer.add_images('test_input', test_in[:1, :, :, :],
                                      global_step)
                    writer.add_images('test_output', test_out[:1, :, :, :],
                                      global_step)

                global_step += 1

        val_loss = eval_net(model, valid_loader, device, valid_iter.n, writer,
                            global_step)
        scheduler.step(val_loss)

        # Stop once the scheduler has driven the rate down to the floor.
        if get_lr(optimizer) <= min_lr:
            logger.info('Minimum Learning Rate Reached: Early Stopping')
            break

        writer.add_scalar('Validation Loss', val_loss, global_step)
        writer.add_scalar('Learning Rate', get_lr(optimizer), global_step)

        checkpoint_name = "model_save_epoch_{}.pth".format(epoch)
        torch.save(model.state_dict(), os.path.join(save_dir,
                                                    checkpoint_name))
        logger.info('Checkpoint {} saved!'.format(epoch))
        logger.info('Validation Loss L2: {}'.format(val_loss))
        logger.info('Learning Rate: {}'.format(get_lr(optimizer)))

        train_loader.reset()
        valid_loader.reset()

    writer.close()
    logger.info('Training finished, exiting...')
    final_name = "model_save_epoch_{}.pth".format(epoch)
    torch.save(model.state_dict(), os.path.join(save_dir, final_name))
    logger.info('Final checkpoint {} saved!'.format(epoch))
    del model