def evaluate(args):
    """Run single-image facial-landmark inference with a saved snapshot.

    Detects a face box in ``args.image``, rebuilds the evaluation transform
    from the training arguments stored inside the checkpoint, forwards the
    network once, maps the predicted locations back to the original image
    coordinates, and finally shows/saves an annotated result image.

    Requires a CUDA-capable device (asserted below).
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    print('The image is {:}'.format(args.image))
    print('The model is {:}'.format(args.model))
    snapshot = Path(args.model)
    # NOTE(review): the assert message has a '{:}' placeholder but no
    # .format(...) call, so the path is never interpolated on failure.
    assert snapshot.exists(), 'The model path {:} does not exist'
    facebox = face_detect(args.image, args.face_detector)
    print('The face bounding box is {:}'.format(facebox))
    # face_detect is expected to return a 4-element bounding box.
    assert len(facebox) == 4, 'Invalid face input : {:}'.format(facebox)
    # `snapshot` is rebound from Path to the loaded checkpoint dict here.
    snapshot = torch.load(str(snapshot))
    # General Data Argumentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # Re-create the eval-time preprocessing from the args saved at training time.
    param = snapshot['args']
    eval_transform = transforms.Compose(
        [transforms.PreCrop(param.pre_crop_expand),
         transforms.TrainScale2WH((param.crop_width, param.crop_height)),
         transforms.ToTensor(), normalize])
    model_config = load_configure(param.model_config, None)
    dataset = GeneralDataset(eval_transform, param.sigma, model_config.downsample, param.heatmap_type,
                             param.data_indicator)
    dataset.reset(param.num_pts)
    # num_pts + 1: the extra channel is presumably a background/auxiliary map
    # (matches the [:-1] slicing of the outputs below) -- TODO confirm.
    net = obtain_model(model_config, param.num_pts + 1)
    net = net.cuda()
    # Strip 'module.' prefixes left over from DataParallel training.
    weights = remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)
    print('Prepare input data')
    [image, _, _, _, _, _, cropped_size], meta = dataset.prepare_input(args.image, facebox)
    inputs = image.unsqueeze(0).cuda()
    # network forward
    with torch.no_grad():
        batch_heatmaps, batch_locs, batch_scos = net(inputs)
    # obtain the locations on the image in the orignial size
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(cpu).numpy(), batch_scos.to(
        cpu).numpy(), cropped_size.numpy()
    # Drop the last (auxiliary) point; keep per-point confidence scores.
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(np_batch_scos[0, :-1], -1)
    # Rescale from network-input resolution back to the cropped region, then
    # offset by the crop origin (cropped_size[2], cropped_size[3]).
    scale_h, scale_w = cropped_size[0] * 1. / inputs.size(-2), cropped_size[1] * 1. / inputs.size(-1)
    locations[:, 0], locations[:, 1] = locations[:, 0] * scale_w + cropped_size[2], locations[:, 1] * scale_h + \
        cropped_size[3]
    # prediction: shape (3, num_pts) -- rows are x, y, score.
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)
    print('the coordinates for {:} facial landmarks:'.format(param.num_pts))
    for i in range(param.num_pts):
        point = prediction[:, i]
        print('the {:02d}/{:02d}-th point : ({:.1f}, {:.1f}), score = {:.2f}'.format(i+1, param.num_pts, float(point[0]),
                                                                                     float(point[1]), float(point[2])))
    # Draw landmarks on the original image and save alongside the input.
    image = draw_image_by_points(args.image, prediction, 2, (255, 0, 0), facebox, None, None)
    image.show()
    # NOTE(review): split('.')[0] breaks on paths containing extra dots.
    image.save(args.image.split('.')[0]+'_result.jpg')
def build_transforms(config):
    """Construct the training and test preprocessing pipelines.

    The training pipeline applies random cropping, horizontal flipping and
    random erasing on top of the shared tensor conversion + normalization;
    the test pipeline only resizes to the configured HEIGHT x WIDTH.

    Returns:
        (transform_train, transform_test): two composed transform objects.
    """
    height = config.DATA.HEIGHT
    width = config.DATA.WIDTH

    train_ops = [
        T.RandomCroping(height, width, p=config.AUG.RC_PROB),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        T.RandomErasing(probability=config.AUG.RE_PROB),
    ]
    test_ops = [
        T.Resize((height, width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return T.Compose(train_ops), T.Compose(test_ops)
def Generate_transform_Dict(origin_width=256, width=227, ratio=0.16, rot=0, args=None):
    """Build a dict of named image-transform pipelines.

    Args:
        origin_width: size the image is resized to before cropping.
        width: final crop size.
        ratio: lower bound of the area ratio for RandomResizedCrop.
        rot: unused here; kept for interface compatibility.
        args: optional namespace; if ``args.net`` contains "ResNet", ImageNet
            RGB normalization is used, otherwise BGR channel conversion with
            mean-only (Caffe-style) normalization.

    Returns:
        dict with keys 'rand-crop', 'center-crop' and 'resize'.
    """
    # FIX: removed dead local `std_value` (assigned but never referenced).
    if (args is not None) and ("ResNet" in args.net):
        # ImageNet statistics for RGB inputs.
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        cc = []
    else:
        # Caffe-style BGR preprocessing: mean subtraction, 1/255 scaling.
        normalize = transforms.Normalize(
            mean=[104 / 255.0, 117 / 255.0, 128 / 255.0],
            std=[1.0 / 255, 1.0 / 255, 1.0 / 255])
        print("bgr init")
        cc = [transforms.CovertBGR()]

    transform_dict = {}
    transform_dict['rand-crop'] = transforms.Compose(cc + [
        transforms.Resize((origin_width)),
        transforms.RandomResizedCrop(scale=(ratio, 1), size=width),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_dict['center-crop'] = transforms.Compose(cc + [
        transforms.Resize((origin_width)),
        transforms.CenterCrop(width),
        transforms.ToTensor(),
        normalize,
    ])
    transform_dict['resize'] = transforms.Compose(cc + [
        transforms.Resize((width)),
        transforms.ToTensor(),
        normalize,
    ])
    return transform_dict
def __init__(
        self,
        file_name,
        sequence_len: int,
        hop: int,
        sr: int = 44100,
        fft_size: int = 4096,
        fft_hop: int = 441,
        n_freq_bins: int = 256,
        freq_compression: str = "linear",
        f_min: int = 200,
        f_max: int = 18000,
        cache_dir=None  # added: where cached spectrograms are stored
):
    """Load an audio file and build the spectrogram transform chain.

    Args:
        file_name: path of the audio file to load.
        sequence_len: length of one spectrogram subsequence.
        hop: hop between consecutive subsequences.
        sr: target sample rate for loading.
        fft_size / fft_hop: STFT window size and hop length.
        n_freq_bins: number of frequency bins after compression.
        freq_compression: one of "linear", "mel", "mfcc".
        f_min / f_max: frequency band kept by the compression step.
        cache_dir: if given, spectrograms are cached on disk.

    Raises:
        ValueError: if ``freq_compression`` is not a known mode.
    """
    self.sequence_len = sequence_len
    self.hop = hop
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    # number of samples in the (mono) audio -- assumes shape (1, n) -- TODO confirm
    self.n_frames = self.audio.shape[1]
    self.t = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=False),
    ]
    if freq_compression == "linear":
        self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        self.t.append(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t.append(T.Compose(t_mel, T.M2MFCC()))
    else:
        # FIX: raising a plain string is a TypeError in Python 3;
        # raise a proper exception instead.
        raise ValueError("Undefined frequency compression")
    self.t.append(
        T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
    self.t.append(
        T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        ))
    #self.file_reader = AsyncFileReader()
    # Wrap the whole chain in a disk-backed cache keyed by the STFT params.
    self.t = T.CachedSpectrogram(
        cache_dir=cache_dir,
        spec_transform=T.Compose(self.t),
        n_fft=fft_size,
        hop_length=fft_hop,
        #file_reader=AsyncFileReader()
    )
def __init__(
        self,
        file_name,
        sequence_len: int,
        hop: int,
        sr: int = 44100,
        fft_size: int = 4096,
        fft_hop: int = 441,
        n_freq_bins: int = 256,
        freq_compression: str = "linear",
        f_min: int = 200,
        f_max: int = 18000,
        center=True
):
    """Load an audio file and prepare (uncomposed) spectrogram transforms.

    Unlike the cached variant, this keeps the compression, amplitude and
    normalization transforms as separate attributes so callers can apply
    them individually.

    Args:
        file_name: path of the audio file to load.
        sequence_len: length of one spectrogram subsequence.
        hop: hop between consecutive subsequences.
        sr: target sample rate for loading.
        fft_size / fft_hop: STFT window size and hop length.
        n_freq_bins: number of frequency bins after compression.
        freq_compression: one of "linear", "mel", "mfcc".
        f_min / f_max: frequency band kept by the compression step.
        center: passed through to T.Spectrogram (STFT centering).

    Raises:
        ValueError: if ``freq_compression`` is not a known mode.
    """
    self.sp = signal.signal_proc()
    self.hop = hop
    self.center = center
    self.filename = file_name
    self.sequence_len = sequence_len
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    # number of samples in the (mono) audio -- assumes shape (1, n) -- TODO confirm
    self.n_frames = self.audio.shape[1]
    spec_t = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=self.center),
    ]
    self.spec_transforms = T.Compose(spec_t)
    if freq_compression == "linear":
        self.t_compr_f = (T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        self.t_compr_f = (T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t_compr_f = (T.Compose(t_mel, T.M2MFCC()))
    else:
        # FIX: raising a plain string is a TypeError in Python 3;
        # raise a proper exception instead.
        raise ValueError("Undefined frequency compression")
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"]
    )
def __init__(self,
             file_name,
             sequence_len: int,
             hop: int,
             sr: int = 44100,
             fft_size: int = 4096,
             fft_hop: int = 441,
             n_freq_bins: int = 256,
             freq_compression: str = "linear",
             f_min: int = 200,
             f_max: int = 18000):
    """Load an audio file and build a single composed spectrogram transform.

    Args:
        file_name: path of the audio file to load.
        sequence_len: length of one spectrogram subsequence.
        hop: hop between consecutive subsequences.
        sr: target sample rate for loading.
        fft_size / fft_hop: STFT window size and hop length.
        n_freq_bins: number of frequency bins after compression.
        freq_compression: one of "linear", "mel", "mfcc".
        f_min / f_max: frequency band kept by the compression step.

    Raises:
        ValueError: if ``freq_compression`` is not a known mode.
    """
    self.sequence_len = sequence_len
    self.hop = hop
    self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
    self.n_frames = self.audio.shape[
        1]  # total num of samples in the audio (transposed mono)
    self.t = [
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(fft_size, fft_hop, center=False),
    ]
    if freq_compression == "linear":
        self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
    elif freq_compression == "mel":
        self.t.append(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
    elif freq_compression == "mfcc":
        t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        self.t.append(T.Compose(t_mel, T.M2MFCC()))
    else:
        # FIX: raising a plain string is a TypeError in Python 3;
        # raise a proper exception instead.
        raise ValueError("Undefined frequency compression")
    self.t.append(
        T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
    self.t.append(
        T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        ))
    self.t = T.Compose(self.t)
def get_transform(train, dataset_name):
    """Build the segmentation preprocessing pipeline for one dataset split.

    The pipeline depends on whether this is the source or target domain,
    whether we are training, and whether random resize-and-crop is enabled
    in the config. Label remapping and image-mode-specific normalization
    are always appended at the end.
    """
    crop_size = cfg.DATA_TRANSFORM.CROPSIZE
    ignore_label = cfg.DATASET.IGNORE_LABEL
    input_size = (cfg.DATA_TRANSFORM.INPUT_SIZE_S
                  if dataset_name == cfg.DATASET.SOURCE
                  else cfg.DATA_TRANSFORM.INPUT_SIZE_T)

    base_size = cfg.DATA_TRANSFORM.LOADSIZE
    # Scale bounds for random resizing (the lower factor is 1.0 for both
    # train and eval in the original code).
    min_size = int(1.0 * base_size)
    max_size = int((1.3 if train else 1.0) * base_size)

    ops = []
    if cfg.DATA_TRANSFORM.RANDOM_RESIZE_AND_CROP:
        if train:
            ops.append(T.RandomResize(min_size, max_size))
            ops.append(T.RandomHorizontalFlip(0.5))
            ops.append(T.RandomCrop(crop_size, ignore_label=ignore_label))
        else:
            ops.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))
    elif train:
        ops.append(T.Resize(input_size))
        ops.append(T.RandomHorizontalFlip(0.5))
    else:
        ops.append(T.Resize(input_size, True))

    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    ops.append(T.LabelRemap(mapping[dataset_name]))
    ops.append(T.ToTensor(cfg.DATASET.IMG_MODE))

    # BGR mode uses Caffe-style per-channel means; RGB uses ImageNet stats.
    if cfg.DATASET.IMG_MODE == "BGR":
        mean, std = (104.00698793, 116.66876762, 122.67891434), (1.0, 1.0, 1.0)
    else:
        mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
    ops.append(T.Normalize(mean, std))

    return T.Compose(ops)
def get_transform(dataset_name):
    """Build the evaluation preprocessing pipeline for ``dataset_name``.

    Resizes to the target input size, remaps labels via the source/target
    label map, converts to tensor, and applies image-mode-specific
    normalization (Caffe-style means for BGR, ImageNet stats for RGB).
    """
    # FIX: removed dead locals (base_size, ignore_label, min_size, max_size)
    # that were computed but never used.
    ops = []
    ops.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))
    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    ops.append(T.LabelRemap(mapping[dataset_name]))
    ops.append(T.ToTensor(cfg.DATASET.IMG_MODE))
    if cfg.DATASET.IMG_MODE == "BGR":
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    ops.append(T.Normalize(mean, std))
    return T.Compose(ops)
def main(args):
    """Train a facial-landmark network and periodically evaluate it.

    Builds train/eval transforms and datasets from ``args``, constructs the
    model and optimizer from config files, optionally resumes from the last
    checkpoint recorded by the logger, then runs the epoch loop (train,
    log, checkpoint, evaluate). Requires CUDA.
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python version : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow version : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN version : {}".format(torch.backends.cudnn.version()))
    # General Data Argumentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # Horizontal flip augmentation is intentionally unsupported here.
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(args.arg_flip, args.rotate_max)
    # Training pipeline: pre-crop, scale to fixed W/H, random scale,
    # optional rotation, random crop padded with the mean color, tensorize.
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #     train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height, args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)
    eval_transform = transforms.Compose(
        [transforms.PreCrop(args.pre_crop_expand), transforms.TrainScale2WH((args.crop_width, args.crop_height)),
         transforms.ToTensor(), normalize])
    # Eval scale must be the mean of the training scale range.
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, 'The scale is not ok : {},{} vs {}'.format(
        args.scale_min, args.scale_max, args.scale_eval)
    # Model Configure Load
    model_config = load_configure(args.model_config, logger)
    # Sigma is defined in the scaled coordinate space.
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))
    # Training Dataset
    train_data = GeneralDataset(train_transform, args.sigma, model_config.downsample, args.heatmap_type,
                                args.data_indicator)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers, pin_memory=True)
    # Evaluation Dataloader
    eval_loaders = []
    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = GeneralDataset(eval_transform, args.sigma, model_config.downsample, args.heatmap_type,
                                        args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size, shuffle=False,
                                                       num_workers=args.workers, pin_memory=True)
            # (loader, is_video) tuples; image lists are flagged False.
            eval_loaders.append((eval_iloader, False))
    # Define network
    logger.log('configure : {:}'.format(model_config))
    # num_pts + 1: extra output channel beyond the landmark count --
    # presumably a background map; TODO confirm against the model code.
    net = obtain_model(model_config, args.num_pts + 1)
    assert model_config.downsample == net.downsample, 'downsample is not correct : {} vs {}'.format(
        model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))
    logger.log('Training-data : {:}'.format(train_data))
    for i, eval_loader in enumerate(eval_loaders):
        eval_loader, is_video = eval_loader
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(i, len(eval_loaders),
                                                                          'video' if is_video else 'image',
                                                                          eval_loader.dataset))
    logger.log('arguments : {:}'.format(args))
    opt_config = load_configure(args.opt_config, logger)
    # Some models declare per-parameter LR/decay groups.
    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()
    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)
    # Resume from the last-info file if one exists.
    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(str(last_info))
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], 'Last-Info is not right {:} vs {:}'.format(last_info,
                                                                                                     checkpoint[
                                                                                                         'epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(logger.last_info(), checkpoint['epoch']))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0
    if args.eval_once:
        logger.log("=> only evaluate the model once")
        eval_results = eval_all(args, eval_loaders, net, criterion, 'eval-once', logger, opt_config)
        logger.close()
        return
    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(time_string(), epoch_str,
                                                                                            need_time, min(LRs),
                                                                                            max(LRs), opt_config))
        # train for one epoch
        train_loss, train_nme = train(args, train_loader, net, criterion, optimizer, epoch_str, logger, opt_config)
        # log the results
        logger.log(
            '==>>{:s} Train [{:}] Average Loss = {:.6f}, NME = {:.2f}'.format(time_string(), epoch_str, train_loss,
                                                                              train_nme * 100))
        # remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, str(logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str)), logger)
        # last-info records which checkpoint to resume from.
        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, str(logger.last_info()), logger)
        eval_results = eval_all(args, eval_loaders, net, criterion, epoch_str, logger, opt_config)
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
    logger.close()
def __init__(self, file_names: Iterable[str], working_dir=None, cache_dir=None, sr=44100, n_fft=4096,
             hop_length=441, freq_compression="linear", n_freq_bins=256, f_min=0, f_max=18000, seq_len=128,
             augmentation=False, noise_files=None, min_max_normalize=False, *args, **kwargs):
    """Initialize an audio dataset with optional caching and augmentation.

    Args:
        file_names: iterable of audio file names; names matching "call" /
            "noise" are counted as call / noise samples.
        working_dir: base directory, forwarded to the parent class.
        cache_dir: if given, spectrograms are cached on disk.
        sr: target sample rate.
        n_fft / hop_length: STFT parameters.
        freq_compression: "linear", "mel" or "mfcc".
        n_freq_bins: number of frequency bins after compression.
        f_min / f_max: frequency band for the compression step.
        seq_len: subsequence length sampled from each spectrogram.
        augmentation: enable amplitude/time/pitch/noise augmentation.
        noise_files: optional list of noise files for noise augmentation.
        min_max_normalize: use min-max normalization instead of dB-based.

    Raises:
        ValueError: if ``freq_compression`` is not a known mode.
    """
    # FIX: avoid the mutable default argument (was `noise_files=[]`).
    if noise_files is None:
        noise_files = []
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.f_max = f_max
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if freq_compression not in valid_freq_compressions:
        # FIX: the original passed `format(...)` as a second ValueError
        # argument (comma instead of `.format`), so the message was never
        # interpolated. Call .format on the message string instead.
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                freq_compression, valid_freq_compressions))
    self.freq_compression = freq_compression
    self.possible_call_labels = re.compile("|".join(["call"]))
    self.possible_nocall_labels = re.compile("|".join(["noise"]))
    self._logger.debug("Number of files : {}".format(len(self.file_names)))
    _n_calls = 0
    for f in self.file_names:
        if self.is_call(f):
            _n_calls += 1
    self._logger.debug("Number of calls: {}".format(_n_calls))
    self._logger.debug(
        "Number of noise: {}".format(len(self.file_names) - _n_calls))
    self.augmentation = augmentation
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # Disk-backed spectrogram cache keyed by the STFT parameters.
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )
    if augmentation:
        self._logger.debug(
            "Init augmentation transforms for time and pitch shift")
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        self._logger.debug("Running without augmentation")
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
    else:
        # Unreachable after the validation above; kept as a defensive guard.
        # FIX: raising a plain string is a TypeError in Python 3.
        raise ValueError("Undefined frequency compression")
    if augmentation:
        if noise_files:
            self._logger.debug(
                "Init augmentation transform for random noise addition")
            self.t_addnoise = T.RandomAddNoise(
                noise_files,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift,
                          self.t_compr_f),
                min_length=seq_len,
                return_original=True)
        else:
            self.t_addnoise = None
    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"])
    if min_max_normalize:
        self.t_norm = T.MinMaxNormalize()
        self._logger.debug("Init min-max-normalization activated")
    else:
        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
        self._logger.debug("Init 0/1-dB-normalization activated")
    self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
# Final CLI option, argument parsing, and experiment bookkeeping.
parser.add_argument('--resume', type=str, default=None,
                    help='put the path to resuming file if needed')
args = parser.parse_args()
# checkname mirrors the chosen architecture for the Saver's directory layout.
args.checkname = args.arc
# Define Saver
saver = Saver(args)
saver.save_experiment_config()
# Define Tensorboard Summary
summary = TensorboardSummary(saver.experiment_dir)
writer = summary.create_summary()
# Data
# ImageNet normalization statistics, shared by train and val pipelines.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_trans = transforms.Compose([transforms.Resize(321),
                                  transforms.RandomCrop(224),
                                  transforms.RandomHorizontalFlip(),
                                  transforms.ToTensor(),
                                  normalize,
                                  ])
val_trans = transforms.Compose([transforms.Resize(321),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(),
                                normalize,
                                ])
# NOTE(review): dataset roots are placeholder paths -- replace before running.
train_ds = VOCSBDClassification('/path/to/VOC', '/path/to/SBD/benchmark_RELEASE/dataset',
                                transform=train_trans, image_set='train')
train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True)
def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=2048,  # 4096
        hop_length=220,  # 441
        freq_compression="linear",
        n_freq_bins=256,  # determines the width of the image
        f_min=0,
        f_max=18000,
        seq_len=128,  # shd be adjusted together with sequence_len in class StridedAudioDataset (called by predict.py)
        augmentation=False,
        noise_files=None,
        *args,
        **kwargs
):
    """Initialize an audio dataset with optional caching and augmentation.

    Args:
        file_names: iterable of audio file names; names matching "call" /
            "noise" are counted as call / noise samples.
        working_dir: base directory, forwarded to the parent class.
        cache_dir: if given, spectrograms are cached on disk as .spec files.
        sr: target sample rate.
        n_fft / hop_length: STFT parameters.
        freq_compression: "linear", "mel" or "mfcc".
        n_freq_bins: number of frequency bins after compression.
        f_min / f_max: frequency band for the compression step.
        seq_len: subsequence length sampled from each spectrogram.
        augmentation: enable amplitude/time/pitch/noise augmentation.
        noise_files: optional list of noise files for noise augmentation.

    Raises:
        ValueError: if ``freq_compression`` is not a known mode.
    """
    # FIX: avoid the mutable default argument (was `noise_files=[]`).
    if noise_files is None:
        noise_files = []
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.f_max = f_max
    # mel: log transformation of freq (Hz scale to Mel scale)
    # attention: Mel-spectrograms as a network input led to an excessive loss of
    # resolution in higher frequency bands, which was a big problem considering
    # the high-frequency pulsed calls and whistles.
    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if freq_compression not in valid_freq_compressions:
        # FIX: the original passed `format(...)` as a second ValueError
        # argument (comma instead of `.format`), so the message was never
        # interpolated. Call .format on the message string instead.
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                freq_compression, valid_freq_compressions))
    self.freq_compression = freq_compression
    # combine a RegExp pattern into pattern objects for pattern matching
    self.possible_call_labels = re.compile("|".join(["call"]))
    self.possible_nocall_labels = re.compile("|".join(["noise"]))
    self._logger.debug("Number of files : {}".format(len(self.file_names)))
    _n_calls = 0
    for f in self.file_names:
        if self.is_call(f):
            _n_calls += 1
    self._logger.debug("Number of calls: {}".format(_n_calls))
    self._logger.debug(
        "Number of noise: {}".format(len(self.file_names) - _n_calls))
    self.augmentation = augmentation
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),  # return: a vector tensor
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    self.file_reader = AsyncFileReader()
    # if user chooses to not cache .spec by omitting the directory
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # where .spec is created and stored
        # n_fft, hop_length: meta in spec_dict
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )
    if augmentation:
        self._logger.debug(
            "Init augmentation transforms for time and pitch shift")
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        self._logger.debug("Running without augmentation")
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max), T.M2MFCC())
    else:
        # Unreachable after the validation above; kept as a defensive guard.
        # FIX: raising a plain string is a TypeError in Python 3.
        raise ValueError("Undefined frequency compression")
    if augmentation:
        if noise_files:
            self._logger.debug(
                "Init augmentation transform for random noise addition")
            self.t_addnoise = T.RandomAddNoise(
                noise_files,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift,
                          self.t_compr_f),
                min_length=seq_len,
                return_original=True
            )  # if return_original = True, both augmented and original specs are returned
        else:
            self.t_addnoise = None
    self.t_compr_a = T.Amp2Db(
        min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
    self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
def main():
    """Train a CNN to denoise knee MRI images.

    Simulates noise via the transform pipeline, trains with MSE loss on CPU,
    evaluates with a fixed validation RNG seed each epoch, checkpoints the
    best validation model, and logs losses plus example images to
    TensorBoard.
    """
    print('starting denoising')
    noise_sigma = 4e-5  # sigma for the noise simulation
    batch_size = 8  # number of images to run for each minibach
    num_epochs = 200  # number of epochs to train
    validation_seed = 15  # rng seed for validation loop
    log_dir = 'logs/denoise/'  # log dir for models and tensorboard
    device = torch.device('cpu')  # model will run on this device
    dtype = torch.float  # dtype for data and model

    # set up tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # checkpoint file name
    # FIX: was os.path.join(log_dir + 'best_model.pt') -- string concatenation
    # inside a single-argument join, which only worked because log_dir ends in
    # '/'. Pass the components separately so join inserts the separator.
    checkpoint_file = os.path.join(log_dir, 'best_model.pt')

    # -------------------------------------------------------------------------
    # NOISE SIMULATION SETUP
    transform_list = [
        transforms.AddNoise(target_op=False, sigma=noise_sigma),
        transforms.Ifft(norm='ortho'),
        transforms.SquareRootSumSquare(),
        transforms.Normalize(),
        transforms.ToTensor(dat_complex=False, target_complex=False)
    ]

    # -------------------------------------------------------------------------
    # DATALOADER SETUP
    train_dataset = KneeDataSet('pytorch_tutorial_data/', 'train',
                                transform=transforms.Compose(transform_list))
    print('data set information:')
    print(train_dataset)
    val_dataset = KneeDataSet('pytorch_tutorial_data/', 'val',
                              transform=transforms.Compose(transform_list))
    # convert to a PyTorch dataloader
    # this handles batching, random shuffling, parallelization
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    # Fixed validation example for the per-epoch TensorBoard visualization.
    display_dat = val_dataset[15]['dat'].unsqueeze(0).to(device=device, dtype=dtype)
    display_target = val_dataset[15]['target'].unsqueeze(0).to(device=device, dtype=dtype)
    display_vmax = np.max(np.squeeze(display_dat.cpu().numpy()))

    # -------------------------------------------------------------------------
    # MODEL SETUP
    model = DenoiseCnn(num_chans=64, num_layers=4, magnitude_input=True, magnitude_output=True)
    model = model.to(device)
    model = model.train()
    print('CNN model information:')
    print(model)

    # -------------------------------------------------------------------------
    # OPTIMIZER SETUP
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = torch.nn.MSELoss()

    # -------------------------------------------------------------------------
    # LOAD PREVIOUS STATE
    start_epoch, model, optimizer, min_val_loss = load_checkpoint(
        checkpoint_file, model, optimizer)
    current_seed = 20

    # -------------------------------------------------------------------------
    # NETWORK TRAINING
    for epoch_index in range(start_epoch, num_epochs):
        print('epoch {} of {}'.format(epoch_index + 1, num_epochs))

        # ---------------------------------------------------------------------
        # TRAINING LOOP
        model = model.train()
        # rng seed for noise generation
        torch.manual_seed(current_seed)
        np.random.seed(current_seed)
        torch.cuda.manual_seed(current_seed)
        # batch loop
        losses = []
        for batch in train_loader:
            target = batch['target'].to(device=device, dtype=dtype)
            dat = batch['dat'].to(device=device, dtype=dtype)
            est = model(dat)  # forward propagation
            loss = loss_fn(est, target)  # calculate the loss
            optimizer.zero_grad()  # clear out old gradients
            loss.backward()  # back propagation
            optimizer.step()  # update the CNN weights
            # keep last 10 minibatches to compute training loss
            losses.append(loss.item())
            losses = losses[-10:]
        print('trailing training loss: {}'.format(np.mean(losses)))

        # ---------------------------------------------------------------------
        # EVALUATION LOOP
        model = model.eval()
        # rng seed for noise generation: stash the training RNG position,
        # then reseed so validation noise is identical every epoch.
        current_seed = np.random.get_state()[1][0]
        torch.manual_seed(validation_seed)
        np.random.seed(validation_seed)
        torch.cuda.manual_seed(validation_seed)
        # batch loop
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                target = batch['target'].to(device=device, dtype=dtype)
                dat = batch['dat'].to(device=device, dtype=dtype)
                est = model(dat)
                loss = loss_fn(est, target)
                val_losses.append(loss.item())
        print('validation loss: {}'.format(np.mean(val_losses)))

        # ---------------------------------------------------------------------
        # VISUALIZATIONS AND CHECKPOINTS
        if np.mean(val_losses) < min_val_loss:
            save_checkpoint(epoch_index, model, optimizer, np.mean(val_losses), checkpoint_file)
        # write the losses
        writer.add_scalar('loss/train', np.mean(losses), epoch_index + 1)
        writer.add_scalar('loss/validation', np.mean(val_losses), epoch_index + 1)
        # show an example image from the validation data
        model = model.eval()
        with torch.no_grad():
            display_est = model(display_dat)
        writer.add_image('validation/dat', display_dat[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/cnn', display_est[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/target', display_target[0] / display_vmax,
                         global_step=epoch_index + 1)

    writer.close()
saver.save_experiment_config() # Define Tensorboard Summary summary = TensorboardSummary(saver.experiment_dir) args.exp = saver.experiment_dir.split('_')[-1] if args.train_dataset == 'cityscapes': # Data train_trans = transforms.Compose([ transforms.ToPILImage(), # transforms.RandomResizedCrop((args.image_size, args.image_size), scale=(0.2, 2)), transforms.Resize((args.image_size, args.image_size)), transforms.RandomHorizontalFlip(), transforms.RandomAffine(22, scale=(0.75, 1.25)), transforms.ToTensor(), transforms.Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) # transforms.NormalizeInstance() ]) val_trans = transforms.Compose([ transforms.ToPILImage(), transforms.Resize((args.image_size, args.image_size), do_mask=False), transforms.ToTensor(), transforms.Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]) # transforms.NormalizeInstance() ]) if args.ann_type == 'comp': train_ds = CityscapesInstances_comp(args.inst_path, args.ann_train,
def main():
    """Entry point for single-stage FPN (RetinaNet-style) action detection.

    Parses the full hyper-parameter surface (data paths, architecture,
    optimizer, evaluation and tube-building settings), builds train/val
    datasets and the network, then dispatches on ``--MODE`` to train,
    validate, generate detections, or evaluate frames/tubes.

    NOTE(review): help strings below contain the original typos
    ('NUmber', 'seprated', ...); they are runtime text and are left
    byte-for-byte unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Training single stage FPN with OHEM, resnet as backbone')

    # --- required positional paths -------------------------------------
    parser.add_argument('DATA_ROOT',
                        help='Location to root directory for dataset reading'
                        )  # /mnt/mars-fast/datasets/
    parser.add_argument(
        'SAVE_ROOT',
        help='Location to root directory for saving checkpoint models'
    )  # /mnt/mars-alpha/
    parser.add_argument(
        'MODEL_PATH',
        help=
        'Location to root directory where kinetics pretrained models are stored'
    )
    parser.add_argument(
        '--MODE',
        default='train',
        help=
        'MODE can be train, gen_dets, eval_frames, eval_tubes define SUBSETS accordingly, build tubes'
    )

    # --- architecture ---------------------------------------------------
    # Name of backbone network, e.g. resnet18, resnet34, resnet50, resnet101 resnet152 are supported
    parser.add_argument('--ARCH', default='resnet50', type=str,
                        help=' base arch')
    parser.add_argument('--MODEL_TYPE', default='I3D', type=str,
                        help=' base model')
    parser.add_argument('--ANCHOR_TYPE', default='RETINA', type=str,
                        help='type of anchors to be used in model')
    parser.add_argument('--SEQ_LEN', default=8, type=int,
                        help='NUmber of input frames')
    parser.add_argument('--TEST_SEQ_LEN', default=8, type=int,
                        help='NUmber of input frames')
    parser.add_argument('--MIN_SEQ_STEP', default=1, type=int,
                        help='DIFFERENCE of gap between the frames of sequence')
    parser.add_argument('--MAX_SEQ_STEP', default=1, type=int,
                        help='DIFFERENCE of gap between the frames of sequence')
    # if output heads are have shared features or not: 0 is no-shareing else sharining enabled
    # parser.add_argument('--MULIT_SCALE', default=False, type=str2bool,help='perfrom multiscale training')
    parser.add_argument('--HEAD_LAYERS', default=3, type=int,
                        help='0 mean no shareding more than 0 means shareing')
    parser.add_argument('--NUM_FEATURE_MAPS', default=5, type=int,
                        help='0 mean no shareding more than 0 means shareing')
    parser.add_argument('--CLS_HEAD_TIME_SIZE', default=3, type=int,
                        help='Temporal kernel size of classification head')
    parser.add_argument('--REG_HEAD_TIME_SIZE', default=3, type=int,
                        help='Temporal kernel size of regression head')

    # --- dataset / subsets ---------------------------------------------
    # Name of the dataset only voc or coco are supported
    parser.add_argument('--DATASET', default='road', type=str,
                        help='dataset being used')
    parser.add_argument('--TRAIN_SUBSETS', default='train_3,', type=str,
                        help='Training SUBSETS seprated by ,')
    parser.add_argument('--VAL_SUBSETS', default='', type=str,
                        help='Validation SUBSETS seprated by ,')
    parser.add_argument('--TEST_SUBSETS', default='', type=str,
                        help='Testing SUBSETS seprated by ,')
    # Input size of image only 600 is supprted at the moment
    parser.add_argument('--MIN_SIZE', default=512, type=int,
                        help='Input Size for FPN')

    # data loading argumnets
    parser.add_argument('-b', '--BATCH_SIZE', default=4, type=int,
                        help='Batch size for training')
    parser.add_argument('--TEST_BATCH_SIZE', default=1, type=int,
                        help='Batch size for testing')
    # Number of worker to load data in parllel
    parser.add_argument('--NUM_WORKERS', '-j', default=8, type=int,
                        help='Number of workers used in dataloading')

    # optimiser hyperparameters
    parser.add_argument('--OPTIM', default='SGD', type=str,
                        help='Optimiser type')
    parser.add_argument('--RESUME', default=0, type=int,
                        help='Resume from given epoch')
    parser.add_argument('--MAX_EPOCHS', default=30, type=int,
                        help='Number of training epoc')
    parser.add_argument('-l', '--LR', '--learning-rate', default=0.004225,
                        type=float, help='initial learning rate')
    parser.add_argument('--MOMENTUM', default=0.9, type=float,
                        help='momentum')
    parser.add_argument('--MILESTONES', default='20,25', type=str,
                        help='Chnage the lr @')
    parser.add_argument('--GAMMA', default=0.1, type=float,
                        help='Gamma update for SGD')
    parser.add_argument('--WEIGHT_DECAY', default=1e-4, type=float,
                        help='Weight decay for SGD')

    # Freeze layers or not
    parser.add_argument(
        '--FBN', '--FREEZE_BN',
        default=True,
        type=str2bool,
        help='freeze bn layers if true or else keep updating bn layers')
    parser.add_argument(
        '--FREEZE_UPTO',
        default=1,
        type=int,
        help='layer group number in ResNet up to which needs to be frozen')

    # Loss function matching threshold
    parser.add_argument('--POSTIVE_THRESHOLD', default=0.5, type=float,
                        help='Min threshold for Jaccard index for matching')
    parser.add_argument('--NEGTIVE_THRESHOLD', default=0.4, type=float,
                        help='Max threshold Jaccard index for matching')

    # Evaluation hyperparameters
    parser.add_argument(
        '--EVAL_EPOCHS',
        default='30',
        type=str,
        help=
        'eval epochs to test network on these epoch checkpoints usually the last epoch is used'
    )
    parser.add_argument('--VAL_STEP', default=2, type=int,
                        help='Number of training epoch before evaluation')
    parser.add_argument(
        '--IOU_THRESH',
        default=0.5,
        type=float,
        help='Evaluation threshold for validation and for frame-wise mAP')
    parser.add_argument(
        '--CONF_THRESH',
        default=0.025,
        type=float,
        help='Confidence threshold for to remove detection below given number')
    parser.add_argument(
        '--NMS_THRESH',
        default=0.5,
        type=float,
        help='NMS threshold to apply nms at the time of validation')
    parser.add_argument('--TOPK', default=10, type=int,
                        help='topk detection to keep for evaluation')
    parser.add_argument(
        '--GEN_CONF_THRESH',
        default=0.025,
        type=float,
        help='Confidence threshold at the time of generation and dumping')
    parser.add_argument('--GEN_TOPK', default=100, type=int,
                        help='topk at the time of generation')
    parser.add_argument('--GEN_NMS', default=0.5, type=float,
                        help='NMS at the time of generation')
    parser.add_argument('--CLASSWISE_NMS', default=False, type=str2bool,
                        help='apply classwise NMS/no tested properly')
    parser.add_argument(
        '--JOINT_4M_MARGINALS',
        default=False,
        type=str2bool,
        help=
        'generate score of joints i.e. duplexes or triplet by marginals like agents and actions scores'
    )

    ## paths hyper parameters
    parser.add_argument(
        '--COMPUTE_PATHS',
        default=False,
        type=str2bool,
        help=' COMPUTE_PATHS if set true then it overwrite existing ones')
    parser.add_argument(
        '--PATHS_IOUTH',
        default=0.5,
        type=float,
        help='Iou threshold for building paths to limit neighborhood search')
    parser.add_argument(
        '--PATHS_COST_TYPE',
        default='score',
        type=str,
        help=
        'cost function type to use for matching, other options are scoreiou, iou'
    )
    parser.add_argument(
        '--PATHS_JUMP_GAP',
        default=4,
        type=int,
        help=
        'GAP allowed for a tube to be kept alive after no matching detection found'
    )
    parser.add_argument('--PATHS_MIN_LEN', default=6, type=int,
                        help='minimum length of generated path')
    parser.add_argument(
        '--PATHS_MINSCORE',
        default=0.1,
        type=float,
        help='minimum score a path should have over its length')

    ## paths hyper parameters
    parser.add_argument('--COMPUTE_TUBES', default=False, type=str2bool,
                        help='if set true then it overwrite existing tubes')
    parser.add_argument('--TUBES_ALPHA', default=0, type=float,
                        help='alpha cost for changeing the label')
    parser.add_argument('--TRIM_METHOD', default='none', type=str,
                        help='other one is indiv which works for UCF24')
    parser.add_argument('--TUBES_TOPK', default=10, type=int,
                        help='Number of labels to assign for a tube')
    parser.add_argument('--TUBES_MINLEN', default=5, type=int,
                        help='minimum length of a tube')
    parser.add_argument(
        '--TUBES_EVAL_THRESHS',
        default='0.2,0.5',
        type=str,
        help=
        'evaluation threshold for checking tube overlap at evaluation time, one can provide as many as one wants'
    )
    # parser.add_argument('--TRAIL_ID', default=0,
    #                     type=int, help='eval TUBES_Thtrshold at evaluation time')

    ### logging
    parser.add_argument('--LOG_START', default=10, type=int,
                        help='start loging after k steps for text/tensorboard')
    parser.add_argument('--LOG_STEP', default=10, type=int,
                        help='Log every k steps for text/tensorboard')
    parser.add_argument(
        '--TENSORBOARD',
        default=1,
        type=str2bool,
        help='Use tensorboard for loss/evalaution visualization')

    # Program arguments
    parser.add_argument('--MAN_SEED', default=123, type=int,
                        help='manualseed for reproduction')
    parser.add_argument(
        '--MULTI_GPUS',
        default=True,
        type=str2bool,
        help=
        'If more than 0 then use all visible GPUs by default only one GPU used '
    )  # Use CUDA_VISIBLE_DEVICES=0,1,4,6 to select GPUs to use

    ## Parse arguments
    args = parser.parse_args()
    args = utils.set_args(args)  # set directories and SUBSETS fo datasets
    # DataParallel makes no sense with a single-sample batch
    args.MULTI_GPUS = False if args.BATCH_SIZE == 1 else args.MULTI_GPUS

    ## set random seeds and global settings
    np.random.seed(args.MAN_SEED)
    torch.manual_seed(args.MAN_SEED)
    # torch.cuda.manual_seed_all(args.MAN_SEED)
    torch.set_default_tensor_type('torch.FloatTensor')

    args = utils.create_exp_name(args)
    utils.setup_logger(args)
    logger = utils.get_logger(__name__)
    logger.info(sys.version)

    # NOTE(review): the assert message lists a different set
    # ('train,test,tubes') than the actually accepted modes — message only,
    # behavior is correct.
    assert args.MODE in [
        'train', 'val', 'gen_dets', 'eval_frames', 'eval_tubes'
    ], 'MODE must be from ' + ','.join(['train', 'test', 'tubes'])

    # keep train/test sequence lengths mirrored depending on the mode
    if args.MODE == 'train':
        args.TEST_SEQ_LEN = args.SEQ_LEN
    else:
        args.SEQ_LEN = args.TEST_SEQ_LEN

    if args.MODE in ['train', 'val']:
        # args.CONF_THRESH = 0.05
        args.SUBSETS = args.TRAIN_SUBSETS
        train_transform = transforms.Compose([
            vtf.ResizeClip(args.MIN_SIZE, args.MAX_SIZE),
            vtf.ToTensorStack(),
            vtf.Normalize(mean=args.MEANS, std=args.STDS)
        ])
        # train_skip_step = args.SEQ_LEN
        # if args.SEQ_LEN>4 and args.SEQ_LEN<=10:
        #     train_skip_step = args.SEQ_LEN-2
        if args.SEQ_LEN > 10:
            train_skip_step = args.SEQ_LEN + (args.MAX_SEQ_STEP - 1) * 2 - 2
        else:
            train_skip_step = args.SEQ_LEN

        train_dataset = VideoDataset(args, train=True,
                                     skip_step=train_skip_step,
                                     transform=train_transform)
        logger.info('Done Loading Dataset Train Dataset')
        ## For validation set
        full_test = False
        args.SUBSETS = args.VAL_SUBSETS
        skip_step = args.SEQ_LEN * 8
    else:
        args.SEQ_LEN = args.TEST_SEQ_LEN
        args.MAX_SEQ_STEP = 1
        args.SUBSETS = args.TEST_SUBSETS
        full_test = True  # args.MODE != 'train'
        # frames to drop from clip edges depending on the temporal model
        # NOTE(review): indentation was lost in this file; these lines are
        # assumed to belong to the test-mode branch — confirm against the
        # upstream source.
        args.skip_beggning = 0
        args.skip_ending = 0
        if args.MODEL_TYPE == 'I3D':
            args.skip_beggning = 2
            args.skip_ending = 2
        elif args.MODEL_TYPE != 'C2D':
            args.skip_beggning = 2
        skip_step = args.SEQ_LEN - args.skip_beggning

    val_transform = transforms.Compose([
        vtf.ResizeClip(args.MIN_SIZE, args.MAX_SIZE),
        vtf.ToTensorStack(),
        vtf.Normalize(mean=args.MEANS, std=args.STDS)
    ])
    val_dataset = VideoDataset(args, train=False, transform=val_transform,
                               skip_step=skip_step, full_test=full_test)
    logger.info('Done Loading Dataset Validation Dataset')

    # propagate dataset-derived class metadata onto args for the model
    args.num_classes = val_dataset.num_classes  # one for objectness
    args.label_types = val_dataset.label_types
    args.num_label_types = val_dataset.num_label_types
    args.all_classes = val_dataset.all_classes
    args.num_classes_list = val_dataset.num_classes_list
    args.num_ego_classes = val_dataset.num_ego_classes
    args.ego_classes = val_dataset.ego_classes
    args.head_size = 256

    if args.MODE in ['train', 'val', 'gen_dets']:
        net = build_retinanet(args).cuda()
        if args.MULTI_GPUS:
            logger.info('\nLets do dataparallel\n')
            net = torch.nn.DataParallel(net)

    # log the final, fully-resolved configuration
    for arg in sorted(vars(args)):
        logger.info(str(arg) + ': ' + str(getattr(args, arg)))

    # dispatch on mode
    if args.MODE == 'train':
        if args.FBN:
            # freeze BatchNorm layers (DataParallel wraps the real module)
            if args.MULTI_GPUS:
                net.module.backbone.apply(utils.set_bn_eval)
            else:
                net.backbone.apply(utils.set_bn_eval)
        train(args, net, train_dataset, val_dataset)
    elif args.MODE == 'val':
        val(args, net, val_dataset)
    elif args.MODE == 'gen_dets':
        gen_dets(args, net, val_dataset)
        eval_framewise_dets(args, val_dataset)
        build_eval_tubes(args, val_dataset)
    elif args.MODE == 'eval_frames':
        eval_framewise_dets(args, val_dataset)
    elif args.MODE == 'eval_tubes':
        build_eval_tubes(args, val_dataset)
def __init__(
    self,
    file_names: Iterable[str],
    working_dir=None,
    cache_dir=None,
    sr=44100,
    n_fft=4096,
    hop_length=441,
    freq_compression="linear",
    n_freq_bins=256,
    f_min=0,
    f_max=18000,
    seq_len=128,
    augmentation=False,
    noise_files_train=None,
    noise_files_val=None,
    noise_files_test=None,
    random=False,
    *args,
    **kwargs
):
    """Build the spectrogram/denoising dataset pipeline.

    Sets up spectrogram computation (optionally disk-cached), frequency
    compression (linear/mel/mfcc), intensity/time/pitch augmentation, and
    per-split real-world noise injection for noise2noise training.

    BUGFIXES vs. original:
    - ``noise_files_*`` used mutable ``[]`` defaults (shared across calls);
      now ``None`` sentinels, behavior unchanged for all callers.
    - invalid ``freq_compression`` raised ``ValueError(msg, format(...))``
      with a comma instead of ``.format`` and a misspelled attribute
      (``freq_compressio``); now formats the message properly.
    - two ``raise "string"`` statements (a TypeError in Python 3) are now
      proper ``ValueError``s with the same message text.
    """
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))

    self.sp = signal.signal_proc()

    # synthetic-noise hyper-parameters (histogram equalization exponent,
    # gaussian/poisson noise parameters, etc.)
    self.df = 15.0
    self.exp_e = 0.1
    self.bin_pow = 2.0
    self.gaus_mean = 0.0
    self.gaus_stdv = 12.5
    self.poisson_lambda = 15.0
    self.orig_noise_value = -5

    self.f_min = f_min
    self.f_max = f_max
    self.n_fft = n_fft
    self.random = random
    self.hop_length = hop_length
    self.augmentation = augmentation
    self.file_reader = AsyncFileReader()
    # avoid shared mutable default arguments
    self.noise_files_val = [] if noise_files_val is None else noise_files_val
    self.noise_files_test = [] if noise_files_test is None else noise_files_test
    self.freq_compression = freq_compression
    self.noise_files_train = [] if noise_files_train is None else noise_files_train

    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if self.freq_compression not in valid_freq_compressions:
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                self.freq_compression, valid_freq_compressions
            )
        )

    self._logger.debug(
        "Number of files to denoise : {}".format(len(self.file_names))
    )

    # waveform -> pre-emphasized spectrogram pipeline
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False),
    ]
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # cache computed spectrograms on disk to skip re-computation
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader())

    if self.augmentation:
        self._logger.debug("Init augmentation transforms for intensity, time, and pitch shift")
        self.t_amplitude = T.RandomAmplitude(3, -6)
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
    else:
        # only for noise augmentation during validation phase - intensity, time and pitch augmentation is not used during validation/test
        self.t_timestretch = T.RandomTimeStretch()
        self.t_pitchshift = T.RandomPitchSift()
        self._logger.debug("Running without intensity, time, and pitch augmentation")

    # frequency-axis compression of the spectrogram
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
    else:
        # unreachable after the validation above; kept as a guard
        raise ValueError("Undefined frequency compression")

    # per-split real-world noise injection for noise2noise training
    # NOTE(review): min_snr=-2 > max_snr=-8 looks swapped — confirm the
    # RandomAddNoise contract before changing.
    if self.augmentation and self.noise_files_train and self.dataset_name == "train":
        self._logger.debug("Init training real-world noise files for noise2noise adding")
        self.t_addnoise = T.RandomAddNoise(
            self.noise_files_train,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            min_snr=-2,
            max_snr=-8,
            return_original=True
        )
    elif not self.augmentation and self.noise_files_val and self.dataset_name == "val":
        self._logger.debug("Init validation real-world noise files for noise2noise adding")
        self.t_addnoise = T.RandomAddNoise(
            self.noise_files_val,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            min_snr=-2,
            max_snr=-8,
            return_original=True
        )
    elif not self.augmentation and self.noise_files_test and self.dataset_name == "test":
        self._logger.debug("Init test real-world noise files for noise2noise adding")
        self.t_addnoise = T.RandomAddNoise(
            self.noise_files_test,
            self.t_spectrogram,
            T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
            min_length=seq_len,
            min_snr=-2,
            max_snr=-8,
            return_original=True
        )
    else:
        self.t_addnoise = None
        raise ValueError("ERROR: Init noise files for noise adding does not have a proper setup per split!")

    # amplitude -> dB and per-spectrogram normalization
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )
    # fixed-length (seq_len) subsequence sampling; random crop only when augmenting
    self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
def __init__(
    self,
    file_names: Iterable[str],
    working_dir=None,
    cache_dir=None,
    sr=44100,
    n_fft=1024,
    hop_length=512,
    freq_compression="linear",
    n_freq_bins=256,
    f_min=None,
    f_max=18000,
    *args,
    **kwargs
):
    """Build the (test-time) spectrogram dataset pipeline.

    Sets up waveform loading, pre-emphasis, spectrogram computation
    (optionally disk-cached), frequency compression (linear/mel/mfcc),
    amplitude-to-dB conversion, and normalization.

    BUGFIXES vs. original:
    - invalid ``freq_compression`` raised ``ValueError(msg, format(...))``
      with a comma instead of ``.format``; now formats the message properly.
    - ``raise "Undefined frequency compression"`` (a TypeError in Python 3)
      is now a proper ``ValueError`` with the same message text.
    - removed a duplicated ``self.sp = signal.signal_proc()`` assignment.
    """
    super().__init__(file_names, working_dir, sr, *args, **kwargs)
    if self.dataset_name is not None:
        self._logger.info("Init dataset {}...".format(self.dataset_name))

    self.sp = signal.signal_proc()

    self.sr = sr
    self.f_min = f_min
    self.f_max = f_max
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.freq_compression = freq_compression

    valid_freq_compressions = ["linear", "mel", "mfcc"]
    if self.freq_compression not in valid_freq_compressions:
        raise ValueError(
            "{} is not a valid freq_compression. Must be one of {}".format(
                self.freq_compression, valid_freq_compressions
            )
        )

    self._logger.debug(
        "Number of test files: {}".format(len(self.file_names))
    )

    # waveform -> pre-emphasized spectrogram pipeline
    spec_transforms = [
        lambda fn: T.load_audio_file(fn, sr=sr),
        T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
        T.Spectrogram(n_fft, hop_length, center=False)
    ]
    self.file_reader = AsyncFileReader()
    if cache_dir is None:
        self.t_spectrogram = T.Compose(spec_transforms)
    else:
        # cache computed spectrograms on disk to skip re-computation
        self.t_spectrogram = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(spec_transforms),
            n_fft=n_fft,
            hop_length=hop_length,
            file_reader=AsyncFileReader(),
        )

    # frequency-axis compression of the spectrogram
    if self.freq_compression == "linear":
        self.t_compr_f = T.Interpolate(
            n_freq_bins, sr, f_min, f_max
        )
    elif self.freq_compression == "mel":
        self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
    elif self.freq_compression == "mfcc":
        self.t_compr_f = T.Compose(
            T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
            T.M2MFCC()
        )
    else:
        # unreachable after the validation above; kept as a guard
        raise ValueError("Undefined frequency compression")

    # amplitude -> dB and per-spectrogram normalization
    self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])
    self.t_norm = T.Normalize(
        min_level_db=DefaultSpecDatasetOps["min_level_db"],
        ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
    )