def main(run_id, pretrained, data_files, model_params, training_params, device):
    best_acc1 = 0
    batch_size = training_params['batch_size']
    test_batch_size = training_params['test_batch_size']
    epochs = training_params['epochs']
    start_epoch = training_params['start_epoch']
    n_warmup_steps = training_params['n_warmup_steps']
    log_interval = training_params['log_interval']

    # model is trained for binary classification (for dataloader)
    binary_class = model_params['NUM_SPOOF_CLASS'] == 2

    kwargs = {'num_workers': 2, 'pin_memory': True} if device == torch.device('cuda') else {}

    # create model
    model = Detector(**model_params).to(device)
    num_model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('===> Model total parameter: {}'.format(num_model_params))

    # Wrap model for multi-GPUs, if necessary
    if device == torch.device('cuda') and torch.cuda.device_count() > 1:
        print('multi-gpu')
        model = nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    optim = optimizer.ScheduledOptim(
        torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            betas=(0.9, 0.98), eps=1e-09, weight_decay=1e-4, lr=3e-4, amsgrad=True),
        n_warmup_steps)

    # optionally resume from a checkpoint
    if pretrained:
        if os.path.isfile(pretrained):
            print("===> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(pretrained)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optim.load_state_dict(checkpoint['optimizer'])
            print("===> loaded checkpoint '{}' (epoch {})".format(
                pretrained, checkpoint['epoch']))
        else:
            print("===> no checkpoint found at '{}'".format(pretrained))

    # Data loading code
    train_data = SpoofDatsetSystemID(data_files['train_scp'], data_files['train_utt2index'], binary_class)
    val_data = SpoofDatsetSystemID(data_files['dev_scp'], data_files['dev_utt2index'], binary_class)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=test_batch_size, shuffle=True, **kwargs)

    best_epoch = 0
    early_stopping, max_patience = 0, 100  # for early stopping
    os.makedirs("model_snapshots/" + run_id, exist_ok=True)

    for epoch in range(start_epoch, start_epoch + epochs):
        trainer.train(train_loader, model, optim, epoch, device, log_interval)
        acc1 = validate.validate(val_loader, data_files['dev_utt2systemID'], model, device, log_interval)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # adjust learning rate + early stopping
        if is_best:
            early_stopping = 0
            best_epoch = epoch + 1
        else:
            early_stopping += 1
            if epoch - best_epoch > 2:
                optim.increase_delta()
                best_epoch = epoch + 1
        if early_stopping == max_patience:
            break

        # save model
        optimizer.save_checkpoint(
            {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optim.state_dict(),
            },
            is_best,
            "model_snapshots/" + str(run_id),
            str(epoch) + ('_%.3f' % acc1) + ".pth.tar")
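# Usage sketch (hypothetical): a minimal way this entry point might be driven.
# The dict keys mirror the ones read inside main(); every concrete value and
# path below is an illustrative assumption, not taken from the original config.
if __name__ == '__main__':
    model_params = {'NUM_SPOOF_CLASS': 2}  # plus whatever Detector(**model_params) expects
    training_params = {'batch_size': 32, 'test_batch_size': 32, 'epochs': 100,
                       'start_epoch': 0, 'n_warmup_steps': 4000, 'log_interval': 100}
    data_files = {'train_scp': 'data/train.scp', 'train_utt2index': 'data/train_utt2index',
                  'dev_scp': 'data/dev.scp', 'dev_utt2index': 'data/dev_utt2index',
                  'dev_utt2systemID': 'data/dev_utt2systemID'}
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    main('run001', pretrained=None, data_files=data_files,
         model_params=model_params, training_params=training_params, device=device)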
def train(output_model_dir: str, input_model_path: Optional[str] = None,
          tb_path: Optional[str] = None, nuscenes_version: str = 'v1.0-mini',
          data_path: str = "data/v1.0-mini", n_scenes: Optional[int] = None,
          learning_rate: float = 1e-4, n_dumps_per_epoch: int = 10,
          n_loader_workers: int = 4, batch_size: int = 12, n_epochs: int = 50,
          device_id: Optional[List[int]] = None) -> None:
    """
    Train model, log training statistics if tb_path is specified.

    :param output_model_dir: path to directory to save model weights to
    :param input_model_path: path to model weights. If None, create new model
    :param tb_path: name of the folder for tensorboard data to be stored in
    :param nuscenes_version: version of the dataset
    :param data_path: relative path to data folder
    :param n_scenes: number of scenes in dataset
    :param learning_rate: learning rate for Adam
    :param n_dumps_per_epoch: how many times per epoch to dump images to tensorboard (not implemented yet)
    :param n_loader_workers: number of CPU workers for data loader processing
    :param batch_size: batch size
    :param n_epochs: total number of epochs to train the model
    :param device_id: list of gpu device ids to use, e.g. [0, 1]
    """
    # create path for model save
    os.makedirs(output_model_dir, exist_ok=True)

    # set up computing device for pytorch
    if torch.cuda.is_available():
        if device_id is None:
            device_id = [0]
        if max(device_id) < torch.cuda.device_count():
            # all requested device ids exist on this machine;
            # the first one becomes the root device
            device = torch.device(f'cuda:{device_id[0]}')
        else:
            # device_id is out of range, falling back to cuda:0
            print('Warning: specified device_id is out of range, using cuda:0.')
            device_id = [0]
            device = torch.device('cuda:0')
        print('Using device: GPU\n')
    else:
        device_id = []
        device = torch.device('cpu')
        print('Using device: CPU\n')

    date = datetime.datetime.now().strftime('%b-%d-%Y-%H:%M:%S')

    # set up tensorboard writer
    if tb_path is not None:
        train_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/train')
        val_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/val')
        print(f'Logging tensorboard data to directory: {tb_path}/{date}\n')
    else:
        train_writer, val_writer = None, None
        print('No tensorboard logging will be performed\n')

    # set up dataset and model
    nuscenes = create_nuscenes(data_path, nuscenes_version)
    train_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='train')
    val_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=n_loader_workers,
                              collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                            num_workers=n_loader_workers,
                            collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    print('Loaders are ready.',
          f'Number of batches in train loader: {len(train_loader)}',
          f'Number of batches in validation loader: {len(val_loader)}', sep='\n')

    frame_depth, frame_width, frame_length = train_dataset.grid_size
    model = Detector(img_depth=frame_depth)
    if input_model_path is not None:
        model.load_state_dict(torch.load(input_model_path, map_location="cpu"))
    model = model.to(device)

    criterion = DetectionLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, gamma=0.5, step_size=50)  # TODO: adjust step_size empirically

    detector_out_shape = (batch_size, model.out_channels,
                          frame_width // (2 ** model.n_pools),
                          frame_length // (2 ** model.n_pools))
    gt_former = GroundTruthFormer((frame_width, frame_length), detector_out_shape, device=device)

    if len(device_id) > 1 and max(device_id) < torch.cuda.device_count():
        # more than one device_id specified, use DataParallel
        model = nn.DataParallel(model, device_ids=device_id)
        model = model.to(device)

    best_val_score = float('-inf')
    for epoch in trange(n_epochs, desc="Epoch"):
        run_epoch(model, train_loader, criterion, gt_former, epoch, mode='train',
                  writer=train_writer, optimizer=optimizer, device=device)
        scheduler.step()
        val_loss, val_score = run_epoch(model, val_loader, criterion, gt_former, epoch,
                                        mode='val', train_loader_size=len(train_loader),
                                        writer=val_writer, device=device)
        # save model weights whenever the validation score improves
        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), f'{output_model_dir}/{date}.pth')
            print('\nModel checkpoint is saved.',
                  f'loss: {val_loss:.3f}, score: {val_score:.3f}', sep='\n')
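# Usage sketch (hypothetical): a minimal call under the default nuScenes
# v1.0-mini layout; the directory names and device ids below are assumptions
# for illustration only.
if __name__ == '__main__':
    train(output_model_dir='checkpoints/bev_detector',
          tb_path='tb_logs',
          nuscenes_version='v1.0-mini',
          data_path='data/v1.0-mini',
          batch_size=12,
          n_epochs=50,
          device_id=[0, 1])  # falls back to CPU automatically if CUDA is unavailable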
def main(pretrained, data_files, model_params, training_params, device):
    """ forward pass dev and eval data to trained model """
    batch_size = training_params['batch_size']
    test_batch_size = training_params['test_batch_size']
    epochs = training_params['epochs']
    start_epoch = training_params['start_epoch']
    n_warmup_steps = training_params['n_warmup_steps']
    log_interval = training_params['log_interval']

    kwargs = {'num_workers': 4, 'pin_memory': True} if device == torch.device('cuda') else {}

    # create model
    model = Detector(**model_params).to(device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('===> Model total parameter: {}'.format(num_params))

    if device == torch.device('cuda') and torch.cuda.device_count() > 1:
        print('multi-gpu')
        model = nn.DataParallel(model).cuda()

    if pretrained:
        # expected checkpoint path layout: model_snapshots/<run_id>/<epoch>_<acc>.pth.tar
        epoch_id = pretrained.split('/')[2].split('_')[0]
        pretrained_id = pretrained.split('/')[1]
        if os.path.isfile(pretrained):
            print("===> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(
                pretrained, map_location=lambda storage, loc: storage)  # load for cpu
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            print("===> loaded checkpoint '{}' (epoch {})".format(
                pretrained, checkpoint['epoch']))
        else:
            print("===> no checkpoint found at '{}'".format(pretrained))
            exit()
    else:
        raise NameError("a pretrained checkpoint path is required")

    # Data loading code (class analysis for multi-class classification only)
    val_data = SpoofDatsetSystemID(data_files['dev_scp'], data_files['dev_utt2index'],
                                   binary_class=False)
    eval_data = SpoofDatsetSystemID(data_files['eval_scp'], data_files['eval_utt2index'],
                                    binary_class=False)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=test_batch_size,
                                             shuffle=False, **kwargs)
    eval_loader = torch.utils.data.DataLoader(eval_data, batch_size=test_batch_size,
                                              shuffle=False, **kwargs)

    os.makedirs(data_files['scoring_dir'], exist_ok=True)

    # forward pass for dev
    print("===> forward pass for dev set")
    score_file_pth = os.path.join(
        data_files['scoring_dir'],
        str(pretrained_id) + '-epoch%s-dev_scores.txt' % (epoch_id))
    print("===> dev scoring file saved at: '{}'".format(score_file_pth))
    prediction.prediction(val_loader, model, device, score_file_pth,
                          data_files['dev_utt2systemID'])

    # forward pass for eval
    print("===> forward pass for eval set")
    score_file_pth = os.path.join(
        data_files['scoring_dir'],
        str(pretrained_id) + '-epoch%s-eval_scores.txt' % (epoch_id))
    print("===> eval scoring file saved at: '{}'".format(score_file_pth))
    prediction.prediction(eval_loader, model, device, score_file_pth,
                          data_files['eval_utt2systemID'])
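# Usage sketch (hypothetical): the split('/') indexing above implies a relative
# checkpoint path of the form model_snapshots/<run_id>/<epoch>_<acc>.pth.tar
# (index 1 -> run id, index 2 -> '<epoch>_<acc>...'). A call under that
# assumption; all values and paths here are illustrative.
if __name__ == '__main__':
    model_params = {'NUM_SPOOF_CLASS': 10}  # multi-class setup; keys are assumptions
    training_params = {'batch_size': 32, 'test_batch_size': 32, 'epochs': 0,
                       'start_epoch': 0, 'n_warmup_steps': 0, 'log_interval': 100}
    data_files = {'dev_scp': 'data/dev.scp', 'dev_utt2index': 'data/dev_utt2index',
                  'dev_utt2systemID': 'data/dev_utt2systemID',
                  'eval_scp': 'data/eval.scp', 'eval_utt2index': 'data/eval_utt2index',
                  'eval_utt2systemID': 'data/eval_utt2systemID',
                  'scoring_dir': 'scoring'}
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    main(pretrained='model_snapshots/run001/42_0.953.pth.tar',
         data_files=data_files, model_params=model_params,
         training_params=training_params, device=device)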
    print('Skipping:', args.run_dir)
    sys.exit(0)

ckpt_dir = os.path.join(args.run_dir, 'ckpt')
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

params_file = os.path.join(args.run_dir, 'params.csv')
params.to_csv(params_file, index=False)
with pd.option_context('display.width', None, 'display.max_columns', None):
    print(params)

log_file = os.path.join(args.run_dir, 'log.csv')
log = pd.DataFrame()

optimizer = Adam(detector.parameters(), lr=args.lr, weight_decay=args.weight_decay)

weights = train_data.weights
sampler = WeightedRandomSampler(weights, len(weights))

# Precompute Embeddings for train and val set once
print('Precomputing embedded features: TRAIN')
train_cache = 'cache/cache_train_{}_{}.pth'.format(
    'cos' if args.distance == 'cosine' else 'euc',
    'med' if 'medoids' in args.centroids else 'centr')
train_data = precompute_embeddings(features_state, train_data, model, args, cache=train_cache)

print('Precomputing embedded features: VAL')
val_cache = 'cache/cache_val_{}_{}.pth'.format(
    'cos' if args.distance == 'cosine' else 'euc',
    'med' if 'medoids' in args.centroids else 'centr')
val_paths, val_data = precompute_embeddings(features_state, val_data, model, args,
                                            return_paths=True, cache=val_cache)

train_loader = DataLoader(train_data, sampler=sampler, pin_memory=True,
                          batch_size=args.batch_size)
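# Caching pattern sketch (hypothetical): the cache= argument above suggests a
# compute-once, load-later convention. precompute_embeddings itself is not
# shown in this snippet, so the helper below is an assumption about that
# pattern, not the repo's implementation. It assumes os and torch are imported
# as in the surrounding script.
def cached_embeddings(cache_path, compute_fn):
    """Load embeddings from cache_path if present, else compute and save them."""
    if os.path.exists(cache_path):
        return torch.load(cache_path)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    result = compute_fn()  # expensive forward passes happen only on a cache miss
    torch.save(result, cache_path)
    return result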
                          args.image_size, args.file_loc, args.shuffle,
                          fold=0, do_transform=False)  # snippet begins mid-call: trailing dataset-constructor arguments
val_loader = DataLoader(val_dataset, num_workers=0, shuffle=True, batch_size=args.batch_size,
                        pin_memory=False, drop_last=True)

model = Detector(args.dropout_rate).cuda()
set_requires_grad([model.feature_extractor], False)  # freeze the feature extractor

optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-6)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs,
                                                 eta_min=args.min_lr, last_epoch=-1)
bce_loss = torch.nn.BCEWithLogitsLoss()

epoch = 0
step = 0
writer = SummaryWriter(args.logging_path)

if args.resume_from_last:
    args.checkpoint_path = get_last_checkpoint_filename(args.logging_path)
if args.checkpoint_path != "":
    epoch, step = load_checkpoint(model, optimizer, args.checkpoint_path)
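# Checkpoint helpers sketch (hypothetical): load_checkpoint and
# get_last_checkpoint_filename are called above but not defined in this
# snippet. A plausible sketch, assuming checkpoints are dicts with 'model',
# 'optimizer', 'epoch' and 'step' keys; os and torch imported as above.
import glob  # only needed by this sketch

def get_last_checkpoint_filename(logging_path):
    """Return the newest .pth file in logging_path, or '' if none exist."""
    candidates = glob.glob(os.path.join(logging_path, '*.pth'))
    return max(candidates, key=os.path.getmtime) if candidates else ""

def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model/optimizer state and return the stored (epoch, step)."""
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'], checkpoint['step']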