def test_aum_update(aum_data):
    """Each call to AUMCalculator.update returns the expected records for that batch."""
    inputs, outputs = aum_data
    calculator = AUMCalculator(save_dir=None)
    # Feed the first two batches in order; update() is stateful, so the
    # second batch's expected output depends on the first having been seen.
    for batch, expected in zip(inputs[:2], outputs[:2]):
        actual = calculator.update(batch['logits'], batch['targets'],
                                   batch['sample_ids'])
        assert actual == expected
def test_aum_finalize(tmp_path, aum_data):
    """finalize() writes both the final-value and detailed-record CSVs correctly."""
    inputs, outputs = aum_data
    save_dir = tmp_path.as_posix()

    calculator = AUMCalculator(save_dir=save_dir, compressed=False)
    for batch in inputs:
        calculator.update(batch['logits'], batch['targets'],
                          batch['sample_ids'])
    calculator.finalize()

    final_vals = pd.read_csv(os.path.join(save_dir, 'aum_values.csv'))
    detailed_vals = pd.read_csv(os.path.join(save_dir, 'full_aum_records.csv'))

    # Flatten every per-update record dict into one list of records.
    records = [record for output in outputs for record in output.values()]

    # First check the detailed (per-measurement) records.
    expected_detailed = pd.DataFrame(
        [asdict(record) for record in records]
    ).sort_values(by=['sample_id', 'num_measurements']).reset_index(drop=True)
    assert detailed_vals.equals(expected_detailed)

    # Now verify the final values; later records overwrite earlier ones per
    # sample_id, so the dict keeps only each sample's last AUM.
    last_aum = {record.sample_id: record.aum for record in records}
    expected_final = pd.DataFrame(
        [{'sample_id': sid, 'aum': aum} for sid, aum in last_aum.items()]
    ).sort_values(by='aum', ascending=False).reset_index(drop=True)
    assert final_vals.equals(expected_final)
def train(self, num_epochs=300, batch_size=256, test_at_end=True, lr=0.1,
          wd=1e-4, momentum=0.9, lr_drops=(0.5, 0.75), aum_wtr=False,
          rand_weight=False, **kwargs):
    """
    Training script.

    :param int num_epochs: (default 300)
    :param int batch_size: (default 256)
    :param bool test_at_end: run test-set evaluation after training (default True)
    :param float lr: Learning rate
    :param float wd: Weight decay
    :param float momentum: Momentum
    :param lr_drops: When to drop the learning rate (by a factor of 10)
        as a percentage of total training time (default (0.5, 0.75)).
    :param str aum_wtr: (optional) The path of the model/results directory to
        load AUM_WTR weights from.
    :param bool rand_weight: (optional, default False) uses rectified normal
        random weighting if True.
    """
    # Model
    model = self.model
    if torch.cuda.is_available():
        model = model.cuda()
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model).cuda()

    # Optimizer with a step schedule: LR drops 10x at each milestone epoch.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr,
                                weight_decay=wd,
                                momentum=momentum,
                                nesterov=True)
    milestones = [int(lr_drop * num_epochs) for lr_drop in (lr_drops or [])]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=milestones,
                                                     gamma=0.1)
    logging.info(f"\nOPTIMIZER:\n{optimizer}")
    logging.info(f"SCHEDULER:\n{scheduler.milestones}")

    # Initialize AUM calculator object
    aum_calculator = AUMCalculator(save_dir=self.savedir, compressed=False)

    # Metadata about the split and the (possibly flipped) labels, saved after
    # training so AUM results can be joined back to ground truth.
    train_data = OrderedDict()
    train_data["train_indices"] = self.train_set.indices
    train_data["valid_indices"] = (self.valid_set.indices
                                   if self.valid_set is not None else
                                   torch.tensor([], dtype=torch.long))
    train_data["true_targets"] = self.train_set.targets
    train_data["assigned_targets"] = self.train_set.assigned_targets

    # Storage to log results
    results = []

    # Train model
    best_error = 1
    for epoch in range(num_epochs):
        train_results = self.train_epoch(model=model,
                                         optimizer=optimizer,
                                         epoch=epoch,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         aum_calculator=aum_calculator,
                                         aum_wtr=aum_wtr,
                                         rand_weight=rand_weight,
                                         **kwargs)
        if self.valid_set is not None:
            valid_results = self.test(model=model,
                                      split="valid",
                                      batch_size=batch_size,
                                      epoch=epoch,
                                      **kwargs)
        else:
            valid_results = self.test(model,
                                      split="test",
                                      batch_size=batch_size,
                                      epoch=epoch,
                                      **kwargs)
        scheduler.step()

        # Determine if model is the best.
        # BUGFIX: this condition was inverted ("is not None"), which saved a
        # checkpoint every epoch whenever a validation set existed and never
        # tracked best_error — so the later self.load() did not restore the
        # early-stopped model. With no validation set there is nothing to
        # early-stop on, so save unconditionally; otherwise save on improvement.
        if self.valid_set is None:
            self.save()
        elif best_error > valid_results.error:
            best_error = valid_results.error
            logging.info('New best error: %.4f' % valid_results.error)
            self.save()

        # Log results
        logging.info(f"\nTraining {repr(train_results)}")
        logging.info(f"\nValidation {repr(valid_results)}")
        logging.info('')
        results.append(
            OrderedDict([("epoch", f"{epoch + 1:03d}"),
                         *[(f"train_{field}", val)
                           for field, val in train_results.items()],
                         *[(f"valid_{field}", val)
                           for field, val in valid_results.items()]]))
        pd.DataFrame(results).set_index("epoch").to_csv(
            os.path.join(self.savedir, "train_log.csv"))

    # Save metadata around train set (like which labels were flipped)
    torch.save(train_data, os.path.join(self.savedir, "train_data.pth"))

    # Once we're finished training calculate aum
    aum_calculator.finalize()

    # Maybe test (last epoch)
    if test_at_end and self.valid_set is not None:
        test_results = self.test(model=model, **kwargs)
        logging.info(f"\nTest (no early stopping) {repr(test_results)}")
        shutil.copyfile(
            os.path.join(self.savedir, "results_test.csv"),
            os.path.join(self.savedir, "results_test_noearlystop.csv"))
        results.append(
            OrderedDict([(f"test_{field}", val)
                         for field, val in test_results.items()]))
        pd.DataFrame(results).set_index("epoch").to_csv(
            os.path.join(self.savedir, "train_log.csv"))

    # Load best model (keep a copy of the final weights first)
    self.save(suffix=".last")
    self.load()

    # Maybe test (best epoch)
    if test_at_end and self.valid_set is not None:
        test_results = self.test(model=model, **kwargs)
        logging.info(f"\nEarly Stopped Model Test {repr(test_results)}")
        results.append(
            OrderedDict([(f"test_best_{field}", val)
                         for field, val in test_results.items()]))
        pd.DataFrame(results).set_index("epoch").to_csv(
            os.path.join(self.savedir, "train_log.csv"))
    return self
def main(args):
    """Train ResNet-34 on CIFAR-100 while tracking AUM statistics.

    Sets up the output directory, data, model, optimizer, and LR schedule;
    trains for ``args.num_epochs`` epochs (checkpointing on best validation
    error); finalizes the AUM calculator; then evaluates the best checkpoint
    on the test set.

    :param args: parsed command-line namespace (output_dir, data_dir,
        valid_size, batch sizes, learning_rate, momentum, num_epochs,
        log_interval, detailed_aum, ...).
    """
    pprint(vars(args))

    # Setup experiment folder structure
    # Create output folder if it doesn't exist
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # save out args
    with open(os.path.join(args.output_dir, 'args.txt'), 'w+') as f:
        pprint(vars(args), f)

    # Setup summary writer
    summary_writer = SummaryWriter(
        log_dir=os.path.join(args.output_dir, 'tb_logs'))

    # Set seeds
    set_seed(42)

    # Load dataset
    # Data transforms (CIFAR-100 per-channel statistics)
    mean = [0.5071, 0.4867, 0.4408]
    stdv = [0.2675, 0.2565, 0.2761]
    train_transforms = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])
    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])

    # Datasets: val_set is carved out of the train split but uses the
    # deterministic eval transforms.
    train_set = datasets.CIFAR100(args.data_dir,
                                  train=True,
                                  transform=train_transforms,
                                  download=True)
    val_set = datasets.CIFAR100(args.data_dir,
                                train=True,
                                transform=test_transforms)
    test_set = datasets.CIFAR100(args.data_dir,
                                 train=False,
                                 transform=test_transforms)

    indices = torch.randperm(len(train_set))
    train_indices = indices[:len(indices) - args.valid_size]
    valid_indices = indices[len(indices) - args.valid_size:]
    train_set = torch.utils.data.Subset(train_set, train_indices)
    val_set = torch.utils.data.Subset(val_set, valid_indices)

    train_set = DatasetWithIndex(train_set)
    val_set = DatasetWithIndex(val_set)
    test_set = DatasetWithIndex(test_set)

    val_loader = DataLoader(val_set,
                            batch_size=args.val_batch_size,
                            shuffle=False,
                            pin_memory=(torch.cuda.is_available()))
    test_loader = DataLoader(test_set,
                             batch_size=args.val_batch_size,
                             shuffle=False,
                             pin_memory=(torch.cuda.is_available()))

    # Load Model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = resnet34(num_classes=100)
    model = model.to(device)
    num_params = sum(x.numel() for x in model.parameters() if x.requires_grad)
    print(model)
    # BUGFIX: this f-string was a bare expression and was silently discarded;
    # it must be printed.
    print(f'Number of parameters: {num_params}')

    # Create optimizer & lr scheduler
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(parameters,
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                nesterov=True)
    # BUGFIX: milestones must be integer epoch indices. The previous float
    # milestones (0.5 * num_epochs) never match an integer epoch when
    # num_epochs is odd, so the LR drop would silently never fire.
    milestones = [int(0.5 * args.num_epochs), int(0.75 * args.num_epochs)]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=milestones,
                                                     gamma=0.1)

    # Keep track of AUM
    aum_calculator = AUMCalculator(args.output_dir,
                                   compressed=(not args.detailed_aum))

    # Keep track of things
    global_step = 0
    best_error = math.inf

    print('Beginning training')
    for epoch in range(args.num_epochs):
        train_loader = DataLoader(train_set,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  pin_memory=(torch.cuda.is_available()),
                                  num_workers=0)
        train_metrics = {
            'loss': AverageMeter(),
            'error': AverageMeter(),
            'batch_time': AverageMeter()
        }
        num_batches = len(train_loader)
        for batch_step, batch in enumerate(train_loader):
            train_step(args, summary_writer, train_metrics, aum_calculator,
                       args.log_interval, batch_step, num_batches, batch,
                       epoch, args.num_epochs, global_step, model, optimizer,
                       device)
            global_step += 1
        scheduler.step()

        val_metrics = {
            'loss': AverageMeter(),
            'error': AverageMeter(),
            'batch_time': AverageMeter()
        }
        num_batches = len(val_loader)
        for batch_step, batch in enumerate(val_loader):
            eval_step(args, 'VAL', val_metrics, args.log_interval, batch_step,
                      num_batches, batch, epoch, args.num_epochs, model,
                      device)

        # log eval metrics to tensorboard
        summary_writer.add_scalar('val/error', val_metrics['error'].avg,
                                  global_step)
        summary_writer.add_scalar('val/loss', val_metrics['loss'].avg,
                                  global_step)
        summary_writer.add_scalar('val/batch_time',
                                  val_metrics['batch_time'].avg, global_step)

        # Save best model
        if val_metrics['error'].avg < best_error:
            best_error = val_metrics['error'].avg
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, 'best.pt'))

    # Finalize aum calculator
    aum_calculator.finalize()

    # Eval best model on on test set
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, 'best.pt')))
    test_metrics = {
        'loss': AverageMeter(),
        'error': AverageMeter(),
        'batch_time': AverageMeter()
    }
    num_batches = len(test_loader)
    for batch_step, batch in enumerate(test_loader):
        # epoch/num_epochs are -1 to mark "post-training" in the log output.
        eval_step(args, 'TEST', test_metrics, args.log_interval, batch_step,
                  num_batches, batch, -1, -1, model, device)

    # log eval metrics to tensorboard
    summary_writer.add_scalar('test/error', test_metrics['error'].avg,
                              global_step)
    summary_writer.add_scalar('test/loss', test_metrics['loss'].avg,
                              global_step)
    summary_writer.add_scalar('test/batch_time',
                              test_metrics['batch_time'].avg, global_step)

    # log test metrics to console
    results = '\t'.join([
        'FINAL TEST RESULTS',
        f'Loss: {test_metrics["loss"].avg:.3f}',
        f'Error: {test_metrics["error"].avg:.3f}',
    ])
    print(results)