Example #1
import os
from dataclasses import asdict

import pandas as pd

from aum import AUMCalculator


# The aum_data fixture is provided elsewhere in the test suite.
def test_aum_finalize(tmp_path, aum_data):
    inputs, outputs = aum_data
    save_dir = tmp_path.as_posix()
    aum_calculator = AUMCalculator(save_dir=save_dir, compressed=False)

    for data in inputs:
        aum_calculator.update(data['logits'], data['targets'],
                              data['sample_ids'])

    aum_calculator.finalize()
    final_vals = pd.read_csv(os.path.join(save_dir, 'aum_values.csv'))
    detailed_vals = pd.read_csv(os.path.join(save_dir, 'full_aum_records.csv'))

    # Let's first verify the detailed vals
    records = []
    for output in outputs:
        records.extend(output.values())

    expected_detailed_vals = pd.DataFrame([
        asdict(record) for record in records
    ]).sort_values(by=['sample_id', 'num_measurements']).reset_index(drop=True)
    assert detailed_vals.equals(expected_detailed_vals)

    # Now let's verify the final vals
    final_dict = {record.sample_id: record.aum for record in records}
    expected_final_vals = []
    for key, val in final_dict.items():
        expected_final_vals.append({'sample_id': key, 'aum': val})
    expected_final_vals = pd.DataFrame(expected_final_vals).sort_values(
        by='aum', ascending=False).reset_index(drop=True)

    assert final_vals.equals(expected_final_vals)
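
This test exercises the full AUMCalculator life cycle: update() is called once per batch of logits, targets, and sample ids; finalize() writes the results to disk; and the output lands in aum_values.csv (plus full_aum_records.csv when compressed=False). The following is a minimal standalone sketch of that workflow, with random tensors standing in for real model outputs; it assumes the package-level import from aum import AUMCalculator and that sample ids may be passed as a plain list of ints, which the snippet above does not guarantee.

import os
import tempfile

import pandas as pd
import torch

from aum import AUMCalculator  # assumed top-level import

save_dir = tempfile.mkdtemp()
aum_calculator = AUMCalculator(save_dir=save_dir, compressed=False)

# Simulate three "epochs" over 16 samples with 10 classes.
num_samples, num_classes = 16, 10
targets = torch.randint(0, num_classes, (num_samples,))
sample_ids = list(range(num_samples))  # assumed to be acceptable as a plain list
for epoch in range(3):
    logits = torch.randn(num_samples, num_classes)  # stand-in for model outputs
    aum_calculator.update(logits, targets, sample_ids)

# Writes aum_values.csv (and full_aum_records.csv, since compressed=False).
aum_calculator.finalize()
print(pd.read_csv(os.path.join(save_dir, 'aum_values.csv')).head())
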
Example #2
    def train(self,
              num_epochs=300,
              batch_size=256,
              test_at_end=True,
              lr=0.1,
              wd=1e-4,
              momentum=0.9,
              lr_drops=[0.5, 0.75],
              aum_wtr=False,
              rand_weight=False,
              **kwargs):
        """
        Training script

        :param int num_epochs: (default 300)
        :param int batch_size: (default 256)
        :param float lr: Learning rate
        :param float wd: Weight decay
        :param float momentum: Momentum
        :param list lr_drops: When to drop the learning rate (by a factor of 10) as a percentage of total training time.

        :param str aum_wtr: (optional) The path of the model/results directory to load AUM_WTR weights from.
        :param bool rand_weight: (optional, default False) Uses rectified normal random weighting if True.
        """
        # Model
        model = self.model
        if torch.cuda.is_available():
            model = model.cuda()
            if torch.cuda.device_count() > 1:
                model = torch.nn.DataParallel(model).cuda()

        # Optimizer
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    weight_decay=wd,
                                    momentum=momentum,
                                    nesterov=True)
        milestones = [
            int(lr_drop * num_epochs) for lr_drop in (lr_drops or [])
        ]
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=milestones,
                                                         gamma=0.1)
        logging.info(f"\nOPTIMIZER:\n{optimizer}")
        logging.info(f"SCHEDULER:\n{scheduler.milestones}")

        # Initialize the AUM calculator object
        aum_calculator = AUMCalculator(save_dir=self.savedir, compressed=False)

        train_data = OrderedDict()
        train_data["train_indices"] = self.train_set.indices
        train_data["valid_indices"] = (self.valid_set.indices
                                       if self.valid_set is not None else
                                       torch.tensor([], dtype=torch.long))
        train_data["true_targets"] = self.train_set.targets
        train_data["assigned_targets"] = self.train_set.assigned_targets

        # Storage to log results
        results = []

        # Train model
        best_error = 1
        for epoch in range(num_epochs):
            train_results = self.train_epoch(model=model,
                                             optimizer=optimizer,
                                             epoch=epoch,
                                             num_epochs=num_epochs,
                                             batch_size=batch_size,
                                             aum_calculator=aum_calculator,
                                             aum_wtr=aum_wtr,
                                             rand_weight=rand_weight,
                                             **kwargs)
            if self.valid_set is not None:
                valid_results = self.test(model=model,
                                          split="valid",
                                          batch_size=batch_size,
                                          epoch=epoch,
                                          **kwargs)
            else:
                valid_results = self.test(model,
                                          split="test",
                                          batch_size=batch_size,
                                          epoch=epoch,
                                          **kwargs)
            scheduler.step()

            # Determine if model is the best
            if self.valid_set is not None:
                self.save()
            elif best_error > valid_results.error:
                best_error = valid_results.error
                logging.info('New best error: %.4f' % valid_results.error)
                self.save()

            # Log results
            logging.info(f"\nTraining {repr(train_results)}")
            logging.info(f"\nValidation {repr(valid_results)}")
            logging.info('')
            results.append(
                OrderedDict([("epoch", f"{epoch + 1:03d}"),
                             *[(f"train_{field}", val)
                               for field, val in train_results.items()],
                             *[(f"valid_{field}", val)
                               for field, val in valid_results.items()]]))
            pd.DataFrame(results).set_index("epoch").to_csv(
                os.path.join(self.savedir, "train_log.csv"))

            # Save metadata around train set (like which labels were flipped)
            torch.save(train_data, os.path.join(self.savedir,
                                                "train_data.pth"))

        # Once we're finished training, calculate AUM
        aum_calculator.finalize()

        # Maybe test (last epoch)
        if test_at_end and self.valid_set is not None:
            test_results = self.test(model=model, **kwargs)
            logging.info(f"\nTest (no early stopping) {repr(test_results)}")
            shutil.copyfile(
                os.path.join(self.savedir, "results_test.csv"),
                os.path.join(self.savedir, "results_test_noearlystop.csv"))
            results.append(
                OrderedDict([(f"test_{field}", val)
                             for field, val in test_results.items()]))
            pd.DataFrame(results).set_index("epoch").to_csv(
                os.path.join(self.savedir, "train_log.csv"))

        # Load best model
        self.save(suffix=".last")
        self.load()

        # Maybe test (best epoch)
        if test_at_end and self.valid_set is not None:
            test_results = self.test(model=model, **kwargs)
            logging.info(f"\nEarly Stopped Model Test {repr(test_results)}")
            results.append(
                OrderedDict([(f"test_best_{field}", val)
                             for field, val in test_results.items()]))
        pd.DataFrame(results).set_index("epoch").to_csv(
            os.path.join(self.savedir, "train_log.csv"))

        return self
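
The heavy lifting in Example #2 happens inside self.train_epoch, which is not shown in this snippet. Purely as a hypothetical sketch (the name train_batch, its argument list, and the assumption that the loader yields (inputs, targets, sample_ids) triples are illustrative, not part of the original code), the per-batch step that feeds the AUM calculator could look like this:

def train_batch(model, optimizer, criterion, batch, aum_calculator, device):
    # Hypothetical per-batch step; assumes the loader yields
    # (inputs, targets, sample_ids) triples, e.g. via a wrapper like the
    # DatasetWithIndex used in Example #3.
    inputs, targets, sample_ids = batch
    inputs, targets = inputs.to(device), targets.to(device)

    logits = model(inputs)
    loss = criterion(logits, targets)

    # Record margins for this batch; detach so no autograd graph is retained.
    # sample_ids are assumed to come back as a tensor from the default collate,
    # hence the conversion to plain ints.
    aum_calculator.update(logits.detach(), targets, sample_ids.tolist())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
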
Example #3
def main(args):
    pprint(vars(args))

    # Setup experiment folder structure
    # Create output folder if it doesn't exist
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # save out args
    with open(os.path.join(args.output_dir, 'args.txt'), 'w+') as f:
        pprint(vars(args), f)

    # Setup summary writer
    summary_writer = SummaryWriter(
        log_dir=os.path.join(args.output_dir, 'tb_logs'))

    # Set seeds
    set_seed(42)

    # Load dataset
    # Data transforms
    mean = [0.5071, 0.4867, 0.4408]
    stdv = [0.2675, 0.2565, 0.2761]
    train_transforms = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])
    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])

    # Datasets
    train_set = datasets.CIFAR100(args.data_dir,
                                  train=True,
                                  transform=train_transforms,
                                  download=True)
    val_set = datasets.CIFAR100(args.data_dir,
                                train=True,
                                transform=test_transforms)
    test_set = datasets.CIFAR100(args.data_dir,
                                 train=False,
                                 transform=test_transforms)

    indices = torch.randperm(len(train_set))
    train_indices = indices[:len(indices) - args.valid_size]
    valid_indices = indices[len(indices) - args.valid_size:]
    train_set = torch.utils.data.Subset(train_set, train_indices)
    val_set = torch.utils.data.Subset(val_set, valid_indices)

    train_set = DatasetWithIndex(train_set)
    val_set = DatasetWithIndex(val_set)
    test_set = DatasetWithIndex(test_set)

    val_loader = DataLoader(val_set,
                            batch_size=args.val_batch_size,
                            shuffle=False,
                            pin_memory=(torch.cuda.is_available()))
    test_loader = DataLoader(test_set,
                             batch_size=args.val_batch_size,
                             shuffle=False,
                             pin_memory=(torch.cuda.is_available()))

    # Load Model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = resnet34(num_classes=100)
    model = model.to(device)
    num_params = sum(x.numel() for x in model.parameters() if x.requires_grad)
    print(model)
    print(f'Number of parameters: {num_params}')

    # Create optimizer & lr scheduler
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(parameters,
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                nesterov=True)
    milestones = [int(0.5 * args.num_epochs), int(0.75 * args.num_epochs)]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=milestones,
                                                     gamma=0.1)

    # Keep track of AUM
    aum_calculator = AUMCalculator(args.output_dir,
                                   compressed=(not args.detailed_aum))

    # Keep track of things
    global_step = 0
    best_error = math.inf

    print('Beginning training')
    for epoch in range(args.num_epochs):

        train_loader = DataLoader(train_set,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  pin_memory=(torch.cuda.is_available()),
                                  num_workers=0)

        train_metrics = {
            'loss': AverageMeter(),
            'error': AverageMeter(),
            'batch_time': AverageMeter()
        }
        num_batches = len(train_loader)
        for batch_step, batch in enumerate(train_loader):
            train_step(args, summary_writer, train_metrics, aum_calculator,
                       args.log_interval, batch_step, num_batches, batch,
                       epoch, args.num_epochs, global_step, model, optimizer,
                       device)

            global_step += 1

        scheduler.step()

        val_metrics = {
            'loss': AverageMeter(),
            'error': AverageMeter(),
            'batch_time': AverageMeter()
        }
        num_batches = len(val_loader)
        for batch_step, batch in enumerate(val_loader):
            eval_step(args, 'VAL', val_metrics, args.log_interval, batch_step,
                      num_batches, batch, epoch, args.num_epochs, model,
                      device)

        # log eval metrics to tensorboard
        summary_writer.add_scalar('val/error', val_metrics['error'].avg,
                                  global_step)
        summary_writer.add_scalar('val/loss', val_metrics['loss'].avg,
                                  global_step)
        summary_writer.add_scalar('val/batch_time',
                                  val_metrics['batch_time'].avg, global_step)

        # Save best model
        if val_metrics['error'].avg < best_error:
            best_error = val_metrics['error'].avg
            torch.save(model.state_dict(),
                       os.path.join(args.output_dir, 'best.pt'))

    # Finalize aum calculator
    aum_calculator.finalize()

    # Eval best model on the test set
    model.load_state_dict(torch.load(os.path.join(args.output_dir, 'best.pt')))
    test_metrics = {
        'loss': AverageMeter(),
        'error': AverageMeter(),
        'batch_time': AverageMeter()
    }
    num_batches = len(test_loader)
    for batch_step, batch in enumerate(test_loader):
        eval_step(args, 'TEST', test_metrics, args.log_interval, batch_step,
                  num_batches, batch, -1, -1, model, device)

    # log test metrics to tensorboard
    summary_writer.add_scalar('test/error', test_metrics['error'].avg,
                              global_step)
    summary_writer.add_scalar('test/loss', test_metrics['loss'].avg,
                              global_step)
    summary_writer.add_scalar('test/batch_time',
                              test_metrics['batch_time'].avg, global_step)

    # log test metrics to console
    results = '\t'.join([
        'FINAL TEST RESULTS',
        f'Loss: {test_metrics["loss"].avg:.3f}',
        f'Error: {test_metrics["error"].avg:.3f}',
    ])
    print(results)
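
Example #3 wraps every dataset in DatasetWithIndex before building the loaders, so each batch carries the stable sample ids that AUMCalculator.update needs. The wrapper's implementation is not shown in the snippet (it may come from the aum package or be defined locally); a minimal plausible version simply appends the index to whatever the wrapped dataset returns:

from torch.utils.data import Dataset


class DatasetWithIndex(Dataset):
    """Minimal sketch: yield (data, target, index) so the index can serve
    as the sample id for AUMCalculator.update."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        data, target = self.dataset[index]
        return data, target, index
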