def main():

    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the Model.
    print()
    model = affnet.ResNetAffNet(pretrained=config.IS_PRETRAINED,
                                num_classes=config.NUM_CLASSES)
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader, test_loader = umd_dataset_loaders.load_umd_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=config.GAMMA)

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    best_Fwb = -np.inf

    for epoch in range(0, num_epochs):
        print()

        if epoch < config.EPOCH_TO_TRAIN_FULL_DATASET:
            is_subsample = True
        else:
            is_subsample = False

        # train & val for one epoch
        model, optimizer = train_utils.train_one_epoch(
            model,
            optimizer,
            train_loader,
            config.DEVICE,
            epoch,
            writer,
            is_subsample=is_subsample)
        model, optimizer = train_utils.val_one_epoch(model,
                                                     optimizer,
                                                     val_loader,
                                                     config.DEVICE,
                                                     epoch,
                                                     writer,
                                                     is_subsample=is_subsample)
        # update learning rate.
        lr_scheduler.step()

        # # eval Fwb
        # model, Fwb = eval_utils.affnet_eval_umd(model, test_loader)
        # writer.add_scalar('eval/Fwb', Fwb, int(epoch))
        # # save best model.
        # if Fwb > best_Fwb:
        #     best_Fwb = Fwb
        #     writer.add_scalar('eval/Best_Fwb', best_Fwb, int(epoch))
        #     checkpoint_path = config.BEST_MODEL_SAVE_PATH
        #     train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
        #     print("Saving best model .. best Fwb={:.5f} ..".format(best_Fwb))
        #
        # Save a checkpoint for this epoch.
        checkpoint_path = config.MODEL_SAVE_PATH + 'affnet_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
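The MultiStepLR scheduler used above multiplies the learning rate by gamma each time lr_scheduler.step() crosses one of the configured milestones. A minimal standalone sketch of that behaviour (the milestone and gamma values are illustrative, not taken from config):

import torch

# Dummy parameter and optimizer, just to drive the scheduler.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[20, 40], gamma=0.1)

lrs = []
for epoch in range(50):
    lrs.append(optimizer.param_groups[0]['lr'])  # lr used for this epoch
    optimizer.step()       # placeholder for one epoch of training
    lr_scheduler.step()    # decay after each epoch, as in the loop above

# lrs is ~0.1 for epochs 0-19, ~0.01 for epochs 20-39, ~0.001 for epochs 40-49.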
Example #2
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx

    if configs.gpu_idx is not None:
        print("Use GPU: {} for training".format(configs.gpu_idx))
        configs.device = torch.device('cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
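            # e.g. with 2 nodes of 4 GPUs each, node rank 1 and gpu_idx 2
            # give global rank 1 * 4 + 2 = 6 (illustrative numbers).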
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx

        dist.init_process_group(backend=configs.dist_backend,
                                init_method=configs.dist_url,
                                world_size=configs.world_size,
                                rank=configs.rank)

    configs.is_master_node = (not configs.distributed) or (
        configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(
            log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # model
    model = get_model(configs)

    # Data Parallel
    model = make_data_parallel(model, configs)

    # Freeze model
    model = freeze_model(model, configs.freeze_modules_list)

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(
            num_parameters))

    optimizer = get_optimizer(configs, model, is_warm_up=False)
    lr_scheduler = get_lr_scheduler(optimizer, configs)
    best_val_loss = np.inf
    earlystop_count = 0

    # optionally load weight from a checkpoint
    if configs.pretrained_path is not None:
        model = load_pretrained_model(model, configs.pretrained_path, gpu_idx,
                                      configs.overwrite_global_2_local)
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(
                configs.pretrained_path))

    # optionally resume from a checkpoint
    if configs.resume_path is not None:
        checkpoint = resume_model(configs.resume_path, configs.arch,
                                  configs.gpu_idx)
        if hasattr(model, 'module'):
            model.module.load_state_dict(checkpoint['state_dict'])
        else:
            model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        best_val_loss = checkpoint['best_val_loss']
        earlystop_count = checkpoint['earlystop_count']
        configs.start_epoch = checkpoint['epoch'] + 1

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    train_loader, val_loader, train_sampler = create_train_val_dataloader(
        configs)
    if logger is not None:
        logger.info('number of batches in train set: {}'.format(
            len(train_loader)))
        if val_loader is not None:
            logger.info('number of batches in val set: {}'.format(
                len(val_loader)))

    if configs.evaluate:
        assert val_loader is not None, "The validation dataloader should not be None"
        val_loss = validate_one_epoch(val_loader, model,
                                      configs.start_epoch - 1, configs, logger)
        print('Evaluate, val_loss: {}'.format(val_loss))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        # Get the current learning rate
        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch,
                                             configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}] learning rate: {:.2e}'.format(
                epoch, configs.num_epochs, lr))

        if configs.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_loss = train_one_epoch(train_loader, model, optimizer, epoch,
                                     configs, logger)
        # evaluate on validation set
        if not configs.no_val:
            val_loss = validate_one_epoch(val_loader, model, epoch, configs,
                                          logger)

        # Adjust learning rate
        if configs.lr_type == 'step_lr':
            lr_scheduler.step()
        elif configs.lr_type == 'plateau':
            assert not configs.no_val, "Only use plateau when there is a validation set"
            lr_scheduler.step(val_loss)

        if not configs.no_val:
            is_best = val_loss <= best_val_loss
            best_val_loss = min(val_loss, best_val_loss)
            print_string = '\t--- train_loss: {:.4f}, val_loss: {:.4f}, best_val_loss: {:.4f}\t'.format(
                train_loss, val_loss, best_val_loss)
            if tb_writer is not None:
                tb_writer.add_scalars('Loss', {
                    'train': train_loss,
                    'val': val_loss
                }, epoch)

            if configs.is_master_node and (is_best or (
                (epoch % configs.checkpoint_freq) == 0)):
                saved_state = get_saved_state(model, optimizer, lr_scheduler,
                                              epoch, configs, best_val_loss,
                                              earlystop_count)
                save_checkpoint(configs.checkpoints_dir, configs.saved_fn,
                                saved_state, is_best, epoch)

            if configs.earlystop_patience:
                earlystop_count = 0 if is_best else (earlystop_count + 1)
                print_string += ' |||\t earlystop_count: {}'.format(
                    earlystop_count)
                if configs.earlystop_patience <= earlystop_count:
                    print_string += '\n\t--- Early stopping!!!'
                    if logger is not None:
                        # log here; the shared logging call below is skipped by break
                        logger.info(print_string)
                    break
                else:
                    print_string += '\n\t--- Continue training..., earlystop_count: {}'.format(
                        earlystop_count)

            if logger is not None:
                logger.info(print_string)
        else:
            if tb_writer is not None:
                tb_writer.add_scalars('Loss', {'train': train_loss}, epoch)
            if configs.is_master_node and ((epoch % configs.checkpoint_freq)
                                           == 0):
                saved_state = get_saved_state(model, optimizer, lr_scheduler,
                                              epoch, configs, best_val_loss,
                                              earlystop_count)
                save_checkpoint(configs.checkpoints_dir, configs.saved_fn,
                                saved_state, False, epoch)

    if tb_writer is not None:
        tb_writer.close()
    cleanup()
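A worker like main_worker above is normally spawned once per GPU. A minimal launch sketch, assuming configs has already been populated by the caller (ngpus_per_node, world_size, dist_url, etc.):

import torch.multiprocessing as mp

def launch(configs):
    if configs.multiprocessing_distributed:
        # One process per GPU; each process receives its gpu index as the
        # first positional argument, matching main_worker(gpu_idx, configs).
        mp.spawn(main_worker, nprocs=configs.ngpus_per_node, args=(configs,))
    else:
        # Single-process run on the configured GPU (or CPU if gpu_idx is None).
        main_worker(configs.gpu_idx, configs)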
def main():

    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the Model.
    print()
    # Compare Pytorch-Simple-MaskRCNN with Torchvision MaskRCNN.
    model = model_utils.get_model_instance_segmentation(
        pretrained=config.IS_PRETRAINED, num_classes=config.NUM_CLASSES)
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader, test_loader = arl_affpose_dataset_loaders.load_arl_affpose_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=config.GAMMA)

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    best_Fwb, best_mAP = -np.inf, -np.inf

    for epoch in range(0, num_epochs):
        print()

        # train & val for one epoch
        model, optimizer = train_utils.train_one_epoch(model,
                                                       optimizer,
                                                       train_loader,
                                                       config.DEVICE,
                                                       epoch,
                                                       writer,
                                                       is_subsample=True)
        model, optimizer = train_utils.val_one_epoch(model,
                                                     optimizer,
                                                     val_loader,
                                                     config.DEVICE,
                                                     epoch,
                                                     writer,
                                                     is_subsample=True)
        # update learning rate.
        lr_scheduler.step()

        # Save a checkpoint for this epoch.
        checkpoint_path = config.MODEL_SAVE_PATH + 'maskrcnn_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
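train_utils.save_checkpoint is project-specific and not shown here. A minimal sketch of what such a helper typically does, assuming the checkpoint layout implied by the restore code further down (which reads checkpoint["model"]):

import torch

def save_checkpoint(model, optimizer, epoch, checkpoint_path):
    # Hypothetical helper: persist everything needed to resume or evaluate.
    torch.save({
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, checkpoint_path)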
def main():

    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the Model.
    print()
    model = maskrcnn.ResNetMaskRCNN(pretrained=config.IS_PRETRAINED,
                                    num_classes=config.COCO_NUM_CLASSES)
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader = coco_dataset_loaders.load_coco_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=config.GAMMA)

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    for epoch in range(0, num_epochs):
        print()

        if epoch < config.EPOCH_TO_TRAIN_FULL_DATASET:
            is_subsample = True
        else:
            is_subsample = False

        # train & val for one epoch
        model, optimizer = train_utils.train_one_epoch(
            model,
            optimizer,
            train_loader,
            config.DEVICE,
            epoch,
            writer,
            is_subsample=is_subsample)
        model, optimizer = train_utils.val_one_epoch(model,
                                                     optimizer,
                                                     val_loader,
                                                     config.DEVICE,
                                                     epoch,
                                                     writer,
                                                     is_subsample=is_subsample)
        # update learning rate.
        lr_scheduler.step()

        # Save a checkpoint for this epoch.
        CHECKPOINT_PATH = config.MODEL_SAVE_PATH + 'maskrcnn_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, CHECKPOINT_PATH)
        print(f'saved model to {CHECKPOINT_PATH} ..')
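The seeding block at the top of each main() covers the Python, NumPy and PyTorch RNGs. Fully deterministic GPU runs usually also require the cuDNN flags below; a sketch of a more complete helper (at the cost of some speed):

import random
import numpy as np
import torch

def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)           # seed all GPUs, not just the current one
    torch.backends.cudnn.deterministic = True  # trade speed for reproducibility
    torch.backends.cudnn.benchmark = False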
Example #5
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       scheduler,
                       loss_fn,
                       metrics,
                       params,
                       exp_dir,
                       args,
                       summ_maker=None):
    """Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object
            that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that
            fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        scheduler: (torch.optim.lr_scheduler.ExponentialLR) The exponential
            learning rate scheduler.
        loss_fn: a function that takes batch_output and batch_labels and
            computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using
            the output and labels of each batch
        params: (Params) hyperparameters
        exp_dir: (string) directory containing the parameters, weights and
            logs for the current experiment. The full path.
        args: The parser object containing the user informed arguments
        summ_maker: The SummaryMaker object that writes the training information
        to a tensorboard-readable file.
    """
    # reload weights from restore_file if specified
    # TODO load and set best validation error
    if args.restore_file is not None:
        restore_path = join(exp_dir, (args.restore_file + '.pth.tar'))
        logging.info("Restoring parameters from {}".format(restore_path))
        train_utils.load_checkpoint(restore_path, model)

    # best_val_c_error = float("inf")
    best_val_auc = 0
    # Before starting the first epoch do the eval
    logging.info('Pretraining evaluation...')
    # Epoch 0 is the validation epoch before the learning starts.
    summ_maker.epoch = 0
    val_metrics = evaluate(model,
                           loss_fn,
                           val_dataloader,
                           metrics,
                           params,
                           args,
                           summ_maker=summ_maker)

    for epoch in range(params.num_epochs):
        # The first training epoch is numbered 1, not 0 (epoch 0 is the pretraining eval).
        summ_maker.epoch = epoch + 1
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model,
              optimizer,
              loss_fn,
              train_dataloader,
              metrics,
              params,
              summ_maker=summ_maker)

        # Update the Learning rate
        scheduler.step()

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model,
                               loss_fn,
                               val_dataloader,
                               metrics,
                               params,
                               args,
                               summ_maker=summ_maker)

        val_auc = val_metrics['AUC']
        is_best = val_auc >= best_val_auc

        # Save weights
        train_utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=exp_dir)

        # If this is the best evaluation so far, update the best metrics.
        if is_best:
            logging.info("- Found new best auc")
            best_val_auc = val_auc

            # Save best val metrics in a json file in the model directory
            best_json_path = join(exp_dir, "metrics_val_best_weights.json")
            train_utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = join(exp_dir, "metrics_val_last_weights.json")
        train_utils.save_dict_to_json(val_metrics, last_json_path)
def main():

    # Init random seeds.
    random.seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)
    torch.manual_seed(config.RANDOM_SEED)
    torch.cuda.manual_seed(config.RANDOM_SEED)

    # Setup Tensorboard.
    print('\nsaving run in .. {}'.format(config.TRAINED_MODELS_DIR))
    if not os.path.exists(config.TRAINED_MODELS_DIR):
        os.makedirs(config.TRAINED_MODELS_DIR)
    writer = SummaryWriter(f'{config.TRAINED_MODELS_DIR}')

    # Load the Model.
    print()
    model = affnet.ResNetAffNet(pretrained=config.IS_PRETRAINED,
                                num_classes=config.NUM_CLASSES)
    model.to(config.DEVICE)
    torch.cuda.empty_cache()

    # # TODO: Freeze the backbone.
    # model = model_utils.freeze_backbone(model, verbose=True)

    # TODO: Load saved weights.
    print(
        f"\nrestoring pre-trained AffNet weights: {config.RESTORE_SYN_ARL_AFFNET_WEIGHTS} .. "
    )
    checkpoint = torch.load(config.RESTORE_SYN_ARL_AFFNET_WEIGHTS,
                            map_location=config.DEVICE)
    model.load_state_dict(checkpoint["model"])
    model.to(config.DEVICE)

    # Load the dataset.
    train_loader, val_loader, test_loader = arl_affpose_dataset_loaders.load_arl_affpose_train_datasets()

    # Construct an optimizer.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=config.LEARNING_RATE,
                                weight_decay=config.WEIGHT_DECAY,
                                momentum=config.MOMENTUM)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=config.GAMMA)
    # # TODO: Load saved weights.
    # optimizer.load_state_dict(checkpoint["optimizer"])

    # Main training loop.
    num_epochs = config.NUM_EPOCHS
    best_Fwb, best_mAP = -np.inf, -np.inf

    for epoch in range(0, num_epochs):
        print()

        if epoch < config.EPOCH_TO_TRAIN_FULL_DATASET:
            is_subsample = True
        else:
            is_subsample = False

        # train & val for one epoch
        model, optimizer = train_utils.train_one_epoch(
            model,
            optimizer,
            train_loader,
            config.DEVICE,
            epoch,
            writer,
            is_subsample=is_subsample)
        model, optimizer = train_utils.val_one_epoch(model,
                                                     optimizer,
                                                     val_loader,
                                                     config.DEVICE,
                                                     epoch,
                                                     writer,
                                                     is_subsample=is_subsample)
        # update learning rate.
        lr_scheduler.step()

        # Evaluate Fwb and mAP on the test set.
        model, mAP, Fwb = eval_utils.affnet_eval_arl_affpose(model, test_loader)
        writer.add_scalar('eval/Fwb', Fwb, int(epoch))
        if Fwb > best_Fwb:
            best_Fwb = Fwb
            writer.add_scalar('eval/Best_Fwb', best_Fwb, int(epoch))
            checkpoint_path = config.BEST_MODEL_SAVE_PATH
            train_utils.save_checkpoint(model, optimizer, epoch,
                                        checkpoint_path)
            print("Saving best model .. best Fwb={:.5f} ..".format(best_Fwb))
        # eval mAP
        writer.add_scalar('eval/mAP', mAP, int(epoch))
        if mAP > best_mAP:
            best_mAP = mAP
            writer.add_scalar('eval/Best_mAP', best_mAP, int(epoch))

        # Save a checkpoint for this epoch.
        checkpoint_path = config.MODEL_SAVE_PATH + 'affnet_epoch_' + str(epoch) + '.pth'
        train_utils.save_checkpoint(model, optimizer, epoch, checkpoint_path)
Example #7
def main_worker(gpu, ngpus_per_node, args):
    """
    :param gpu: current gpu id
    :param ngpus_per_node: number of gpus in one node
    :param args: config parameter
    :return:
    init training setup and iteratively training
    """
    params = vars(args)
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    print("=> creating model '{}'".format(args.arch))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # init model
    model = CLSA(models.__dict__[args.arch], args, args.moco_dim, args.moco_k,
                 args.moco_m, args.moco_t, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)

            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
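            # e.g. a global batch size of 256 on 8 GPUs becomes 32 per process
            # (illustrative numbers, not values taken from args).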
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following raise to debug on a single GPU
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit()

    cudnn.benchmark = True
    # config data loader
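    # The mean/std below are the standard ImageNet normalization statistics.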
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    fix_transform = Multi_Fixtransform(args.size_crops, args.nmb_crops,
                                       args.min_scale_crops,
                                       args.max_scale_crops, normalize,
                                       args.aug_times)
    traindir = os.path.join(args.data, 'train')
    train_dataset = datasets.ImageFolder(traindir, fix_transform)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    save_path = init_log_path(args)  # configure the model save path and log path
    log_path = os.path.join(save_path, "train.log")
    best_Acc = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        acc1 = train(train_loader, model, criterion, optimizer, epoch, args,
                     log_path)
        is_best = acc1 > best_Acc
        best_Acc = max(best_Acc, acc1)
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_dict = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'best_acc': best_Acc,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }

            if epoch % 10 == 9:
                tmp_save_path = os.path.join(
                    save_path, 'checkpoint_{:04d}.pth.tar'.format(epoch))
                save_checkpoint(save_dict,
                                is_best=False,
                                filename=tmp_save_path)
            tmp_save_path = os.path.join(save_path, 'checkpoint_best.pth.tar')
            save_checkpoint(save_dict, is_best=is_best, filename=tmp_save_path)