Example 1
    def __init__(self, unsupervised_path):
        super().__init__()
        self.save_hyperparameters()

        self.feat_ext = LinearClassifier(network='s3d',
                                         num_class=101,
                                         dropout=0.9,
                                         use_dropout=True,
                                         use_final_bn=False,
                                         use_l2_norm=False)
        checkpoint = torch.load(unsupervised_path)
        state_dict = checkpoint['state_dict']
        # Remap MoCo-style keys ('encoder_q.0.' prefix) to the classifier's
        # 'backbone.' prefix before loading non-strictly.
        new_dict = {}
        for k, v in state_dict.items():
            k = k.replace('encoder_q.0.', 'backbone.')
            new_dict[k] = v
        state_dict = new_dict
        neq_load_customized(self.feat_ext, state_dict, verbose=False)
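All of the examples on this page delegate weight loading to neq_load_customized, whose implementation is not reproduced here. As a reading aid, here is a minimal sketch of what such a non-strict loader typically does: copy only the checkpoint tensors whose name and shape match the target model and report the rest. The function name and behaviour below are an approximation, not the project's actual code.

def neq_load_customized_sketch(model, pretrained_dict, verbose=True):
    # Approximation only: keep checkpoint tensors whose key exists in the model
    # with the same shape; everything else keeps its current initialisation.
    model_dict = model.state_dict()
    matched = {k: v for k, v in pretrained_dict.items()
               if k in model_dict and v.shape == model_dict[k].shape}
    if verbose:
        skipped = [k for k in pretrained_dict if k not in matched]
        print('Loaded %d/%d tensors, skipped %d' % (len(matched), len(pretrained_dict), len(skipped)))
    model_dict.update(matched)
    model.load_state_dict(model_dict)
    return model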
Example 2
File: main.py Project: surisdi/DPC
def main():
    args = get_args()

    # Fix randomness
    seed = args.seed
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # ---------------------------- Prepare model ----------------------------- #
    if args.local_rank <= 0:
        print_r(args, 'Preparing model')

    model = models.Model(args)
    model = model.to(args.device)

    params = model.parameters()
    optimizer = geoopt.optim.RiemannianAdam(params,
                                            lr=args.lr,
                                            weight_decay=args.wd,
                                            stabilize=10)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[80, 150],
                                                     gamma=0.1)

    best_acc = 0
    iteration = 0

    # --- restart training --- #
    if args.resume:
        if os.path.isfile(args.resume):
            print_r(args, f"=> loading resumed checkpoint '{args.resume}'")
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            scheduler.load_state_dict(checkpoint['scheduler'])
            if not args.reset_lr:  # if lr was not reset, load the old optimizer state
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print_r(
                    args,
                    f'==== Restart optimizer with a learning rate {args.lr} ===='
                )
            print_r(
                args,
                f"=> loaded resumed checkpoint '{args.resume}' (epoch {checkpoint['epoch']})"
            )
        else:
            print_r(args,
                    f"[Warning] no checkpoint found at '{args.resume}'",
                    print_no_verbose=True)

    elif args.pretrain:  # resume overwrites this
        if os.path.isfile(args.pretrain):
            print_r(args,
                    f"=> loading pretrained checkpoint '{args.pretrain}'")
            checkpoint = torch.load(args.pretrain,
                                    map_location=torch.device('cpu'))
            model = neq_load_customized(args,
                                        model,
                                        checkpoint['state_dict'],
                                        parts='all',
                                        size_diff=args.final_2dim
                                        or args.feature_dim != -1)
            print_r(
                args,
                f"=> loaded pretrained checkpoint '{args.pretrain}' (epoch {checkpoint['epoch']})"
            )
        else:
            print_r(args,
                    f"=> no checkpoint found at '{args.pretrain}'",
                    print_no_verbose=True)

        if args.only_train_linear:
            for name, param in model.named_parameters():  # deleted 'module'
                if 'network_class' not in name:
                    param.requires_grad = False
        print_r(
            args,
            '\n==== parameter names and whether they require gradient ====\n')
        for name, param in model.named_parameters():
            print_r(args, (name, param.requires_grad))
        print_r(args, '\n==== start dataloading ====\n')

    if args.local_rank != -1:
        from torch.nn.parallel import DistributedDataParallel as DDP
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
            model) if not args.not_track_running_stats else model
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank)
        args.parallel = 'ddp'
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
        args.parallel = 'dp'
    else:
        args.parallel = 'none'

    # ---------------------------- Prepare dataset ----------------------------- #
    splits = ['train', 'val', 'test']
    loaders = {
        split: datasets.get_data(args,
                                 split,
                                 return_label=args.use_labels,
                                 hierarchical_label=args.hierarchical_labels,
                                 action_level_gt=args.action_level_gt,
                                 num_workers=args.num_workers,
                                 path_dataset=args.path_dataset)
        for split in splits
    }

    # setup tools
    img_path, model_path = set_path(args)
    writer_val = SummaryWriter(
        log_dir=os.path.join(img_path, 'val') if not args.debug else '/tmp'
    ) if args.local_rank <= 0 else None
    writer_train = SummaryWriter(
        log_dir=os.path.join(img_path, 'train') if not args.debug else '/tmp'
    ) if args.local_rank <= 0 else None

    # ---------------------------- Prepare trainer and run ----------------------------- #
    if args.local_rank <= 0:
        print_r(args, 'Preparing trainer')
    trainer = Trainer(args, model, optimizer, loaders, iteration, best_acc,
                      writer_train, writer_val, img_path, model_path,
                      scheduler)

    if args.test:
        trainer.test()
    else:
        trainer.train()
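Example 2 builds its optimizer with geoopt.optim.RiemannianAdam rather than torch.optim.Adam. For readers unfamiliar with geoopt: the optimizer applies Riemannian updates to geoopt.ManifoldParameter tensors, falls back to ordinary Adam behaviour for plain parameters, and stabilize=10 re-projects manifold parameters onto their manifold every 10 steps. A self-contained toy usage, not taken from the project:

import torch
import geoopt

ball = geoopt.PoincareBall(c=1.0)                          # hyperbolic manifold
embeddings = geoopt.ManifoldParameter(torch.zeros(10, 2), manifold=ball)
linear = torch.nn.Linear(2, 2)                             # ordinary Euclidean parameters

optimizer = geoopt.optim.RiemannianAdam(
    [embeddings, *linear.parameters()],
    lr=1e-3, weight_decay=1e-5, stabilize=10)

loss = (linear(embeddings) ** 2).mean()
loss.backward()
optimizer.step()   # Riemannian step for `embeddings`, Adam step for `linear`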
Example 3
def main_worker(gpu, ngpus_per_node, args):
    best_acc = 0
    args.gpu = gpu

    if args.distributed:
        if args.local_rank != -1:
            args.rank = args.local_rank
            args.gpu = args.local_rank
        elif 'SLURM_PROCID' in os.environ: # slurm scheduler
            args.rank = int(os.environ['SLURM_PROCID'])
            args.gpu = args.rank % torch.cuda.device_count()
        elif args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
            if args.multiprocessing_distributed:
                # For multiprocessing distributed training, rank needs to be the
                # global rank among all the processes
                args.rank = args.rank * ngpus_per_node + gpu
        
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    args.print = args.gpu == 0
    # suppress printing if not master
    if (args.multiprocessing_distributed and args.gpu != 0) or\
       (args.local_rank != -1 and args.gpu != 0) or\
       ('SLURM_PROCID' in os.environ and args.rank!=0):
        def print_pass(*args):
            pass
        builtins.print = print_pass

    ### model ###
    print("=> creating {} model with '{}' backbone".format(args.model, args.net))
    if args.model == 'coclr':
        model = CoCLR(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t, topk=args.topk, reverse=args.reverse)
        if args.reverse:
            print('[Warning] using RGB-Mining to help flow')
        else:
            print('[Warning] using Flow-Mining to help RGB')
    else:
        raise NotImplementedError
    args.num_seq = 2
    print('Re-write num_seq to %d' % args.num_seq)
        
    args.img_path, args.model_path, args.exp_path = set_path(args)

    # print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            model_without_ddp = model.module
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
            model_without_ddp = model.module
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")


    ### optimizer ###
    params = []
    for name, param in model.named_parameters():
        params.append({'params': param})

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        print(name, param.requires_grad)
    print('=================================\n')

    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)

    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    args.iteration = 1

    ### data ###  
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, 'train', args), 'train', args)
    transform_train_cuda = transforms.Compose([
                T.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225], channel=1)])
    n_data = len(train_loader.dataset)

    print('===================================')

    lr_scheduler = None

    ### restart training ### 
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']+1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try: model_without_ddp.load_state_dict(state_dict)
            except: 
                print('[WARNING] Non-Equal load for resuming training!')
                neq_load_customized(model_without_ddp, state_dict, verbose=True)

            print("=> load resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
            try: optimizer.load_state_dict(checkpoint['optimizer'])
            except: print('[WARNING] Not loading optimizer states')
        else:
            print("[Warning] no checkpoint found at '{}', use random init".format(args.resume))

    elif args.pretrain != ['random', 'random']:
        # first path: weights to be trained
        # second path: weights as the oracle, not trained
        if os.path.isfile(args.pretrain[1]): # second network --> load as sampler
            checkpoint = torch.load(args.pretrain[1], map_location=torch.device('cpu'))
            second_dict = checkpoint['state_dict']
            new_dict = {}
            for k,v in second_dict.items(): # only take the encoder_q
                if 'encoder_q.' in k:
                    k = k.replace('encoder_q.', 'sampler.')
                    new_dict[k] = v
            second_dict = new_dict

            new_dict = {} # remove queue, queue_ptr
            for k, v in second_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v 
            second_dict = new_dict
            print("=> Use Oracle checkpoint '{}' (epoch {})".format(args.pretrain[1], checkpoint['epoch']))
        else:
            print("=> NO Oracle checkpoint found at '{}', use random init".format(args.pretrain[1]))
            second_dict = {}

        if os.path.isfile(args.pretrain[0]): # first network --> load both encoder q & k
            checkpoint = torch.load(args.pretrain[0], map_location=torch.device('cpu'))
            first_dict = checkpoint['state_dict']

            new_dict = {} # remove queue, queue_ptr
            for k, v in first_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v 
            first_dict = new_dict

            # update both q and k with q
            new_dict = {}
            for k,v in first_dict.items(): # only take the encoder_q
                if 'encoder_q.' in k:
                    new_dict[k] = v
                    k = k.replace('encoder_q.', 'encoder_k.')
                    new_dict[k] = v
            first_dict = new_dict
            
            print("=> Use Training checkpoint '{}' (epoch {})".format(args.pretrain[0], checkpoint['epoch']))
        else:
            print("=> NO Training checkpoint found at '{}', use random init".format(args.pretrain[0]))
            first_dict = {}

        state_dict = {**first_dict, **second_dict}
        try:
            del state_dict['queue_label'] # always re-fill the queue
        except:
            pass 
        neq_load_customized(model_without_ddp, state_dict, verbose=True)

    else:
        print("=> train from scratch")

    torch.backends.cudnn.benchmark = True

    # tensorboard plot tools
    writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    args.train_plotter = TB.PlotterThread(writer_train)
    
    ### main loop ###    
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        _, train_acc = train_one_epoch(train_loader, model, criterion, optimizer, transform_train_cuda, epoch, args)
        if (epoch % args.save_freq == 0) or (epoch == args.epochs - 1):         
            # save check_point on rank==0 worker
            if (not args.multiprocessing_distributed and args.rank == 0) \
                or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
                is_best = train_acc > best_acc
                best_acc = max(train_acc, best_acc)
                state_dict = model_without_ddp.state_dict()
                save_dict = {
                    'epoch': epoch,
                    'state_dict': state_dict,
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'iteration': args.iteration}
                save_checkpoint(save_dict, is_best, gap=args.save_freq, 
                    filename=os.path.join(args.model_path, 'epoch%d.pth.tar' % epoch), 
                    keep_all='k400' in args.dataset)
    
    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
    sys.exit(0)
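save_checkpoint is another project helper that these examples call but do not define. Judging from the call sites, it writes save_dict to filename and, when is_best is true, also keeps a copy as the best model, while gap and keep_all control whether older per-epoch files are retained. A rough sketch under those assumptions; the names and cleanup policy below are guesses, not the project's code:

import os
import shutil
import torch

def save_checkpoint_sketch(state, is_best, gap=1, filename='checkpoint.pth.tar', keep_all=False):
    # Hypothetical approximation of the save_checkpoint helper used above.
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename,
                        os.path.join(os.path.dirname(filename), 'model_best.pth.tar'))
    if not keep_all:
        # drop the file written `gap` epochs earlier so only the latest is kept
        previous = os.path.join(os.path.dirname(filename),
                                'epoch%d.pth.tar' % (state['epoch'] - gap))
        if previous != filename and os.path.isfile(previous):
            os.remove(previous)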
Example 4
def main(args):
    if args.gpu is None:
        args.gpu = str(os.environ["CUDA_VISIBLE_DEVICES"])
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    device = torch.device('cuda')

    best_acc = 0
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    num_gpu = len(str(args.gpu).split(','))
    args.batch_size = num_gpu * args.batch_size
    print('=> Effective BatchSize = %d' % args.batch_size)
    args.img_path, args.model_path, args.exp_path = set_path(args)

    ### classifier model ###
    num_class_dict = {
        'ucf101': 101,
        'hmdb51': 51,
        'k400': 400,
        'ucf101-f': 101,
        'hmdb51-f': 51,
        'k400-f': 400
    }
    args.num_class = num_class_dict[args.dataset]

    if args.train_what == 'last':  # for linear probe
        args.final_bn = True
        args.final_norm = True
        args.use_dropout = False
    else:  # for training the entire network
        args.final_bn = False
        args.final_norm = False
        args.use_dropout = True

    if args.model == 'lincls':
        model = LinearClassifier(network=args.net,
                                 num_class=args.num_class,
                                 dropout=args.dropout,
                                 use_dropout=args.use_dropout,
                                 use_final_bn=args.final_bn,
                                 use_l2_norm=args.final_norm)
    else:
        raise NotImplementedError

    model.to(device)

    ### optimizer ###
    if args.train_what == 'last':
        print('=> [optimizer] only train last layer')
        params = []
        for name, param in model.named_parameters():
            if 'backbone' in name:
                param.requires_grad = False
            else:
                params.append({'params': param})

    elif args.train_what == 'ft':
        print('=> [optimizer] finetune backbone with smaller lr')
        params = []
        for name, param in model.named_parameters():
            if 'backbone' in name:
                params.append({'params': param, 'lr': args.lr / 10})
            else:
                params.append({'params': param})

    else:  # train all
        params = []
        print('=> [optimizer] train all layer')
        for name, param in model.named_parameters():
            params.append({'params': param})

    if args.train_what == 'last':
        print('\n===========Check Grad============')
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(name, param.requires_grad)
        print('=================================\n')

    if args.optim == 'adam':
        optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(params,
                              lr=args.lr,
                              weight_decay=args.wd,
                              momentum=0.9)
    else:
        raise NotImplementedError

    model = torch.nn.DataParallel(model)
    model_without_dp = model.module

    ce_loss = nn.CrossEntropyLoss()
    args.iteration = 1

    ### test: higher priority ###
    if args.test:
        if os.path.isfile(args.test):
            print("=> loading testing checkpoint '{}'".format(args.test))
            checkpoint = torch.load(args.test,
                                    map_location=torch.device('cpu'))
            epoch = checkpoint['epoch']
            state_dict = checkpoint['state_dict']

            if args.retrieval_ucf or args.retrieval_full:  # if directly test on pretrained network
                new_dict = {}
                for k, v in state_dict.items():
                    k = k.replace('encoder_q.0.', 'backbone.')
                    new_dict[k] = v
                state_dict = new_dict

            try:
                model_without_dp.load_state_dict(state_dict)
            except:
                neq_load_customized(model_without_dp, state_dict, verbose=True)

        else:
            print("[Warning] no checkpoint found at '{}'".format(args.test))
            epoch = 0
            print("[Warning] if test random init weights, press c to continue")
            import ipdb
            ipdb.set_trace()

        args.logger = Logger(path=os.path.dirname(args.test))
        args.logger.log('args=\n\t\t' + '\n\t\t'.join(
            ['%s:%s' % (str(k), str(v)) for k, v in vars(args).items()]))

        transform_test_cuda = transforms.Compose([
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225],
                        channel=1)
        ])

        if args.retrieval:
            test_retrieval(model, ce_loss, transform_test_cuda, device, epoch,
                           args)
        elif args.center_crop or args.five_crop or args.ten_crop:
            transform = get_transform('test', args)
            test_dataset = get_data(transform, 'test', args)
            test_10crop(test_dataset, model, ce_loss, transform_test_cuda,
                        device, epoch, args)
        else:
            raise NotImplementedError

        sys.exit(0)

    ### data ###
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, 'train', args),
                                  'train', args)
    transform_val = get_transform('val', args)
    val_loader = get_dataloader(get_data(transform_val, 'val', args), 'val',
                                args)

    transform_train_cuda = transforms.Compose([
        T.RandomHorizontalFlip(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                    channel=1)
    ])  # ImageNet
    transform_val_cuda = transforms.Compose([
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                    channel=1)
    ])  # ImageNet

    print('===================================')

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch'] + 1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try:
                model_without_dp.load_state_dict(state_dict)
            except:
                print('[WARNING] resuming training with different weights')
                neq_load_customized(model_without_dp, state_dict, verbose=True)
            print("=> load resumed checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

            try:
                optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print(
                    '[WARNING] failed to load optimizer state, initialize optimizer'
                )
        else:
            print("[Warning] no checkpoint found at '{}', use random init".
                  format(args.resume))

    elif args.pretrain:
        if os.path.isfile(args.pretrain):
            checkpoint = torch.load(args.pretrain, map_location='cpu')
            state_dict = checkpoint['state_dict']

            new_dict = {}
            for k, v in state_dict.items():
                k = k.replace('encoder_q.0.', 'backbone.')
                new_dict[k] = v
            state_dict = new_dict

            try:
                model_without_dp.load_state_dict(state_dict)
            except:
                neq_load_customized(model_without_dp, state_dict, verbose=True)
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(
                args.pretrain, checkpoint['epoch']))
        else:
            print("[Warning] no checkpoint found at '{}', use random init".
                  format(args.pretrain))

    else:
        print("=> train from scratch")

    torch.backends.cudnn.benchmark = True

    # plot tools
    writer_val = SummaryWriter(logdir=os.path.join(args.img_path, 'val'))
    writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    args.val_plotter = TB.PlotterThread(writer_val)
    args.train_plotter = TB.PlotterThread(writer_train)

    args.logger = Logger(path=args.img_path)
    args.logger.log('args=\n\t\t' + '\n\t\t'.join(
        ['%s:%s' % (str(k), str(v)) for k, v in vars(args).items()]))

    # main loop
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)

        adjust_learning_rate(optimizer, epoch, args)

        train_one_epoch(train_loader, model, ce_loss, optimizer,
                        transform_train_cuda, device, epoch, args)

        if epoch % args.eval_freq == 0:
            _, val_acc = validate(val_loader, model, ce_loss,
                                  transform_val_cuda, device, epoch, args)

            # save check_point
            is_best = val_acc > best_acc
            best_acc = max(val_acc, best_acc)
            state_dict = model_without_dp.state_dict()
            save_dict = {
                'epoch': epoch,
                'state_dict': state_dict,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'iteration': args.iteration
            }
            save_checkpoint(save_dict,
                            is_best,
                            1,
                            filename=os.path.join(args.model_path,
                                                  'epoch%d.pth.tar' % epoch),
                            keep_all=False)

    print('Training from ep %d to ep %d finished' %
          (args.start_epoch, args.epochs))
    sys.exit(0)
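Examples 3 and 4 above (and the CoCLR examples that follow) adjust the learning rate with a project helper, adjust_learning_rate(optimizer, epoch, args), rather than a torch scheduler. Its implementation is not included here; a common pattern matching this call signature is a step decay at the milestones in args.schedule. Treat the sketch below as an illustrative assumption, not the project's exact schedule (a real implementation would typically scale each parameter group's own base rate rather than overwrite it):

def adjust_learning_rate_sketch(optimizer, epoch, args):
    # Hypothetical step decay: multiply args.lr by 0.1 for each milestone already passed.
    lr = args.lr
    for milestone in (getattr(args, 'schedule', None) or []):
        if epoch >= milestone:
            lr *= 0.1
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr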
Example 5
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    if args.local_rank != -1: 
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
    best_acc = 0

    args.print = args.gpu == 0
    # suppress printing if not master
    if (args.multiprocessing_distributed and args.gpu != 0) or\
       (args.local_rank != -1 and args.gpu != 0):
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        if args.local_rank != -1:
            args.rank = args.local_rank
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    ### model ###
    print("=> creating {} model with '{}' backbone".format(args.model, args.net))
    if args.model == 'infonce':
        model = InfoNCE(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t)
    elif args.model == 'ubernce':
        model = UberNCE(args.net, args.moco_dim, args.moco_k, args.moco_m, args.moco_t)
    
    args.num_seq = 2
    print('Re-write num_seq to %d' % args.num_seq)

    args.img_path, args.model_path, args.exp_path = set_path(args)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            model_without_ddp = model.module
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
            model_without_ddp = model.module
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")


    ### optimizer ###
    params = []
    if args.train_what == 'all':
        for name, param in model.named_parameters():
            params.append({'params': param})
    else:
        raise NotImplementedError

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        if not param.requires_grad:
            print(name, param.requires_grad)
    print('=================================\n')

    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    args.iteration = 1

    ### data ###  
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, 'train', args), 'train', args)
    transform_train_cuda = transforms.Compose([
                T.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225], channel=1)])
    n_data = len(train_loader.dataset)

    print('===================================')

    lr_scheduler = None  # defined here so the train_one_epoch call below is valid

    ### restart training ### 
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']+1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try: model_without_ddp.load_state_dict(state_dict)
            except: 
                print('[WARNING] resuming training with different weights')
                neq_load_customized(model_without_ddp, state_dict, verbose=True)

            print("=> load resumed checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
            try: optimizer.load_state_dict(checkpoint['optimizer'])
            except: print('[WARNING] failed to load optimizer state, initialize optimizer')
        else:
            print("[Warning] no checkpoint found at '{}', use random init".format(args.resume))
    
    elif args.pretrain:
        if os.path.isfile(args.pretrain):
            checkpoint = torch.load(args.pretrain, map_location=torch.device('cpu'))
            state_dict = checkpoint['state_dict']
                
            try: model_without_ddp.load_state_dict(state_dict)
            except: neq_load_customized(model_without_ddp, state_dict, verbose=True)
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(args.pretrain, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}', use random init".format(args.pretrain))
    
    else:
        print("=> train from scratch")

    torch.backends.cudnn.benchmark = True

    # tensorboard plot tools
    writer_train = SummaryWriter(logdir=os.path.join(args.img_path, 'train'))
    args.train_plotter = TB.PlotterThread(writer_train)

    ### main loop ###    
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        _, train_acc = train_one_epoch(train_loader, model, criterion, optimizer, lr_scheduler, transform_train_cuda, epoch, args)
        
        if (epoch % args.save_freq == 0) or (epoch == args.epochs - 1): 
            # save check_point on rank==0 worker
            if (not args.multiprocessing_distributed and args.rank == 0) \
                or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
                is_best = train_acc > best_acc
                best_acc = max(train_acc, best_acc)
                state_dict = model_without_ddp.state_dict()
                save_dict = {
                    'epoch': epoch,
                    'state_dict': state_dict,
                    'best_acc': best_acc,
                    'optimizer': optimizer.state_dict(),
                    'iteration': args.iteration}
                save_checkpoint(save_dict, is_best, gap=args.save_freq, 
                    filename=os.path.join(args.model_path, 'epoch%d.pth.tar' % epoch), 
                    keep_all='k400' in args.dataset)
    
    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
    sys.exit(0)
Example 6
def main(args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    device = torch.device('cuda')
    num_gpu = len(str(args.gpu).split(','))
    args.batch_size = num_gpu * args.batch_size

    if args.dataset == 'ucf101': args.num_class = 101
    elif args.dataset == 'hmdb51': args.num_class = 51

    ### classifier model ###
    if args.model == 'lc':
        model = LC(sample_size=args.img_dim,
                   num_seq=args.num_seq,
                   seq_len=args.seq_len,
                   network=args.net,
                   num_class=args.num_class,
                   dropout=args.dropout,
                   train_what=args.train_what)
    else:
        raise ValueError('wrong model!')

    model.to(device)
    model = nn.DataParallel(model)
    model_without_dp = model.module
    criterion = nn.CrossEntropyLoss()

    ### optimizer ###
    params = None
    if args.train_what == 'ft':
        print('=> finetune backbone with smaller lr')
        params = []
        for name, param in model.module.named_parameters():
            if ('resnet' in name) or ('rnn' in name):
                params.append({'params': param, 'lr': args.lr / 10})
            else:
                params.append({'params': param})
    elif args.train_what == 'last':
        print('=> train only last layer')
        params = []
        for name, param in model.named_parameters():
            if ('bone' in name) or ('agg' in name) or ('mb' in name) or (
                    'network_pred' in name):
                param.requires_grad = False
            else:
                params.append({'params': param})
    else:
        pass  # train all layers

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        print(name, param.requires_grad)
    print('=================================\n')

    if params is None: params = model.parameters()
    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)

    ### scheduler ###
    if args.dataset == 'hmdb51':
        step = args.schedule
        if step == []: step = [150, 250]
        lr_lambda = lambda ep: MultiStepLR_Restart_Multiplier(
            ep, gamma=0.1, step=step, repeat=1)
    elif args.dataset == 'ucf101':
        step = args.schedule
        if step == []: step = [300, 400]
        lr_lambda = lambda ep: MultiStepLR_Restart_Multiplier(
            ep, gamma=0.1, step=step, repeat=1)
    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
    print('=> Using scheduler at {} epochs'.format(step))

    args.old_lr = None
    best_acc = 0
    args.iteration = 1

    ### if in test mode ###
    if args.test:
        if os.path.isfile(args.test):
            print("=> loading test checkpoint '{}'".format(args.test))
            checkpoint = torch.load(args.test,
                                    map_location=torch.device('cpu'))
            try:
                model_without_dp.load_state_dict(checkpoint['state_dict'])
            except:
                print(
                    '=> [Warning]: weight structure is not equal to test model; Load anyway =='
                )
                model_without_dp = neq_load_customized(
                    model_without_dp, checkpoint['state_dict'])
            epoch = checkpoint['epoch']
            print("=> loaded testing checkpoint '{}' (epoch {})".format(
                args.test, checkpoint['epoch']))
        elif args.test == 'random':
            epoch = 0
            print("=> loaded random weights")
        else:
            print("=> no checkpoint found at '{}'".format(args.test))
            sys.exit(0)

        args.logger = Logger(path=os.path.dirname(args.test))
        _, test_dataset = get_data(None, 'test')
        test_loss, test_acc = test(test_dataset, model, criterion, device,
                                   epoch, args)
        sys.exit()

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading resumed checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            model_without_dp.load_state_dict(checkpoint['state_dict'])
            try:
                optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print('[WARNING] Not loading optimizer states')
            print("=> loaded resumed checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            sys.exit(0)

    if (not args.resume) and args.pretrain:
        if args.pretrain == 'random':
            print('=> using random weights')
        elif os.path.isfile(args.pretrain):
            print("=> loading pretrained checkpoint '{}'".format(
                args.pretrain))
            checkpoint = torch.load(args.pretrain,
                                    map_location=torch.device('cpu'))
            model_without_dp = neq_load_customized(model_without_dp,
                                                   checkpoint['state_dict'])
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(
                args.pretrain, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrain))
            sys.exit(0)

    ### data ###
    transform = transforms.Compose([
        A.RandomSizedCrop(consistent=True, size=224, p=1.0),
        A.Scale(size=(args.img_dim, args.img_dim)),
        A.RandomHorizontalFlip(consistent=True),
        A.ColorJitter(brightness=0.5,
                      contrast=0.5,
                      saturation=0.5,
                      hue=0.25,
                      p=0.3,
                      consistent=True),
        A.ToTensor(),
        A.Normalize()
    ])
    val_transform = transforms.Compose([
        A.RandomSizedCrop(consistent=True, size=224, p=0.3),
        A.Scale(size=(args.img_dim, args.img_dim)),
        A.RandomHorizontalFlip(consistent=True),
        A.ColorJitter(brightness=0.2,
                      contrast=0.2,
                      saturation=0.2,
                      hue=0.1,
                      p=0.3,
                      consistent=True),
        A.ToTensor(),
        A.Normalize()
    ])

    train_loader, _ = get_data(transform, 'train')
    val_loader, _ = get_data(val_transform, 'val')

    # setup tools
    args.img_path, args.model_path = set_path(args)
    args.writer_val = SummaryWriter(logdir=os.path.join(args.img_path, 'val'))
    args.writer_train = SummaryWriter(
        logdir=os.path.join(args.img_path, 'train'))
    torch.backends.cudnn.benchmark = True

    ### main loop ###
    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_acc = train_one_epoch(train_loader, model, criterion,
                                                optimizer, device, epoch, args)
        val_loss, val_acc = validate(val_loader, model, criterion, device,
                                     epoch, args)
        lr_scheduler.step(epoch)

        # save check_point
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)
        save_dict = {
            'epoch': epoch,
            'backbone': args.net,
            'state_dict': model_without_dp.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
            'iteration': args.iteration
        }
        save_checkpoint(save_dict,
                        is_best,
                        filename=os.path.join(args.model_path,
                                              'epoch%s.pth.tar' % str(epoch)),
                        keep_all=False)

    print('Training from ep %d to ep %d finished' %
          (args.start_epoch, args.epochs))
    sys.exit(0)
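Example 6 drives optim.lr_scheduler.LambdaLR with MultiStepLR_Restart_Multiplier, which must return a multiplicative factor for the base learning rate at each epoch. The helper itself is not shown; a minimal version consistent with the arguments used above (gamma=0.1, a milestone list step, repeat=1) counts how many milestones have passed, restarting the count every max(step) epochs when repeat > 1. This is a sketch of the idea, not the original helper:

def multistep_restart_multiplier_sketch(epoch, gamma=0.1, step=(150, 250), repeat=1):
    # Hypothetical LambdaLR multiplier: gamma ** (milestones passed in the current cycle).
    cycle_len = max(step)
    effective_epoch = epoch % cycle_len if repeat > 1 else epoch
    passed = sum(effective_epoch >= s for s in step)
    return gamma ** passed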
Example 7
def main(args):
    model = CoCLR(args.net,
                  args.moco_dim,
                  args.moco_k,
                  args.moco_m,
                  args.moco_t,
                  topk=args.topk,
                  reverse=args.reverse)
    if args.reverse:
        print('[Warning] using RGB-Mining to help flow')
    else:
        print('[Warning] using Flow-Mining to help RGB')
    args.num_seq = 2

    args.img_path, args.model_path, args.exp_path = set_path(args)
    args.writer_train = SummaryWriter(logdir='runs')

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
    model_without_ddp = model  # no DDP wrapper here; defines the name used in the loading code below
    params = []
    for name, param in model.named_parameters():
        params.append({'params': param})
    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    args.iteration = 1

    ### data ###
    transform_train = get_transform('train', args)
    train_loader = get_dataloader(get_data(transform_train, args), args)
    transform_train_cuda = transforms.Compose([
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                    channel=1)
    ])
    n_data = len(train_loader.dataset)

    best_acc = 0

    ### restart training ###
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] + 1
            args.iteration = checkpoint['iteration']
            best_acc = checkpoint['best_acc']
            state_dict = checkpoint['state_dict']

            try:
                model_without_ddp.load_state_dict(state_dict)
            except:
                print('[WARNING] Non-Equal load for resuming training!')
                neq_load_customized(model_without_ddp,
                                    state_dict,
                                    verbose=True)

            print("=> load resumed checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            optimizer.load_state_dict(checkpoint['optimizer'])

    elif args.pretrain != ['random', 'random']:
        # first path: weights to be trained
        # second path: weights as the oracle, not trained
        first_dict, second_dict = {}, {}  # fall back to empty dicts if a checkpoint path is missing
        if os.path.isfile(
                args.pretrain[1]):  # second network --> load as sampler
            checkpoint = torch.load(args.pretrain[1],
                                    map_location=torch.device('cpu'))
            second_dict = checkpoint['state_dict']
            new_dict = {}
            for k, v in second_dict.items():  # only take the encoder_q
                if 'encoder_q.' in k:
                    k = k.replace('encoder_q.', 'sampler.')
                    new_dict[k] = v
            second_dict = new_dict

            new_dict = {}  # remove queue, queue_ptr
            for k, v in second_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v
            second_dict = new_dict
        if os.path.isfile(
                args.pretrain[0]):  # first network --> load both encoder q & k
            checkpoint = torch.load(args.pretrain[0],
                                    map_location=torch.device('cpu'))
            first_dict = checkpoint['state_dict']

            new_dict = {}  # remove queue, queue_ptr
            for k, v in first_dict.items():
                if 'queue' not in k:
                    new_dict[k] = v
            first_dict = new_dict

            # update both q and k with q
            new_dict = {}
            for k, v in first_dict.items():  # only take the encoder_q
                if 'encoder_q.' in k:
                    new_dict[k] = v
                    k = k.replace('encoder_q.', 'encoder_k.')
                    new_dict[k] = v
            first_dict = new_dict

        state_dict = {**first_dict, **second_dict}
        try:
            del state_dict['queue_label']  # always re-fill the queue
        except:
            pass
        neq_load_customized(model_without_ddp, state_dict, verbose=True)
    torch.backends.cudnn.benchmark = True

    ### main loop ###
    for epoch in range(args.start_epoch, args.epochs):
        np.random.seed(epoch)
        random.seed(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        _, train_acc = train_one_epoch(train_loader, model, criterion,
                                       optimizer, transform_train_cuda, epoch,
                                       args)
        if (epoch % args.save_freq == 0) or (epoch == args.epochs - 1):
            is_best = train_acc > best_acc
            best_acc = max(train_acc, best_acc)
            state_dict = model.state_dict()
            save_dict = {
                'epoch': epoch,
                'state_dict': state_dict,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'iteration': args.iteration
            }
            save_checkpoint(save_dict,
                            is_best,
                            gap=args.save_freq,
                            filename=os.path.join(args.model_path,
                                                  'epoch%d.pth.tar' % epoch),
                            keep_all=False)

    print('Training from ep %d to ep %d finished' %
          (args.start_epoch, args.epochs))