def main():
    """Train (or only evaluate) a ResNeXt-50 on the clothing dataset.

    The function runs the following steps in order:

    1. Parse the command line into the module-level ``args`` and rescale
       weight decay, learning rate and batch size by ``iteration_size``
       (requested GPUs divided by the GPUs actually visible).
    2. Build the model, print its parameter count and the FLOPs of one
       forward pass on a 1x3x224x224 random input.
    3. Wrap the model in ``DataParallel`` on CUDA and create the
       BCE-with-logits loss and the SGD optimizer.
    4. If ``args.resume`` points to a file, initialise the model from it.
    5. Build the train/validation loaders from ``train.txt`` / ``test.txt``
       and their ``-sem`` companions.
    6. Either validate once (``args.evaluate``) or run the
       train / validate / save-checkpoint loop.

    Relies on module-level names not visible in this function: ``parser``,
    ``data_class``, ``MyDataset``, ``add_flops_counting_methods``,
    ``train_multi``, ``validate_multi``, ``coco_adjust_learning_rate`` and
    ``save_checkpoint``.
    """
    global args
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)
    # torch.cuda.device_count() returns the number of available GPUs.
    iteration_size = args.num_gpus // torch.cuda.device_count(
    )  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size
    # When iteration_size is 1 (e.g. a single GPU requested and present)
    # the three values above are left unchanged.
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    # Distributed setup.
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # Create the model.  This is the variant without the "elastic" structure;
    # an elastic variant (resnext50_elastic) was used in earlier experiments.
    print("=> creating model '{}'".format(args.arch))
    from resnext_MulTask_clothes_conv1_split import resnext50
    # Note on classes: there are two tasks, each with its own class count.
    model = resnext50(num_classes=data_class)

    # Count parameters; anything under a '.ups.' sub-module is excluded both
    # from the count and from the optimizer.
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count / 1000000.0, "( 百万)")  # printed in millions

    # Count FLOPs with one forward pass (on CPU, before the model is moved).
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)  # input size used for the count

    model.start_flops_count()
    model(image)[0].sum()  # the model output is indexed; only element 0 is used
    model.stop_flops_count()
    print("GFLOPs",
          model.compute_average_flops_cost() / 1000000000.0,
          '( 十亿)')  # printed in billions

    # Normal (non-distributed) wrapping.
    model = torch.nn.DataParallel(model).cuda()
    # Binary cross-entropy on logits.
    criterion = nn.BCEWithLogitsLoss().cuda()
    # SGD over the parameters collected above.
    optimizer = torch.optim.SGD([
        {
            'params': iter(params),
            'lr': args.lr
        },
    ],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Optionally initialise from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)  # load the checkpoint file

            # Automatic detection of a "true" resume (checkpoint head shape
            # equal to the model head shape) is disabled in this script: the
            # checkpoint is always treated as pretrained weights.
            resume = False
            if resume:
                # True resume: continue training from a previous run.
                print()
                print("resume training on MS-COCO...")
                print("在MS-COCO上 评估...")
                print()
                model.load_state_dict(checkpoint['state_dict'], strict=False)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                else:
                    print('no optimizer found')
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch']
            else:
                # Fake resume: transfer pretrained (ImageNet) weights to the
                # clothing dataset.
                print()
                print("transfer from ImageNet...")
                print("从ImageNet——>服饰数据集 训练...")
                print()

                model_dict = model.state_dict()
                pretrained_dict = {}
                for k, v in checkpoint['state_dict'].items():
                    if k not in model_dict:
                        continue
                    # A same-named tensor with a different shape (typically a
                    # classification head with another class count) would
                    # make load_state_dict raise, so it is left at its
                    # freshly initialised value instead.
                    if v.size() != model_dict[k].size():
                        print(k, 'skipped: shape mismatch with model')
                        continue
                    pretrained_dict[k] = v
                model_dict.update(pretrained_dict)  # overwrite matching entries
                model.load_state_dict(model_dict)

            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume,
                checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Small training speed-up at no extra cost; usually enabled.
    # Only for fixed-size (non multi-scale) training, otherwise it hurts.
    cudnn.benchmark = True

    #################################################################################################
    # Primary label files (the original comment calls these the "literal"
    # labels -- TODO confirm the exact meaning against MyDataset).
    train_txt_path = "train.txt"
    val_txt_path = "test.txt"

    # Implied-meaning ("sem") label files.
    train_sem_txt_path = "train-sem.txt"
    val_sem_txt_path = "test-sem.txt"

    # Per-channel normalisation constants for the clothing dataset.
    normTransform = transforms.Normalize(
        mean=[0.56391764, 0.43714827, 0.4107524],
        std=[0.22986116, 0.21178758, 0.20076773])

    # Training-time transforms.
    trainTransform = transforms.Compose([
        transforms.RandomResizedCrop(224),  # random crop resized to 224
        transforms.RandomHorizontalFlip(),  # random horizontal flip
        transforms.ToTensor(),
        normTransform  # normalisation
    ])
    # Validation-time transforms.
    valTransform = transforms.Compose([
        transforms.Resize((224, 224)),  # resize to the network input size
        transforms.ToTensor(),
        normTransform  # normalisation
    ])

    # Build the MyDataset instances (label files, transform, split tag).
    train_data = MyDataset(txt_path=train_txt_path,
                           txt_sem_path=train_sem_txt_path,
                           transform=trainTransform,
                           sd='训练')
    val_data = MyDataset(txt_path=val_txt_path,
                         txt_sem_path=val_sem_txt_path,
                         transform=valTransform,
                         sd='测试')
    print("---------------------")
    train_sampler = torch.utils.data.sampler.RandomSampler(train_data)  # random sampler
    # Build the data loaders.  The sampler already randomises the order, and
    # DataLoader does not accept shuffle=True together with a sampler.
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    #################################################################################################
    if args.evaluate:
        # Evaluation only: run the validation set once and return.
        validate_multi(val_loader, model, criterion)
        return

    # NOTE(review): save_checkpoint is defined elsewhere; creating the target
    # directory up front is harmless if it already does so, and otherwise
    # avoids failing only after a whole epoch has been trained.
    checkpoint_dir = '5-clothes-result'
    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        # Learning-rate schedule.
        coco_adjust_learning_rate(optimizer, epoch)
        # Train for one epoch.
        train_multi(train_loader, model, criterion, optimizer, epoch,
                    iteration_size)
        print("***********************************************")
        print("模型训练完第 " + str(epoch + 1) + " 轮,下面进行验证集实验...")
        print("***********************************************")
        # Evaluate on the validation set.
        validate_multi(val_loader, model, criterion)
        # Save the checkpoint for this epoch.
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            False,
            filename=checkpoint_dir + '/' + args.arch + '_checkpoint.pth.tar')
# Example #2
# 0
def main():
    """Train (or only evaluate) an ImageNet-style classifier.

    The function runs the following steps in order:

    1. Parse the command line into the module-level ``args`` and rescale
       weight decay, learning rate and batch size by ``iteration_size``
       (requested GPUs divided by the GPUs actually visible).
    2. Build ``models.__dict__[args.arch]()``, print its parameter count and
       the FLOPs of one forward pass on a 1x3x224x224 random input.
    3. Wrap the model for ``DataParallel`` or ``DistributedDataParallel`` and
       run three timed forward/backward passes as a CUDA warm-up.
    4. Create the cross-entropy loss and the SGD optimizer; optionally restore
       model, optimizer, start epoch and best error from ``args.resume``.
    5. Build ``ImageFolder`` loaders from ``args.data/train`` and
       ``args.data/val``.
    6. Either validate once (``args.evaluate``) or run the
       train / validate / save-checkpoint loop, tracking the best top-1
       error in the module-level ``best_err1``.

    Relies on module-level names not visible in this function: ``parser``,
    ``models``, ``add_flops_counting_methods``, ``train``, ``validate``,
    ``adjust_learning_rate`` and ``save_checkpoint``.
    """
    global args, best_err1
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size', args.batch_size, 'num_gpus', args.num_gpus)
    iteration_size = args.num_gpus // torch.cuda.device_count()  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size', args.batch_size, 'iteration_size', iteration_size)

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # count number of parameters; anything under a '.ups.' sub-module is
    # excluded both from the count and from the optimizer
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count)

    # count flops with one forward pass (on CPU, before the model is moved)
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)

    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)

    # normal code
    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            # only the convolutional feature extractor is data-parallel here
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # cuda warm up
    model = model.cuda()
    image = torch.randn(args.batch_size, 3, 224, 224)
    image_cuda = image.cuda()

    for i in range(3):
        start = time.time()
        model(image_cuda).sum().backward()  # Warmup CUDA memory allocator
        print(time.time() - start)
    # The warm-up passes accumulate gradients of random-noise inputs in the
    # parameters' .grad; clear them so they can never reach an optimizer step.
    model.zero_grad()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD([{'params': iter(params), 'lr': args.lr},
                                 ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)

            if 'state_dict' in checkpoint:
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            else:
                print('no state_dict found')
            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print('no optimizer found')
            if 'epoch' in checkpoint:
                args.start_epoch = checkpoint['epoch']
            # The checkpoint is written below with the key 'best_err1'; the
            # membership test must use that same key, otherwise the best
            # error is never restored on resume.
            if 'best_err1' in checkpoint:
                best_err1 = checkpoint['best_err1']

            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

    # A sampler is always supplied, so shuffle evaluates to False here.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, iteration_size)

        # evaluate on validation set
        err1 = validate(val_loader, model, criterion)

        # remember best err@1 and save checkpoint
        is_best = err1 < best_err1
        best_err1 = min(err1, best_err1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_err1': best_err1,
            'optimizer': optimizer.state_dict(),
        }, is_best, filename=args.arch + '_checkpoint.pth.tar')
        print(str(float(best_err1)))
# Example #3
# 0
def main():
    """Train (or only evaluate) an ImageNet-style classifier, optionally in fp16.

    Unlike a typical entry point this function does not parse the command
    line: it expects the module-level ``args`` to be populated already.

    The function runs the following steps in order:

    1. Rescale weight decay and learning rate by ``iteration_size``
       (``args.num_gpus // args.world_size``); the batch size is left as is.
    2. In distributed mode, bind this process to GPU
       ``args.rank % torch.cuda.device_count()`` and join the process group.
    3. Build ``models.__dict__[args.arch]()``, print its parameter count and
       the FLOPs of one forward pass on a 1x3x224x224 random input.
    4. Move the model to CUDA, optionally convert it with ``network_to_half``
       and wrap it in ``DDP``; publish ``model_params`` / ``master_params``
       as module-level globals.
    5. Create the cross-entropy loss and the SGD optimizer; optionally restore
       model, optimizer, start epoch and best error from ``args.resume``.
    6. Build ``ImageFolder`` loaders that use ``fast_collate``.
    7. Either validate once (``args.evaluate``) or run the
       train / validate loop; only rank 0 tracks ``best_err1`` and saves
       checkpoints.

    Relies on module-level names not visible in this function: ``args``,
    ``best_err1``, ``models``, ``add_flops_counting_methods``,
    ``network_to_half``, ``DDP``, ``prep_param_lists``, ``fast_collate``,
    ``train``, ``validate``, ``adjust_learning_rate`` and ``save_checkpoint``.
    """
    global best_err1, args
    global model_params, master_params

    iteration_size = args.num_gpus // args.world_size
    # Same guard as the sibling entry points: a value of 0 would otherwise
    # surface as a ZeroDivisionError on the learning-rate line below.
    assert iteration_size >= 1, 'num_gpus must be at least world_size'
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)
    args.distributed = args.world_size > 1
    args.gpu = 0
    if args.distributed:
        args.gpu = args.rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # count number of parameters; anything under a '.ups.' sub-module is
    # excluded both from the count and from the optimizer
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count)

    # count flops with one forward pass (on CPU, before the model is moved)
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)

    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs", model.compute_average_flops_cost() / 1000000000.0)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared param turns off bucketing in DDP, for lower latency runs this can improve perf
        model = DDP(model, shared_param=True)

    # These two lists are module-level globals; this function only fills
    # them (presumably for the training step -- confirm against train()).
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        master_params = list(model.parameters())

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD([
        {
            'params': iter(params),
            'lr': args.lr
        },
    ],
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)

            if 'state_dict' in checkpoint:
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            else:
                print('no state_dict found')
            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print('no optimizer found')
            if 'epoch' in checkpoint:
                args.start_epoch = checkpoint['epoch']
            # The checkpoint is written below with the key 'best_err1'; the
            # membership test must use that same key, otherwise the best
            # error is never restored on resume.
            if 'best_err1' in checkpoint:
                best_err1 = checkpoint['best_err1']

            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume,
                checkpoint['epoch'] if 'epoch' in checkpoint else 'unknown'))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    crop_size = 224
    val_size = 256

    # ToTensor / normalisation are deliberately left out of both pipelines
    # (the original note says "Too slow"); presumably fast_collate and the
    # training code take care of the conversion -- TODO confirm.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(crop_size),
            transforms.RandomHorizontalFlip(),
        ]))
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(val_size),
            transforms.CenterCrop(crop_size),
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               collate_fn=fast_collate,
                                               drop_last=True)

    # Validation is not sharded: every process iterates the full set.  The
    # dataset built above is reused instead of scanning the directory again.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             collate_fn=fast_collate)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)
        # Report GPU memory before and after releasing cached blocks.
        print('allocated before', torch.cuda.memory_allocated())
        print('cached before', torch.cuda.memory_cached())
        gc.collect()
        torch.cuda.empty_cache()
        print('allocated after', torch.cuda.memory_allocated())
        print('cached after', torch.cuda.memory_cached())
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, iteration_size)

        # (A disabled experiment that re-synchronised the ranks through a
        # temporary checkpoint file used to sit here.)

        # evaluate on validation set
        err1 = validate(val_loader, model, criterion)
        # remember best err@1 and save checkpoint (rank 0 only)
        if args.rank == 0:
            is_best = err1 < best_err1
            best_err1 = min(err1, best_err1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_err1': best_err1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
            print(str(float(best_err1)))
# Example #4
# 0
def main():
    """Train (or only evaluate) an 80-class model on MS-COCO 2014.

    Parses the command line into the module-level ``args``, rescales the
    hyper-parameters by ``iteration_size``, builds
    ``models.__dict__[args.arch](num_classes=80)``, reports its size and
    FLOPs, optionally loads ``args.resume`` (either as a true resume or as
    pretrained weights with the classification head dropped), then validates
    once or runs the train / validate / save-checkpoint loop.

    Relies on module-level names not visible in this function: ``parser``,
    ``models``, ``CocoDetection``, ``add_flops_counting_methods``,
    ``train_multi``, ``validate_multi``, ``coco_adjust_learning_rate`` and
    ``save_checkpoint``.
    """
    global args
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)

    # Requested GPUs per visible GPU ("do multiple iterations").
    iteration_size = args.num_gpus // torch.cuda.device_count()
    assert iteration_size >= 1
    args.weight_decay *= iteration_size  # will cancel out with lr
    args.lr /= iteration_size
    args.batch_size //= iteration_size
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # ---- model ---------------------------------------------------------
    print("=> creating model '{}'".format(args.arch))
    build_model = models.__dict__[args.arch]
    model = build_model(num_classes=80)

    # Parameters outside any '.ups.' sub-module are the ones that get
    # counted and optimised.
    params = [
        tensor for name, tensor in model.named_parameters()
        if '.ups.' not in name
    ]
    count = 0
    for tensor in params:
        count += np.prod(tensor.size())
    print('Parameters:', count)

    # One forward pass on a random input with the FLOP counter switched on.
    model = add_flops_counting_methods(model)
    model.eval()
    dummy_input = torch.randn(1, 3, 224, 224)
    model.start_flops_count()
    model(dummy_input).sum()
    model.stop_flops_count()
    gflops = model.compute_average_flops_cost() / 1000000000.0
    print("GFLOPs", gflops)

    # ---- parallel wrapper, loss, optimizer -------------------------------
    model = torch.nn.DataParallel(model).cuda()
    criterion = nn.BCEWithLogitsLoss().cuda()
    param_groups = [{'params': iter(params), 'lr': args.lr}]
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # ---- optional checkpoint ---------------------------------------------
    if args.resume:
        if not os.path.isfile(args.resume):
            print("=> no checkpoint found at '{}'".format(args.resume))
        else:
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            state = checkpoint['state_dict']

            # The checkpoint counts as a true resume when its head bias
            # ('fc' first, then 'classifier') has the model's head shape.
            resume = False
            for head in ('fc', 'classifier'):
                bias_key = 'module.' + head + '.bias'
                if bias_key in state and \
                        state[bias_key].size() == getattr(model.module, head).bias.size():
                    resume = True
                    break

            if resume:
                # True resume: resume training on COCO
                model.load_state_dict(state, strict=False)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                else:
                    print('no optimizer found')
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch']
            else:
                # Fake resume: transfer from ImageNet -- drop every head entry
                head_keys = [
                    key for key in state
                    if 'classifier' in key or 'fc' in key
                ]
                for key in head_keys:
                    print(key, 'deleted from state_dict')
                    del state[key]
                model.load_state_dict(state, strict=False)

            if 'epoch' in checkpoint:
                loaded_epoch = checkpoint['epoch']
            else:
                loaded_epoch = 'unknown'
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, loaded_epoch))

    cudnn.benchmark = True

    # ---- data --------------------------------------------------------------
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    train_dataset = CocoDetection(
        os.path.join(args.data, 'train2014'),
        os.path.join(args.data, 'annotations/instances_train2014.json'),
        train_transform)

    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ])
    val_dataset = CocoDetection(
        os.path.join(args.data, 'val2014'),
        os.path.join(args.data, 'annotations/instances_val2014.json'),
        val_transform)

    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

    # Settings shared by both loaders.
    shared_loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': args.workers,
        'pin_memory': True,
    }
    # The sampler randomises the order, so shuffle stays off.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               shuffle=False,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               **shared_loader_kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             shuffle=False,
                                             **shared_loader_kwargs)

    # ---- evaluate only, or train -------------------------------------------
    if args.evaluate:
        validate_multi(val_loader, model, criterion)
        return

    checkpoint_name = 'coco_' + args.arch + '_checkpoint.pth.tar'
    for epoch in range(args.start_epoch, args.epochs):
        coco_adjust_learning_rate(optimizer, epoch)

        # one epoch of training, then a pass over the validation set
        train_multi(train_loader, model, criterion, optimizer, epoch,
                    iteration_size)
        validate_multi(val_loader, model, criterion)

        snapshot = {
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        save_checkpoint(snapshot, False, filename=checkpoint_name)
# Example #5
# 0
def main():
    """Train or evaluate a ResNeXt-50 elastic classifier on MS-COCO 2014.

    Command-line options are parsed with the module-level ``parser`` and
    stored in the global ``args``. The function then:

    1. Rescales weight decay, learning rate and batch size by
       ``iteration_size = args.num_gpus // torch.cuda.device_count()``.
    2. Initialises ``torch.distributed`` when ``args.world_size > 1``.
    3. Builds ``resnext50_elastic`` with 80 outputs, prints its parameter
       count (parameters whose name contains ``'.ups.'`` are excluded from
       both the count and the optimizer) and its FLOPs on one 224x224 input.
    4. Wraps the model in ``DataParallel`` and creates a
       ``BCEWithLogitsLoss`` criterion and an SGD optimizer.
    5. Optionally loads ``args.resume``: a true resume when the checkpoint's
       classification head matches the model's, otherwise a transfer that
       drops the checkpoint's head weights.
    6. Builds the train/val data loaders, then either runs validation only
       (``args.evaluate``) or trains, validating and saving a checkpoint
       after every epoch.

    Returns:
        None.
    """
    global args
    args = parser.parse_args()
    print('config: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'num_gpus', args.num_gpus)
    # torch.cuda.device_count() is the number of GPUs actually available;
    # iteration_size is how many times fewer that is than the requested
    # args.num_gpus (passed on to train_multi below).
    iteration_size = args.num_gpus // torch.cuda.device_count(
    )  # do multiple iterations
    assert iteration_size >= 1
    args.weight_decay = args.weight_decay * iteration_size  # will cancel out with lr
    args.lr = args.lr / iteration_size
    args.batch_size = args.batch_size // iteration_size
    # When iteration_size == 1 the three values above are unchanged.
    print('real: wd', args.weight_decay, 'lr', args.lr, 'batch_size',
          args.batch_size, 'iteration_size', iteration_size)

    # Distributed setup.
    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    print("=> creating model '{}'".format(args.arch))

    # Alternative backbones that were tried here:
    #    from resnext import resnext50
    #    model = resnext50(num_classes=80)
    #
    #    from resnext_MulTask_12 import resnext50_elastic
    #    model = resnext50_elastic(num_classes=80)  # pretrained model; opts["num_labels"] = 14
    from resnext import resnext50_elastic
    model = resnext50_elastic(num_classes=80)

    # Count parameters and collect the ones to optimise; anything whose name
    # contains '.ups.' is left out of both.
    count = 0
    params = list()
    for n, p in model.named_parameters():
        if '.ups.' not in n:
            params.append(p)
            count += np.prod(p.size())
    print('Parameters:', count / 1000000.0)  # number of parameters, in millions

    # Count FLOPs with a single random 224x224 RGB input.
    model = add_flops_counting_methods(model)
    model.eval()
    image = torch.randn(1, 3, 224, 224)

    model.start_flops_count()
    model(image).sum()
    model.stop_flops_count()
    print("GFLOPs",
          model.compute_average_flops_cost() / 1000000000.0)  # FLOPs, in billions

    # normal code
    model = torch.nn.DataParallel(model).cuda()
    # Binary cross-entropy on logits.
    criterion = nn.BCEWithLogitsLoss().cuda()
    # SGD over the parameters collected above (a single parameter group).
    optimizer = torch.optim.SGD(
        [
            {
                'params': params,
                'lr': args.lr
            },
        ],
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # NOTE(review): torch.load unpickles the file, so only point
            # --resume at checkpoints from a trusted source.
            checkpoint = torch.load(args.resume)

            # A checkpoint counts as a "true" resume only when its head bias
            # ('fc' or 'classifier') has the same size as the current model's.
            resume = ('module.fc.bias' in checkpoint['state_dict'] and
                      checkpoint['state_dict']['module.fc.bias'].size() == model.module.fc.bias.size()) or \
                     ('module.classifier.bias' in checkpoint['state_dict'] and
                      checkpoint['state_dict']['module.classifier.bias'].size() == model.module.classifier.bias.size())
            if resume:
                # True resume: resume training on MS-COCO  # (evaluate on MS-COCO?)
                print()
                print("resume training on MS-COCO...")
                print("在MS-COCO上 评估...")
                print()
                model.load_state_dict(checkpoint['state_dict'], strict=False)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                else:
                    print('no optimizer found')
                args.start_epoch = checkpoint.get('epoch', args.start_epoch)
            else:
                # Fake resume: transfer from ImageNet  # (ImageNet -> MS-COCO training?)
                print()
                print("transfer from ImageNet...")
                print("从ImageNet——>MS-COCO 训练...")
                print()
                # Drop the head weights; iterate over a snapshot of the keys
                # because entries are deleted inside the loop.
                for n in list(checkpoint['state_dict']):
                    if 'classifier' in n or 'fc' in n:
                        print(n, 'deleted from state_dict')
                        del checkpoint['state_dict'][n]
                model.load_state_dict(checkpoint['state_dict'], strict=False)

            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint.get('epoch', 'unknown')))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Speeds training up a little at essentially no cost, so it is usually
    # enabled. Only suitable for non-multi-scale training; otherwise it hurts.
    cudnn.benchmark = True

    # Data loading code
    # Per-channel normalisation applied to both splits.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    # Training split.
    train_dataset = CocoDetection(
        os.path.join(args.data, 'train2014'),
        os.path.join(args.data, 'annotations/instances_train2014.json'),
        transforms.Compose([
            transforms.RandomResizedCrop(224),  # random crop
            transforms.RandomHorizontalFlip(),  # random horizontal flip
            transforms.ToTensor(),
            normalize,
        ]))
    # Validation split.
    val_dataset = CocoDetection(
        os.path.join(args.data, 'val2014'),
        os.path.join(args.data, 'annotations/instances_val2014.json'),
        transforms.Compose([
            transforms.Resize((224, 224)),  # resize the image
            transforms.ToTensor(),
            normalize,
        ]))

    train_sampler = torch.utils.data.sampler.RandomSampler(
        train_dataset)  # random sampler

    # DataLoader arguments used below:
    #   batch_size  - samples per batch.
    #   shuffle     - must be False when a sampler is given; since
    #                 train_sampler is always set here, the expression
    #                 `train_sampler is None` evaluates to False.
    #   sampler     - strategy for drawing samples from the dataset.
    #   num_workers - number of worker processes used to load data.
    #   pin_memory  - load batches into pinned host memory.
    #   drop_last   - drop the final batch when it is smaller than batch_size.
    # Training loader.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    # Validation loader.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        # Evaluation-only mode: run the validation set once and stop.
        validate_multi(val_loader, model, criterion)
        return

    # Make sure the checkpoint directory exists before training starts, so a
    # missing directory cannot make the first save fail after a whole epoch.
    # NOTE(review): assumes save_checkpoint writes to `filename` as given.
    checkpoint_dir = '3-MSCOCO--model-train-demo'
    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        # Adjust the learning rate for this epoch.
        coco_adjust_learning_rate(optimizer, epoch)
        # Train for one epoch.
        train_multi(train_loader, model, criterion, optimizer, epoch,
                    iteration_size)
        print("***********************************************")
        print("模型训练完第 " + str(epoch + 1) + " 轮,下面进行验证集实验...")
        print("***********************************************")
        # evaluate on validation set
        validate_multi(val_loader, model, criterion)
        # Save one checkpoint file per epoch.
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            False,
            filename=checkpoint_dir + '/' + 'coco_' + args.arch +
            '_checkpoint_' + str(epoch + 1) + '.pth.tar')