コード例 #1
0
ファイル: train.py プロジェクト: tbfly/EfficientDet.Pytorch
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            # args.rank = int(os.environ["RANK"])
            args.rank = 1
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    checkpoint = []
    if (args.resume is not None):
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
        params = checkpoint['parser']
        args.num_class = params.num_class
        args.network = params.network
        args.start_epoch = params.start_epoch + 1
        del params

    model = EfficientDet(num_classes=args.num_class,
                         network=args.network,
                         W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                         D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                         D_class=EFFICIENTDET[args.network]['D_class'],
                         gpu=args.gpu)
    if (args.resume is not None):
        model.load_state_dict(checkpoint['state_dict'])
    del checkpoint
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
            print('Run with DistributedDataParallel with divice_ids....')
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
            print('Run with DistributedDataParallel without device_ids....')
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        print('Run with DataParallel ....')
        model = torch.nn.DataParallel(model).cuda()

    # Training dataset
    train_dataset = []
    if (args.dataset == 'VOC'):
        #         train_dataset = VOCDetection(root=args.dataset_root,
        #                                      transform=get_augumentation(phase='train', width=EFFICIENTDET[args.network]['input_size'], height=EFFICIENTDET[args.network]['input_size']))
        train_dataset = VOCDetection(root=args.dataset_root,
                                     transform=transforms.Compose([
                                         Normalizer(),
                                         Augmenter(),
                                         Resizer()
                                     ]))

    elif (args.dataset == 'COCO'):
        train_dataset = CocoDataset(
            root_dir=args.dataset_root,
            set_name='train2017',
            transform=get_augumentation(
                phase='train',
                width=EFFICIENTDET[args.network]['input_size'],
                height=EFFICIENTDET[args.network]['input_size']))


#     train_loader = DataLoader(train_dataset,
#                                   batch_size=args.batch_size,
#                                   num_workers=args.workers,
#                                   shuffle=True,
#                                   collate_fn=detection_collate,
#                                   pin_memory=True)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.workers,
                              shuffle=True,
                              collate_fn=collater,
                              pin_memory=True)
    # define loss function (criterion) , optimizer, scheduler
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     verbose=True)
    cudnn.benchmark = True

    for epoch in range(args.start_epoch, args.num_epoch):
        train(train_loader, model, scheduler, optimizer, epoch, args)
        state = {
            'epoch': epoch,
            'parser': args,
            'state_dict': get_state_dict(model)
        }
        torch.save(
            state,
            './weights/checkpoint_{}_{}_{}.pth'.format(args.dataset,
                                                       args.network, epoch))
コード例 #2
0
def main(args=None):

    parser = argparse.ArgumentParser(
        description="Simple training script for training a RetinaNet network.")

    parser.add_argument(
        "--dataset", help="Dataset type, must be one of csv or coco or ycb.")
    parser.add_argument("--path", help="Path to dataset directory")
    parser.add_argument(
        "--csv_train",
        help="Path to file containing training annotations (see readme)")
    parser.add_argument("--csv_classes",
                        help="Path to file containing class list (see readme)")
    parser.add_argument("--csv_val",
                        help="Path to file containing validation annotations "
                        "(optional, see readme)")

    parser.add_argument(
        "--depth",
        help="Resnet depth, must be one of 18, 34, 50, 101, 152",
        type=int,
        default=50)
    parser.add_argument("--epochs",
                        help="Number of epochs",
                        type=int,
                        default=100)
    parser.add_argument("--evaluate_every", default=20, type=int)
    parser.add_argument("--print_every", default=20, type=int)
    parser.add_argument('--distributed',
                        action="store_true",
                        help='Run model in distributed mode with DataParallel')

    parser = parser.parse_args(args)

    # Create the data loaders
    if parser.dataset == "coco":

        if parser.path is None:
            raise ValueError(
                "Must provide --path when training on non-CSV datasets")

        dataset_train = CocoDataset(parser.path,
                                    ann_file="instances_train2014.json",
                                    set_name="train2014",
                                    transform=transforms.Compose([
                                        Normalizer(),
                                        Augmenter(),
                                        Resizer(min_side=512, max_side=512)
                                    ]))
        dataset_val = CocoDataset(parser.path,
                                  ann_file="instances_val2014.cars.json",
                                  set_name="val2014",
                                  transform=transforms.Compose(
                                      [Normalizer(), Resizer()]))

    elif parser.dataset == "ycb":

        dataset_train = YCBDataset(parser.path,
                                   "image_sets/train.txt",
                                   transform=transforms.Compose([
                                       Normalizer(),
                                       Augmenter(),
                                       Resizer(min_side=512, max_side=512)
                                   ]),
                                   train=True)
        dataset_val = YCBDataset(parser.path,
                                 "image_sets/val.txt",
                                 transform=transforms.Compose(
                                     [Normalizer(), Resizer()]),
                                 train=False)

    elif parser.dataset == "csv":

        if parser.csv_train is None:
            raise ValueError("Must provide --csv_train when training on COCO,")

        if parser.csv_classes is None:
            raise ValueError(
                "Must provide --csv_classes when training on COCO,")

        dataset_train = CSVDataset(train_file=parser.csv_train,
                                   class_list=parser.csv_classes,
                                   transform=transforms.Compose(
                                       [Normalizer(),
                                        Augmenter(),
                                        Resizer()]))

        if parser.csv_val is None:
            dataset_val = None
            print("No validation annotations provided.")
        else:
            dataset_val = CSVDataset(train_file=parser.csv_val,
                                     class_list=parser.csv_classes,
                                     transform=transforms.Compose(
                                         [Normalizer(),
                                          Resizer()]))

    else:
        raise ValueError(
            "Dataset type not understood (must be csv or coco), exiting.")

    sampler = AspectRatioBasedSampler(dataset_train,
                                      batch_size=12,
                                      drop_last=False)
    dataloader_train = DataLoader(dataset_train,
                                  num_workers=8,
                                  collate_fn=collater,
                                  batch_sampler=sampler)

    if dataset_val is not None:
        sampler_val = AspectRatioBasedSampler(dataset_val,
                                              batch_size=1,
                                              drop_last=False)
        dataloader_val = DataLoader(dataset_val,
                                    num_workers=4,
                                    collate_fn=collater,
                                    batch_sampler=sampler_val)

    # Create the model
    if parser.depth == 18:
        retinanet = model.resnet18(num_classes=dataset_train.num_classes(),
                                   pretrained=True)
    elif parser.depth == 34:
        retinanet = model.resnet34(num_classes=dataset_train.num_classes(),
                                   pretrained=True)
    elif parser.depth == 50:
        retinanet = model.resnet50(num_classes=dataset_train.num_classes(),
                                   pretrained=True)
    elif parser.depth == 101:
        retinanet = model.resnet101(num_classes=dataset_train.num_classes(),
                                    pretrained=True)
    elif parser.depth == 152:
        retinanet = model.resnet152(num_classes=dataset_train.num_classes(),
                                    pretrained=True)
    else:
        raise ValueError(
            "Unsupported model depth, must be one of 18, 34, 50, 101, 152")

    print("CUDA available: {}".format(torch.cuda.is_available()))
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    retinanet = retinanet.to(device)

    if parser.distributed:
        retinanet = torch.nn.DataParallel(retinanet)

    optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     verbose=True)

    loss_hist = collections.deque(maxlen=500)

    print("Num training images: {}".format(len(dataset_train)))

    best_mean_avg_prec = 0.0

    for epoch_num in range(parser.epochs):

        retinanet.train()
        retinanet.freeze_bn()

        epoch_loss = []

        for iter_num, data in enumerate(dataloader_train):
            try:
                optimizer.zero_grad()

                classification_loss, regression_loss = retinanet(
                    [data["img"].to(device).float(), data["annot"]])

                classification_loss = classification_loss.mean()
                regression_loss = regression_loss.mean()

                loss = classification_loss + regression_loss

                if bool(loss == 0):
                    continue

                loss.backward()

                torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)

                optimizer.step()

                loss_hist.append(float(loss.item()))
                epoch_loss.append(float(loss.item()))

                if parser.print_every % iter_num == 0:
                    print("Epoch: {} | Iteration: {}/{} | "
                          "Classification loss: {:1.5f} | "
                          "Regression loss: {:1.5f} | "
                          "Running loss: {:1.5f}".format(
                              epoch_num, iter_num, len(dataloader_train),
                              float(classification_loss),
                              float(regression_loss), np.mean(loss_hist)))

                del classification_loss
                del regression_loss
            except Exception as e:
                print(e)
                continue

        if ((epoch_num + 1) % parser.evaluate_every
                == 0) or epoch_num + 1 == parser.epochs:

            mAP = 0.0

            if parser.dataset == "coco":

                print("Evaluating dataset")
                mAP = coco_eval.evaluate_coco(dataset_val, retinanet)

            else:
                print("Evaluating dataset")
                AP = eval.evaluate(dataset_val, retinanet)
                mAP = np.asarray([x[0] for x in AP.values()]).mean()
                print("Val set mAP: ", mAP)

            if mAP > best_mean_avg_prec:
                best_mean_avg_prec = mAP
                torch.save(
                    retinanet.state_dict(),
                    "{}_retinanet_best_mean_ap_{}.pt".format(
                        parser.dataset, epoch_num))

        scheduler.step(np.mean(epoch_loss))

    retinanet.eval()

    torch.save(retinanet.state_dict(), "retinanet_model_final.pt")
コード例 #3
0
ファイル: main.py プロジェクト: peternara/EfficientDet-1
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            # args.rank = int(os.environ["RANK"])
            args.rank = 1
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank)

    # args.num_class = train_dataset.num_classes()

    print('dataset:', args.dataset)
    print('network:', args.network)
    print('num_epoch:', args.num_epoch)
    print('batch_size:', args.batch_size)
    print('lr_choice:', args.lr_choice)
    print('lr:', args.lr)
    print('lr_fn:', args.lr_fn)
    print('image_size:', args.image_size)
    print('workers:', args.workers)
    print('num_class:', args.num_class)
    print('save_folder:', args.save_folder)
    print('limit:', args.limit)

    if args.dataset == 'h5':
        train_dataset = H5CoCoDataset('{}/train_small.hdf5'.format(args.dataset_root), 'train_small')
        valid_dataset = H5CoCoDataset('{}/test.hdf5'.format(args.dataset_root), 'test')
    else:
        train_dataset = CocoDataset(args.dataset_root, set_name='train_small',
                                # transform=transforms.Compose([Normalizer(), Augmenter(), Resizer(args.image_size)]),
                                transform=get_augumentation('train'),
                                limit_len=args.limit[0])
        valid_dataset = CocoDataset(args.dataset_root, set_name='test',
                              # transform=transforms.Compose([Normalizer(), Resizer(args.image_size)]),
                              transform=get_augumentation('test'),
                              limit_len=args.limit[1])

    print('train_dataset:', len(train_dataset))
    print('valid_dataset:', len(valid_dataset))

    steps_pre_epoch = len(train_dataset) // args.batch_size
    print('steps_pre_epoch:', steps_pre_epoch)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.workers,
                              shuffle=True,
                              collate_fn=detection_collate,
                              pin_memory=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=1,
                              num_workers=args.workers,
                              shuffle=False,
                              collate_fn=detection_collate,
                              pin_memory=True)

    checkpoint = []
    if(args.resume is not None):
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
        params = checkpoint['parser']
        args.num_class = params.num_class
        args.network = params.network
        args.start_epoch = checkpoint['epoch'] + 1
        del params

    model = EfficientDet(num_classes=args.num_class,
                         network=args.network,
                         W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                         D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                         D_class=EFFICIENTDET[args.network]['D_class']
                         )
    
    if(args.resume is not None):
        model.load_state_dict(checkpoint['state_dict'])
    
    del checkpoint

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
            print('Run with DistributedDataParallel with divice_ids....')
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
            print('Run with DistributedDataParallel without device_ids....')
    elif args.gpu is not None:
        # print('using gpu:', args.gpu)
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = model.cpu()
        # print('Run with DataParallel ....')
        model = torch.nn.DataParallel(model).cuda()

    if args.lr_choice == 'lr_fn':
        lr_now = float(args.lr_fn['LR_START'])
    elif args.lr_choice == 'lr_scheduler':
        lr_now = args.lr

    optimizer = optim.Adam(model.parameters(), lr=lr_now)
    # optimizer = optim.AdamW(model.parameters(), lr=args.lr)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.1, verbose=True)
    cudnn.benchmark = True
    
    iteration_loss_path = 'iteration_loss.csv'
    if os.path.isfile(iteration_loss_path):
        os.remove(iteration_loss_path)
    
    epoch_loss_path = 'epoch_loss.csv'
    if os.path.isfile(epoch_loss_path):
        os.remove(epoch_loss_path)
    
    eval_train_path = 'eval_train_result.csv'
    if os.path.isfile(eval_train_path):
        os.remove(eval_train_path)

    eval_val_path = 'eval_val_result.csv'
    if os.path.isfile(eval_val_path):
        os.remove(eval_val_path)

    USE_KAGGLE = True if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', False) else False
    if USE_KAGGLE:
        iteration_loss_path = '/kaggle/working/' + iteration_loss_path
        epoch_loss_path = '/kaggle/working/' + epoch_loss_path
        eval_val_path = '/kaggle/working/' + eval_val_path
        eval_train_path = '/kaggle/working/' + eval_train_path
    
    with open(epoch_loss_path, 'a+') as epoch_loss_file, \
         open(iteration_loss_path, 'a+') as iteration_loss_file, \
         open(eval_train_path, 'a+') as eval_train_file, \
         open(eval_val_path, 'a+') as eval_val_file:

        epoch_loss_file.write('epoch_num,mean_epoch_loss\n')
        iteration_loss_file.write('epoch_num,iteration,classification_loss,regression_loss,iteration_loss\n')
        eval_train_file.write('epoch_num,map50\n')
        eval_val_file.write('epoch_num,map50\n')

        for epoch in range(args.start_epoch, args.num_epoch):
            train(train_loader, model, scheduler, optimizer, epoch, args, epoch_loss_file, iteration_loss_file, steps_pre_epoch)

            # test
            _model = model.module
            _model.eval()
            _model.is_training = False
            with torch.no_grad():
                if args.dataset != 'show':
                    evaluate_coco(train_dataset, _model, args.dataset, epoch, eval_train_file)
                evaluate_coco(valid_dataset, _model, args.dataset, epoch, eval_val_file)
コード例 #4
0
                        help='Checkpoint state_dict file to resume training from')
    args = parser.parse_args()

    if(args.weight is not None):
        resume_path = str(args.weight)
        print("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(
            args.weight, map_location=lambda storage, loc: storage)
        params = checkpoint['parser']
        args.num_class = params.num_class
        args.network = params.network
        model = EfficientDet(
            num_classes=args.num_class,
            network=args.network,
            W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
            D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
            D_class=EFFICIENTDET[args.network]['D_class'],
            is_training=False,
            threshold=args.threshold,
            iou_threshold=args.iou_threshold)
        model.load_state_dict(checkpoint['state_dict'])
    model = model.cuda()
    if(args.dataset == 'VOC'):
        valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[('2007', 'test')],
                                     transform=transforms.Compose([Normalizer(), Resizer()]))
        evaluate(valid_dataset, model)
    else:
        valid_dataset = CocoDataset(root_dir=args.dataset_root, set_name='val2017',
                                    transform=transforms.Compose([Normalizer(), Resizer()]))
        evaluate_coco(valid_dataset, model)
コード例 #5
0
checkpoint = []
if(args.resume is not None):
    resume_path = str(args.resume)
    print("Loading checkpoint: {} ...".format(resume_path))
    checkpoint = torch.load(
        args.resume, map_location=lambda storage, loc: storage)
    args.num_class = checkpoint['num_class']
    args.network = checkpoint['network']

train_dataset = []
if(args.dataset == 'VOC'):
    train_dataset = VOCDetection(root=args.dataset_root,
                                 transform=get_augumentation(phase='train', width=EFFICIENTDET[args.network]['input_size'], height=EFFICIENTDET[args.network]['input_size']))

elif(args.dataset == 'COCO'):
    train_dataset = CocoDataset(root_dir=args.dataset_root, set_name='train2017', transform=get_augumentation(
        phase='train', width=EFFICIENTDET[args.network]['input_size'], height=EFFICIENTDET[args.network]['input_size']))

train_dataloader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_worker,
                              shuffle=True,
                              collate_fn=detection_collate,
                              pin_memory=True)

model = EfficientDet(num_classes=args.num_classes,
                     network=args.network,
                     W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                     D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                     D_class=EFFICIENTDET[args.network]['D_class'],
                     )
if(args.resume is not None):