Example 1
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    # cab: begin custom additions (TensorBoard logging)
    writer = SummaryWriter("runs/" + args.tb_name)

    best_value = 0

    print("Start training, best_value is " + str(best_value))
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        #cab
        for k, v in train_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'train_{k}', v, epoch)

        # pycocotools' 12 summary metrics, in order; used for both bbox and
        # mask logging below.
        coco_metric_labels = [
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
            'Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ]',
            'Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ]',
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
            'Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
            'Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
        ]

        new_value = 0
        for k, v in test_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'test_{k}', v, epoch)
            if k == "coco_eval_bbox":
                new_value = v[0]
                for label, value in zip(coco_metric_labels, v):
                    writer.add_scalar('Bbox ' + label, value, epoch)
            if k == "coco_eval_masks":
                new_value = v[0]
                for label, value in zip(coco_metric_labels, v):
                    writer.add_scalar('Mask ' + label, value, epoch)

        print("Epoch finished, best_value is " + str(best_value))

        save_pth = False
        if best_value < new_value:
            best_value = new_value
            save_pth = True

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')

            if save_pth:
                checkpoint_paths.append(output_dir / 'best.pth')
                with open(output_dir / 'best_log.txt', 'w') as best_log:
                    best_log.write(f'Saved model at epoch {epoch:04}\n')

            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # /cab: end custom additions

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
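
The optimizer setup shared by these examples gives the backbone its own, smaller learning rate by passing two parameter groups to AdamW. Below is a minimal, self-contained sketch of that pattern; the module names and learning-rate values are illustrative, not taken from the example.

import torch

# Toy model with a "backbone" submodule, mirroring the name-based split above.
model = torch.nn.ModuleDict({
    "backbone": torch.nn.Linear(8, 8),
    "head": torch.nn.Linear(8, 2),
})

param_dicts = [
    {"params": [p for n, p in model.named_parameters() if "backbone" not in n]},
    {"params": [p for n, p in model.named_parameters() if "backbone" in n],
     "lr": 1e-5},  # the backbone trains with a smaller learning rate
]
optimizer = torch.optim.AdamW(param_dicts, lr=1e-4, weight_decay=1e-4)
# StepLR multiplies every group's lr by gamma (default 0.1) every step_size epochs.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200)

for group in optimizer.param_groups:
    print(group["lr"])  # 0.0001 for the head group, 1e-05 for the backbone group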
Example 2
File: main.py Project: bjuncek/detr
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print("number of params:", n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set="train", args=args)
    dataset_val = build_dataset(image_set="val", args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        min(args.batch_size, 4),  # cap the validation batch size at 4
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file in ["cmdd", "cmdc", "wider"]:
        base_ds = None
    elif args.dataset_file == "MOT17":
        base_ds = dataset_val
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location="cpu")
        model_without_ddp.detr.load_state_dict(checkpoint["model"])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith("https"):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location="cpu",
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location="cpu")

        # NOTE: this is Bruno's hack to load stuff in
        model_dict = model_without_ddp.state_dict()
        pretrained_dict = checkpoint["model"]
        # hack for adding query stuff
        if ("query_embed.query_embed.weight" in model_dict.keys()
                and "query_embed.weight" in pretrained_dict.keys()):
            pretrained_dict[
                "query_embed.query_embed.weight"] = pretrained_dict[
                    "query_embed.weight"]
        # 1. filter out unnecessary keys
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        # if finetuning skip the linear stuff
        if args.finetune:
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if k not in ["class_embed.weight", "class_embed.bias"]
            }
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # 3. load new state dict
        model_without_ddp.load_state_dict(model_dict)

        if (not args.eval and not args.load_model_only
                and "optimizer" in checkpoint and "lr_scheduler" in checkpoint
                and "epoch" in checkpoint):
            optimizer.load_state_dict(checkpoint["optimizer"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            args.start_epoch = checkpoint["epoch"] + 1

    if args.eval:
        if args.test and args.dataset_file == "wider":
            if args.resume:
                # write test outputs next to the resumed checkpoint
                output_dir = os.path.dirname(args.resume)
            else:
                output_dir = args.output_dir
            print("SAVING TEST WIDER TO ", output_dir)
            test_wider(
                model,
                criterion,
                postprocessors,
                dataset_val,
                data_loader_val,
                device,
                output_dir,
            )
            return
        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )
        if args.output_dir and coco_evaluator is not None:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            args.clip_max_norm,
        )
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / "checkpoint.pth"]
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f"checkpoint{epoch:04}.pth")
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "args": args,
                    },
                    checkpoint_path,
                )

        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )

        log_stats = {
            **{f"train_{k}": v
               for k, v in train_stats.items()},
            **{f"test_{k}": v
               for k, v in test_stats.items()},
            "epoch": epoch,
            "n_parameters": n_parameters,
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / "eval").mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ["latest.pth"]
                    if epoch % 50 == 0:
                        filenames.append(f"{epoch:03}.pth")
                    for name in filenames:
                        torch.save(
                            coco_evaluator.coco_eval["bbox"].eval,
                            output_dir / "eval" / name,
                        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
Example 3
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    # Save our Wandb metadata (skipped entirely when --no_wb is set)
    if not args.no_wb:
        wandb.init(entity='dl-project',
                   project='dl-final-project',
                   name=args.wb_name,
                   notes=args.wb_notes,
                   reinit=True)
        wandb.config.epochs = args.epochs

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # visualize_video(model, postprocessors)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of trainable params:', n_parameters)
    # Log parameter counts (trainable and total, including frozen) to W&B
    n_total_parameters = sum(p.numel() for p in model.parameters())
    print('total number of parameters:', n_total_parameters)
    if not args.no_wb:
        wandb.config.n_parameters = n_parameters
        wandb.config.n_trainable_parameters = n_parameters  # clearer alias
        wandb.config.n_total_parameters = n_total_parameters

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # For visualization we want the raw images without any normalization or random resizing
    dataset_val_without_resize = CocoDetection(
        "data/coco/val2017",
        annFile="data/coco/annotations/instances_val2017.json",
        transforms=T.Compose([T.ToTensor()]))

    # Save metadata about training + val datasets and batch size
    if not args.no_wb:
        wandb.config.len_dataset_train = len(dataset_train)
        wandb.config.len_dataset_val = len(dataset_val)
        wandb.config.batch_size = args.batch_size

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        return any(keyword in n for keyword in name_keywords)

    # (debug) print every parameter name once at startup
    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]

    # Save a few of the most important hyperparameters in wandb.config;
    # extend this list as needed.
    if not args.no_wb:
        wandb.config.lr = args.lr
        wandb.config.lr_backbone = args.lr_backbone

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            # print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:

        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            if not args.no_wb:
                checkpoint_file_for_wb = str(
                    output_dir / f'{wandb.run.id}_checkpoint{epoch:04}.pth')
                checkpoint_paths.append(checkpoint_file_for_wb)

            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

            # Save model checkpoint to W&B
            if not args.no_wb:
                wandb.save(checkpoint_file_for_wb)

        # Generate visualizations for a fixed set of validation images every epoch
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        # Save the COCO metrics properly
        metric_name = [
            "AP", "AP50", "AP75", "APs", "APm", "APl", "AR@1", "AR@10",
            "AR@100", "ARs", "ARm", "ARl"
        ]
        for i, metric_val in enumerate(log_stats["test_coco_eval_bbox"]):
            log_stats[metric_name[i]] = metric_val

        if not args.no_wb:
            wandb.log(log_stats)
        print("train_loss: ", log_stats['train_loss'])

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            if not args.no_wb:
                wandb.save(str(output_dir / "log.txt"))

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    eval_filename_for_wb = f'{wandb.run.id}_eval_{epoch:04}.pth'
                    eval_path_for_wb = str(output_dir / "eval" /
                                           eval_filename_for_wb)
                    filenames = ['latest.pth', eval_filename_for_wb]
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

                    # TODO not sure if this file will end up being too big
                    # I think it's the COCO precision/recall metrics
                    # in some format...
                    # let's track it just in case to start!
                    wandb.save(eval_path_for_wb)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
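
Example 3 flattens the 12-element test_coco_eval_bbox vector into named metrics before logging. The same mapping can be expressed as a small helper; the ordering below is pycocotools' standard summarize() order, and the function name is illustrative.

COCO_METRIC_NAMES = ["AP", "AP50", "AP75", "APs", "APm", "APl",
                     "AR@1", "AR@10", "AR@100", "ARs", "ARm", "ARl"]

def named_coco_stats(stats):
    """Pair pycocotools' 12 summary values with short, readable names."""
    return dict(zip(COCO_METRIC_NAMES, stats))

# usage sketch:
# named_coco_stats(coco_evaluator.coco_eval["bbox"].stats)
# -> {"AP": 0.42, "AP50": 0.62, ...}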
Example 4
def main(args):
    if args.gpu_id >= 0:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)
    if args.neptune:
        # Connect your script to Neptune
        import neptune
        # your NEPTUNE_API_TOKEN should be added to ~/.bashrc before running this file
        neptune.init(project_qualified_name='detectwaste/detr')
        if args.dilation:
            exp_name = f"{args.dataset_file}_{args.backbone}_DC"
        else:
            exp_name = f"{args.dataset_file}_{args.backbone}"
        neptune.create_experiment(name=exp_name)
    else:
        neptune = None

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    if args.optimizer == 'LaProp':
        optimizer = LaProp(param_dicts,
                           lr=args.lr,
                           weight_decay=args.weight_decay)
    elif args.optimizer == 'AdamW':
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    else:
        sys.exit(f'Chosen optimizer {args.optimizer} is not available.')
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    # no held-out test split here: the test loader reuses the val images
    dataset_test = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
        sampler_test = DistributedSampler(dataset_test, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    data_loader_test = DataLoader(dataset_test,
                                  args.batch_size,
                                  sampler=sampler_test,
                                  drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        if args.dataset_file in waste_datasets_list and args.start_epoch == 0:
            # For waste detection datasets - we must cut classification head
            del checkpoint["model"]["class_embed.weight"]
            del checkpoint["model"]["class_embed.bias"]
            del checkpoint["model"]["query_embed.weight"]
            model_without_ddp.load_state_dict(checkpoint['model'],
                                              strict=False)
        elif args.dataset_file == 'coco':
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint['model'],
                                              strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm, neptune)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir, neptune)
        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                # send validation mAP to neptune
                if "bbox" in coco_evaluator.coco_eval:
                    if args.neptune:
                        neptune.log_metric(
                            'valid/bbox AP@0.5:0.95',
                            coco_evaluator.coco_eval['bbox'].stats[0])
                        neptune.log_metric(
                            'valid/bbox AP@0.5',
                            coco_evaluator.coco_eval['bbox'].stats[1])
                        if args.masks:
                            neptune.log_metric(
                                'valid/segm AP@0.5',
                                coco_evaluator.coco_eval['segm'].stats[1])
                            neptune.log_metric(
                                'valid/segm AP@0.5:0.95',
                                coco_evaluator.coco_eval['segm'].stats[0])
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
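
When resuming a COCO checkpoint on a dataset with a different number of classes, Example 4 deletes the classification head (and query embeddings) before a strict=False load. A standalone sketch of that surgery, with strip_class_head as a hypothetical name:

def strip_class_head(checkpoint,
                     keys=("class_embed.weight", "class_embed.bias",
                           "query_embed.weight")):
    """Remove head weights whose shapes depend on the number of classes,
    so the rest of the checkpoint can be loaded with strict=False."""
    for k in keys:
        checkpoint["model"].pop(k, None)  # tolerate keys that are absent
    return checkpoint

# usage sketch:
# checkpoint = strip_class_head(torch.load(path, map_location='cpu'))
# model_without_ddp.load_state_dict(checkpoint['model'], strict=False)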
Example 5
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    model, criterion, postprocessors = build_model(
        args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # The COCO datasets were replaced by nvdata; the dataset roots and the
    # optional subset-index file now come from command-line arguments.

    dataset_train = build_nvdataset(
        dataset_root=[args.dataset_root_sql, args.dataset_root_img],
        mode='train',
        camera=args.camera)
    dataset_val = build_nvdataset(
        dataset_root=[args.dataset_root_test, args.dataset_root_test],
        mode='test',
        camera=args.camera)
    if args.root_indices is not None:
        # e.g. a .npy file with 50k training indices selected offline
        indices_50k = np.load(args.root_indices)
        dataset_train = Subset(dataset_train, indices_50k)
    print("Train samples: %d" % len(dataset_train))

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # COCO-style evaluation (base_ds via get_coco_api_from_dataset) is not
    # used for nvdata, so no base_ds is built here.

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # Evaluation (evaluate_nvdata) is not wired up for nvdata yet, so the
    # --eval path is skipped.

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 50 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Per-epoch evaluation (evaluate_nvdata) is disabled, so only the
        # train stats are logged.

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # (evaluation logs are skipped while evaluation is disabled)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
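
Example 5 trains on a fixed subset of nvdata by wrapping the dataset in torch.utils.data.Subset with indices loaded from a .npy file. A self-contained illustration of the mechanism, using a synthetic dataset and random indices in place of the real ones:

import torch
from torch.utils.data import Subset, TensorDataset

full = TensorDataset(torch.arange(1000))      # stand-in dataset
indices = torch.randperm(1000)[:50].tolist()  # stand-in for the .npy index file
subset = Subset(full, indices)

print(len(subset))  # 50
print(subset[0])    # the sample stored at full[indices[0]]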
Example 6
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # Fix the seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)

    # Load from pretrained DETR model.
    assert args.num_queries == 100, args.num_queries
    assert args.enc_layers == 6 and args.dec_layers == 6
    assert args.backbone in ['resnet50', 'resnet101', 'swin'], args.backbone
    if args.backbone == 'resnet50':
        pretrain_model = './data/detr_coco/detr-r50-e632da11.pth'
    elif args.backbone == 'resnet101':
        pretrain_model = './data/detr_coco/detr-r101-2c7b67e5.pth'
    else:
        pretrain_model = None
    if pretrain_model is not None:
        pretrain_dict = torch.load(pretrain_model, map_location='cpu')['model']
        my_model_dict = model_without_ddp.state_dict()
        pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in my_model_dict}
        my_model_dict.update(pretrain_dict)
        model_without_ddp.load_state_dict(my_model_dict)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before the LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            # after the LR drop, also checkpoint every 10 epochs
            if (epoch + 1) > args.lr_drop and (epoch + 1) % 10 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
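
The pretrained-DETR block above keeps only those checkpoint keys that also exist in the current model. A minimal standalone sketch of that filtering pattern (the helper name and path argument are illustrative, not from the snippet):

import torch

def load_matching_weights(model, checkpoint_path):
    # Load on CPU so the checkpoint never touches GPU memory.
    state = torch.load(checkpoint_path, map_location='cpu')['model']
    model_dict = model.state_dict()
    # Keep only keys the current model also has; shapes are assumed compatible.
    filtered = {k: v for k, v in state.items() if k in model_dict}
    model_dict.update(filtered)
    model.load_state_dict(model_dict)
    return sorted(filtered)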
Example no. 7
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    output_dir = Path(args.output_dir)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.eval:
        dataset_val = build_dataset(image_set=args.dataset, args=args)

        if args.distributed:
            sampler_val = DistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)

        data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)
    else:
        dataset_train = build_dataset(image_set='train', args=args)
        dataset_val = build_dataset(image_set='val', args=args)

        if args.distributed:
            sampler_train = DistributedSampler(dataset_train)
            sampler_val = DistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = torch.utils.data.RandomSampler(dataset_train)
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)

        batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

        data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers)
        data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)

    base_ds = get_coco_api_from_dataset(dataset_val)

    if args.resume and args.frozen_weights:
        assert False, "args.resume and args.frozen_weights are mutually exclusive"
    elif args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True)
            new_state_dict = {}
            for k in checkpoint['model']:
                if ("class_embed" in k) or ("bbox_embed" in k) or ("query_embed" in k):
                    continue
                if ("input_proj" in k) and args.layer1_num != 3:
                    continue
                new_state_dict[k] = checkpoint['model'][k]
            
            # Compare load model and current model
            current_param = [n for n,p in model_without_ddp.named_parameters()]
            current_buffer = [n for n,p in model_without_ddp.named_buffers()]
            load_param = new_state_dict.keys()
            for p in load_param:
                if p not in current_param and p not in current_buffer:
                    print(p, 'does not appear in the current model.')
            for p in current_param:
                if p not in load_param:
                    print(p, 'is a new parameter.')
            model_without_ddp.load_state_dict(new_state_dict, strict=False)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

            # compatibility shim for the old implementation
            new_state_dict = {}
            for k in checkpoint['model']:
                if "bbox_embed" in k:
                    print("bbox_embed from OLD implementation has been replaced with lines_embed")
                    new_state_dict["lines_embed."+'.'.join(k.split('.')[1:])] = checkpoint['model'][k] 
                else:
                    new_state_dict[k] = checkpoint['model'][k]

            # compare resume model and current model
            current_param = [n for n,p in model_without_ddp.named_parameters()]
            current_buffer = [n for n,p in model_without_ddp.named_buffers()]
            load_param = new_state_dict.keys()
            #for p in load_param:
                #if p not in current_param and p not in current_buffer:
                    #print(p, 'not been loaded to current model. Strict == False?')
            for p in current_param:
                if p not in load_param:
                    print(p, 'is a new parameter, not found in the loaded state dict.')
            
            # load model
            model_without_ddp.load_state_dict(new_state_dict)

            # load optimizer
            if not args.no_opt and not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
                checkpoint['lr_scheduler']['step_size'] = args.lr_drop  # change the lr_drop epoch
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                args.start_epoch = checkpoint['epoch'] + 1
    elif args.frozen_weights:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        new_state_dict = {}
        for k in checkpoint['model']:
            if "bbox_embed" in k:
                new_state_dict["lines_embed."+'.'.join(k.split('.')[1:])] = checkpoint['model'][k] 
            else:
                new_state_dict[k] = checkpoint['model'][k]

        model_without_ddp.letr.load_state_dict(new_state_dict)

        # params
        encoder = {k:v for k,v in new_state_dict.items() if "encoder" in k}
        decoder = {k:v for k,v in new_state_dict.items() if "decoder" in k}
        class_embed = {k:v for k,v in new_state_dict.items() if "class_embed" in k}
        line_embed = {k:v for k,v in new_state_dict.items() if "lines_embed" in k}

        model_without_ddp.load_state_dict(encoder, strict=False)
        model_without_ddp.load_state_dict(decoder, strict=False)
        model_without_ddp.load_state_dict(class_embed, strict=False)
        model_without_ddp.load_state_dict(line_embed, strict=False)
        print('Finish load frozen_weights')
    else:
        print("NO RESUME. TRAIN FROM SCRATCH")
        
    if args.eval:
        test_stats = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, args)
        #print('checkpoint'+ str(checkpoint['epoch']))
        return 

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        
        train_stats = train_one_epoch(model, criterion, postprocessors, data_loader_train, optimizer, device, epoch, args.clip_max_norm, args)
        
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoints/checkpoint.pth']
            # extra checkpoint before the LR drop and every args.save_freq epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir / f'checkpoints/checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, args)

        log_stats = {**{f'train_{k}': format(v, ".6f") for k, v in train_stats.items()},
                     **{f'test_{k}': format(v, ".6f") for k, v in test_stats.items()},
                     'epoch': epoch, 'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
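
Example no. 7 rewrites bbox_embed.* keys from older checkpoints to lines_embed.* before loading. A hedged sketch of that renaming step in isolation, assuming (as the snippet does) that the prefix is the first dotted component of the key:

def rename_head_keys(state_dict, old='bbox_embed', new='lines_embed'):
    renamed = {}
    for k, v in state_dict.items():
        if old in k:
            # 'bbox_embed.layers.0.weight' -> 'lines_embed.layers.0.weight'
            renamed[new + '.' + '.'.join(k.split('.')[1:])] = v
        else:
            renamed[k] = v
    return renamed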
Example no. 8
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    wandb.init(project="qpic-project",
               entity="sangbaeklee",
               group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.dataset_file == 'hico':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],
                "loss": train_stats['loss']
            })
        else:
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
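
Example no. 8 wires Weights & Biases into the loop via wandb.init, wandb.watch, and a per-epoch wandb.log. A compact sketch of the same pattern; the project name and metric keys here are placeholders:

import wandb

run = wandb.init(project="my-project",
                 config={"learning_rate": 1e-4, "epochs": 2, "batch_size": 2})
# wandb.watch(model) would additionally track gradients and parameters.
for epoch in range(run.config["epochs"]):
    # train_stats = train_one_epoch(...)  # training step elided in this sketch
    wandb.log({"epoch": epoch, "loss": 0.0})  # any dict of scalars works
run.finish()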
Example no. 9
def main(args):
    utils.init_distributed_mode(args)

    print("args: {}".format(args))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # special handling to control whether the backbone is frozen
    args.model.train_backbone = args.lr_backbone > 0

    model, criterion, postprocessors = build_model(args.model)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=args.lr_gamma)

    dataset_train = build_dataset(
        image_set='train',
        args=args.dataset,
        model_stride=model_without_ddp.backbone.stride)
    dataset_val = build_dataset(image_set='val',
                                args=args.dataset,
                                model_stride=model_without_ddp.backbone.stride)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    benchmark_test_parser = benchmark_test.get_args_parser()
    benchmark_test_args = benchmark_test_parser.get_defaults()
    benchmark_test_args.tracker.model = args.model  # overwrite the network-model parameters
    benchmark_test_args.result_path = Path(
        os.path.join(args.output_dir, 'benchmark'))
    benchmark_test_args.dataset_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'benchmark')

    benchmark_eval_parser = benchmark_eval.get_args_parser()
    benchmark_eval_args = benchmark_eval_parser.get_defaults()
    benchmark_eval_args.tracker_path = benchmark_test_args.result_path
    best_eao = 0
    best_ar = [0, 10]  # accuracy & robustness

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        # training
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every args.model_save_step epochs
            if (epoch + 1) % args.lr_drop == 0 or (
                    epoch + 1) % args.model_save_step == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

            # hack: save an inference-only model (weights without optimizer state)
            utils.save_on_master({'model': model_without_ddp.state_dict()},
                                 output_dir / 'checkpoint_only_inference.pth')

        # evaluate
        val_stats = evaluate(model, criterion, postprocessors, data_loader_val,
                             device, args.output_dir)

        log_stats = {
            'epoch': epoch,
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'val_{k}': v
               for k, v in val_stats.items()}, 'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

        # evaluate with the benchmark
        if utils.is_main_process():
            if (epoch + 1) % args.benchmark_test_step == 0 and epoch > args.benchmark_start_epoch:

                tracker = build_tracker(benchmark_test_args.tracker,
                                        model=model_without_ddp,
                                        postprocessors=postprocessors)
                benchmark_test_args.model_name = "epoch" + str(epoch)
                benchmark_start_time = time.time()
                benchmark_test.main(benchmark_test_args, tracker)
                benchmark_time = time.time() - benchmark_start_time

                benchmark_eval_args.model_name = "epoch" + str(epoch)
                benchmark_eval_args.tracker_prefix = "epoch" + str(epoch)
                eval_results = benchmark_eval.main(benchmark_eval_args)
                eval_result = list(eval_results.values())[0]

                if benchmark_test_args.dataset in ['VOT2018', 'VOT2019']:
                    if args.output_dir:
                        with (output_dir /
                              str("benchmark_" + benchmark_test_args.dataset +
                                  ".txt")).open("a") as f:
                            f.write("epoch: " + str(epoch) + ", best EAO: " +
                                    str(best_eao) + ", " +
                                    json.dumps(eval_result) + "\n")

                    if best_eao < eval_result['EAO']:

                        best_eao = eval_result['EAO']

                        if args.output_dir:
                            best_eao_int = int(best_eao * 1000)

                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_eao_{best_eao_int:03}_only_inference.pth'
                            )

                    if (best_ar[0] < eval_result['accuracy']
                            and best_ar[1] > eval_result['robustness']):

                        best_ar[0] = eval_result['accuracy']
                        best_ar[1] = eval_result['robustness']

                        if args.output_dir:
                            best_accuracy_int = int(best_ar[0] * 1000)
                            best_robustness_int = int(best_ar[1] * 1000)

                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_ar_{best_accuracy_int:03}_{best_robustness_int:03}_only_inference.pth'
                            )

                print("benchmark time: {}".format(benchmark_time))

        if args.distributed:
            torch.distributed.barrier()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
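
The benchmark branch above writes an extra weights-only checkpoint whenever the tracked metric improves. The core of that bookkeeping as a small helper (the metric tag and filename pattern are illustrative):

import torch
from pathlib import Path

def save_if_best(model, score, best_score, output_dir, epoch, tag='eao'):
    # Keep weights only, matching the 'only_inference' checkpoints above.
    if score > best_score:
        torch.save({'model': model.state_dict()},
                   Path(output_dir) / f'checkpoint{epoch:04}_best_{tag}_only_inference.pth')
        return score
    return best_score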
Example no. 10
    train_stats = train_one_epoch(
        model, criterion, data_loader_train, optimizer, device, epoch,
        args.clip_max_norm)
    lr_scheduler.step()
    if args.output_dir:
        checkpoint_paths = [output_dir / 'checkpoint.pth']
        # extra checkpoint before LR drop and every 100 epochs
        if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
            checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
        for checkpoint_path in checkpoint_paths:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args,
            }, checkpoint_path)

    # test_stats, coco_evaluator = evaluate(
    #     model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
    # )
    coco_evaluator = None

    log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                 # **{f'test_{k}': v for k, v in test_stats.items()},
                 'epoch': epoch,
                 'n_parameters': n_parameters}

    if args.output_dir and utils.is_main_process():
        with (output_dir / "log.txt").open("a") as f:
            f.write(json.dumps(log_stats) + "\n")
Example no. 11
def main(args):
    utils.init_distributed_mode(args)
    print('git:\n  {}\n'.format(utils.get_sha()))

    print(args)

    device = torch.device(args.device)
    print(device)
    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    if args.stage == 1:
        for name, value in model_without_ddp.named_parameters():
            if 'iou' in name:
                value.requires_grad = False
        learned_params = filter(lambda p: p.requires_grad,
                                model_without_ddp.parameters())
    elif args.stage == 2:
        for name, value in model_without_ddp.named_parameters():
            if 'class_embed' not in name:
                value.requires_grad = False
        head_params = filter(lambda p: p.requires_grad,
                             model_without_ddp.parameters())
        learned_params = list(head_params)
    else:
        for name, value in model_without_ddp.named_parameters():
            if 'iou' not in name:
                value.requires_grad = False
        head_params = filter(lambda p: p.requires_grad,
                             model_without_ddp.parameters())
        learned_params = list(head_params)

    optimizer = torch.optim.AdamW(learned_params,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=thumos.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=thumos.collate_fn,
                                 num_workers=args.num_workers)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.rtd.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch']
            pretrained_dict = checkpoint['model']
            # resume only the subset of parameters present in the current model
            model_dict = model_without_ddp.state_dict()
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items() if k in model_dict
            }
            model_dict.update(pretrained_dict)
            model_without_ddp.load_state_dict(model_dict)
            # main_model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded '{}' (epoch {})".format(args.resume,
                                                     checkpoint['epoch']))

    if args.load:
        checkpoint = torch.load(args.load, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        evaluator, eval_loss_dict = evaluate(model, criterion, postprocessors,
                                             data_loader_val, device, args)
        res = evaluator.summarize()

        test_stats, results_pd = eval_props(res)
        print('test_stats', test_stats)

        if args.output_dir:
            results_pd.to_csv(output_dir / 'results_eval.csv')
        return

    print('Start training')
    start_time = time.time()

    fig1 = plt.figure('train', figsize=(18.5, 10.5))
    ax1_train = fig1.add_subplot(231)
    ax2_train = fig1.add_subplot(232)
    ax3_train = fig1.add_subplot(233)
    ax4_train = fig1.add_subplot(234)
    ax5_train = fig1.add_subplot(235)
    ax6_train = fig1.add_subplot(236)

    axs_train = {
        'loss_ce': ax1_train,
        'loss_bbox': ax2_train,
        'loss_giou': ax3_train,
        'cardinality_error': ax4_train,
        'class_error': ax5_train,
        'loss_iou': ax6_train
    }

    fig2 = plt.figure('eval', figsize=(18.5, 10.5))
    ax1_eval = fig2.add_subplot(231)
    ax2_eval = fig2.add_subplot(232)
    ax3_eval = fig2.add_subplot(233)
    ax4_eval = fig2.add_subplot(234)
    ax5_eval = fig2.add_subplot(235)
    ax6_eval = fig2.add_subplot(236)
    axs_eval = {
        'loss_ce': ax1_eval,
        'loss_bbox': ax2_eval,
        'loss_giou': ax3_eval,
        'cardinality_error': ax4_eval,
        'class_error': ax5_eval,
        'loss_iou': ax6_eval
    }

    colordict = {
        '50': 'g',
        '100': 'b',
        '200': 'purple',
        '500': 'orange',
        '1000': 'brown'
    }

    fig3 = plt.figure('test_AR')
    axs_test = fig3.add_subplot(111)

    epoch_list = []
    train_loss_list = {}
    eval_loss_list = {}
    test_stats_list = {}
    best_ar50 = 0
    best_sum_ar = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        train_stats, train_loss_dict = train_one_epoch(model, criterion,
                                                       data_loader_train,
                                                       optimizer, device,
                                                       epoch, args)

        for key, value in train_loss_dict.items():
            if key in [
                    'loss_ce', 'loss_bbox', 'loss_giou', 'cardinality_error',
                    'class_error', 'loss_iou'
            ]:
                try:
                    train_loss_list[key].append(value.mean())
                except KeyError:
                    train_loss_list[key] = [value.mean()]

        lr_scheduler.step()
        if epoch % 50 == 0 and args.output_dir:
            checkpoint_path = output_dir / 'checkpoint_epoch{}.pth'.format(
                epoch)
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        evaluator, eval_loss_dict = evaluate(model, criterion, postprocessors,
                                             data_loader_val, device, args)
        res = evaluator.summarize()

        test_stats, results_pd = eval_props(res)
        for k, v in test_stats.items():
            try:
                test_stats_list[k].append(float(v) * 100)
            except KeyError:
                test_stats_list[k] = [float(v) * 100]

        for key, value in eval_loss_dict.items():
            if key in [
                    'loss_ce', 'loss_bbox', 'loss_giou', 'cardinality_error',
                    'class_error', 'loss_iou'
            ]:
                try:
                    eval_loss_list[key].append(value.mean())
                except KeyError:
                    eval_loss_list[key] = [value.mean()]

        print('test_stats', test_stats)

        # debug
        # if args.output_dir:
        #     results_pd.to_csv(args.output_dir+'results_epoch_{}.csv'.format(epoch))

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_AR@{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if (float(test_stats['50']) > best_ar50):
            best_ar50 = float(test_stats['50'])
            with (output_dir / 'log_best_ar50.txt').open('w') as f:
                f.write(json.dumps(log_stats) + '\n')
            checkpoint_path = output_dir / 'checkpoint_best_ar50.pth'
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        current_sum_ar = float(test_stats['50']) + float(
            test_stats['100']) + float(test_stats['200'])
        if (current_sum_ar > best_sum_ar):
            best_sum_ar = current_sum_ar
            with (output_dir / 'log_best_sum_ar.txt').open('w') as f:
                f.write(json.dumps(log_stats) + '\n')
            checkpoint_path = output_dir / 'checkpoint_best_sum_ar.pth'
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        if args.output_dir and utils.is_main_process():
            with (output_dir / 'log.txt').open('a') as f:
                f.write(json.dumps(log_stats) + '\n')
        epoch_list.append(epoch)
        if epoch % 2 == 0:
            # split, loss_dict, axs, epoch, color_dict

            draw_stats(axs_test, test_stats_list, epoch_list, colordict)
            axs_test.legend()
            draw('train', train_loss_list, axs_train, epoch, 'b')
            draw('eval', eval_loss_list, axs_eval, epoch, 'g')
            fig1.savefig('train_loss_curve.jpg', dpi=300)
            fig2.savefig('eval_loss_curve.jpg', dpi=300)
            fig3.savefig('test_ar.jpg')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
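
Example no. 11 grows its per-loss history with a try/except KeyError pattern; collections.defaultdict(list) expresses the same accumulation more directly. A small sketch:

from collections import defaultdict

TRACKED = {'loss_ce', 'loss_bbox', 'loss_giou',
           'cardinality_error', 'class_error', 'loss_iou'}
loss_history = defaultdict(list)

def accumulate(loss_dict):
    # Append the mean of each tracked loss tensor for this epoch.
    for key, value in loss_dict.items():
        if key in TRACKED:
            loss_history[key].append(value.mean())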
Example no. 12
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    device = torch.device(args.device)

    model = fasterrcnn_resnet_fpn(num_classes=2, pretrained=False)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    params = [p for p in model_without_ddp.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(params,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        # sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, collate_fn=utils.collate_fn,
    #                              drop_last=False, num_workers=args.num_workers)
    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     test_stats = evaluate(model, data_loader_val, device=device)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, optimizer, data_loader_train,
                                      device, epoch)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before the LR drop and every epoch
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        # test_stats = evaluate(model, data_loader_val, device)
        # log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
        #              **{f'test_{k}': v for k, v in test_stats.items()},
        #              'epoch': epoch,
        #              'n_parameters': n_parameters}

        # if args.output_dir and utils.is_main_process():
        #     with (output_dir / "log.txt").open("a") as f:
        #         f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
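
Example no. 12 repeats the resume block that almost every snippet here shares. A hedged standalone version of that logic, assuming the checkpoint layout used throughout these examples:

import torch

def resume_from(path, model, optimizer=None, lr_scheduler=None):
    # Checkpoints may be local files or https URLs, as in the snippets above.
    if path.startswith('https'):
        checkpoint = torch.hub.load_state_dict_from_url(path, map_location='cpu',
                                                        check_hash=True)
    else:
        checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    start_epoch = 0
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if lr_scheduler is not None and 'lr_scheduler' in checkpoint:
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    if 'epoch' in checkpoint:
        start_epoch = checkpoint['epoch'] + 1
    return start_epoch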
Example no. 13
def main(args):
    utils.init_distributed_mode(
        args
    )  # distributed-training setup: environment variables decide whether distributed mode is used and, if so, the related parameters are set; see the source in util/misc.py, not analyzed here.
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:  # whether to freeze the weights, as in transfer-learning fine-tuning; if set, the masks flag must be set too, since frozen training applies to segmentation only.
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    # get_rank() is the index of the distributed node
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # build the model, loss criterion, and post-processors from the args
    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # ddp stands for DistributedDataParallel
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    # count and print the number of trainable parameters
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # set up the optimizer and LR schedule, and build the training and validation sets.
    # separate the backbone parameters from the rest so they can train with a different initial learning rate
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    # build_dataset() wraps the COCO dataset API; it lives in datasets/__init__.py
    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    # with the datasets built, attach the samplers and wrap everything in DataLoaders for batched training; note the collate_fn used to re-assemble each batch
    # collate_fn is in util/misc.py
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        # as in transfer-learning fine-tuning: freeze the weights and train only the segmentation head
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    # resume from an earlier stage of training: restore the model weights, optimizer, learning rate, epoch, and so on.
    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    # this flag means evaluation only, no training
    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    # now the epoch-by-epoch training really begins; after each epoch the learning rate is adjusted by the LR schedule.
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        # run one epoch of training and collect its results
        # the corresponding code is train_one_epoch() in detr/engine.py, which, as the name suggests, covers everything the model does within one training epoch.
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        # record the training results and related parameters to the given files
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            # save the training state, learning rate, and related parameters
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        # after each training epoch, evaluate on the validation set
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        # on the (distributed) main node, record the training and validation results to the given file
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        # record the evaluation results
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)
    # finally, compute and print the total training time; the whole training flow ends here
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
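
As the comments in Example no. 13 explain, the backbone parameters get their own, smaller learning rate. A minimal runnable sketch of that two-group AdamW setup on a toy model (all sizes and rates are illustrative):

import torch

model = torch.nn.ModuleDict({'backbone': torch.nn.Linear(4, 4),
                             'head': torch.nn.Linear(4, 2)})
param_dicts = [
    {"params": [p for n, p in model.named_parameters() if "backbone" not in n]},
    # the backbone group overrides the default lr given to the optimizer
    {"params": [p for n, p in model.named_parameters() if "backbone" in n],
     "lr": 1e-5},
]
optimizer = torch.optim.AdamW(param_dicts, lr=1e-4, weight_decay=1e-4)
# StepLR multiplies every group's lr by gamma (default 0.1) every step_size epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200)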
Example no. 14
File: main.py Project: wx-b/VisTR
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # no validation ground truth for ytvos dataset
    dataset_train = build_dataset(image_set='train', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    output_dir = Path(args.output_dir)

    # load COCO-pretrained weights
    checkpoint = torch.load(args.pretrained_weights,
                            map_location='cpu')['model']
    del checkpoint["vistr.class_embed.weight"]
    del checkpoint["vistr.class_embed.bias"]
    del checkpoint["vistr.query_embed.weight"]
    # load into the unwrapped model so this also works without DistributedDataParallel
    model_without_ddp.load_state_dict(checkpoint, strict=False)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and after every epoch
            # ((epoch + 1) % 1 == 0 is always true, so one is saved per epoch)
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
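Deleting the class and query embedding keys by name, as this example does before its strict=False load, works when the incompatible keys are known in advance. A more generic sketch of the same idea, an assumption rather than the original code, drops any checkpoint tensor whose shape disagrees with the target model:

import torch

def load_compatible_weights(model, checkpoint_path):
    # keep only tensors whose names and shapes match the target model
    state = torch.load(checkpoint_path, map_location='cpu')['model']
    own = model.state_dict()
    filtered = {k: v for k, v in state.items()
                if k in own and v.shape == own[k].shape}
    skipped = sorted(set(state) - set(filtered))
    if skipped:
        print('skipping incompatible keys:', skipped)
    model.load_state_dict(filtered, strict=False)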
Esempio n. 15
0
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    image_set = 'fewshot' if args.fewshot_finetune else 'train'
    dataset_train = build_dataset(image_set=image_set, args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_support = build_support_dataset(image_set=image_set, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
            sampler_support = samplers.NodeDistributedSampler(dataset_support)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
            sampler_support = samplers.DistributedSampler(dataset_support)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_support = torch.utils.data.RandomSampler(dataset_support)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=False)

    loader_train = DataLoader(dataset_train,
                              batch_sampler=batch_sampler_train,
                              collate_fn=utils.collate_fn,
                              num_workers=args.num_workers,
                              pin_memory=True)

    loader_val = DataLoader(dataset_val,
                            batch_size=args.batch_size,
                            sampler=sampler_val,
                            drop_last=False,
                            collate_fn=utils.collate_fn,
                            num_workers=args.num_workers,
                            pin_memory=True)

    loader_support = DataLoader(dataset_support,
                                batch_size=1,
                                sampler=sampler_support,
                                drop_last=False,
                                num_workers=args.num_workers,
                                pin_memory=False)

    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    if not args.fewshot_finetune:
        param_dicts = [{
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if not match_name_keywords(n, args.lr_backbone_names)
                and not match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr":
            args.lr,
            "initial_lr":
            args.lr,
        }, {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_backbone_names)
                and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
            "initial_lr":
            args.lr_backbone,
        }, {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr":
            args.lr * args.lr_linear_proj_mult,
            "initial_lr":
            args.lr * args.lr_linear_proj_mult,
        }]
    else:
        # For the few-shot finetune stage, do not train sampling offsets, reference points, or embedding-related parameters
        param_dicts = [
            {
                "params":
                    [p for n, p in model_without_ddp.named_parameters()
                     if not match_name_keywords(n, args.lr_backbone_names) and \
                        not match_name_keywords(n, args.lr_linear_proj_names) and \
                        not match_name_keywords(n, args.embedding_related_names) and p.requires_grad],
                "lr": args.lr,
                "initial_lr": args.lr,
            },
            {
                "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
                "lr": args.lr_backbone,
                "initial_lr": args.lr_backbone,
            },
        ]

    optimizer = torch.optim.AdamW(param_dicts, weight_decay=args.weight_decay)
    lr_scheduler = WarmupMultiStepLR(optimizer,
                                     args.lr_drop_milestones,
                                     gamma=0.1,
                                     warmup_epochs=args.warmup_epochs,
                                     warmup_factor=args.warmup_factor,
                                     warmup_method='linear',
                                     last_epoch=args.start_epoch - 1)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.dataset.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

        if args.fewshot_finetune:
            if args.category_codes_cls_loss:
                # Re-init weights of novel categories for few-shot finetune
                novel_class_ids = datasets.get_class_ids(args.dataset_file,
                                                         type='novel')
                if args.num_feature_levels == 1:
                    for novel_class_id in novel_class_ids:
                        nn.init.normal_(model_without_ddp.category_codes_cls.L.
                                        weight[novel_class_id])
                elif args.num_feature_levels > 1:
                    for classifier in model_without_ddp.category_codes_cls:
                        for novel_class_id in novel_class_ids:
                            nn.init.normal_(
                                classifier.L.weight[novel_class_id])
                else:
                    raise RuntimeError('num_feature_levels must be >= 1')

    if args.eval:
        # Evaluate only base categories
        test_stats, coco_evaluator = evaluate(args,
                                              model,
                                              criterion,
                                              postprocessors,
                                              loader_val,
                                              loader_support,
                                              base_ds,
                                              device,
                                              type='base')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_base.pth")

        # Evaluate only novel categories
        test_stats, coco_evaluator = evaluate(args,
                                              model,
                                              criterion,
                                              postprocessors,
                                              loader_val,
                                              loader_support,
                                              base_ds,
                                              device,
                                              type='novel')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_novel.pth")

        return

    print("Start training...")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion, loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        # Saving Checkpoints after each epoch
        if args.output_dir and (not args.fewshot_finetune):
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Saving Checkpoints every args.save_every_epoch epoch(s)
        if args.output_dir:
            checkpoint_paths = []
            if (epoch + 1) % args.save_every_epoch == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        # Evaluation and Logging
        if (epoch + 1) % args.eval_every_epoch == 0:
            if 'base' in args.dataset_file:
                evaltype = 'base'
            else:
                evaltype = 'all'
            if args.fewshot_finetune:
                evaltype = 'novel'

            test_stats, coco_evaluator = evaluate(args,
                                                  model,
                                                  criterion,
                                                  postprocessors,
                                                  loader_val,
                                                  loader_support,
                                                  base_ds,
                                                  device,
                                                  type=evaltype)

            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()},
                **{f'test_{k}': v
                   for k, v in test_stats.items()}, 'epoch': epoch,
                'n_parameters': n_parameters,
                'evaltype': evaltype
            }

            if args.output_dir and utils.is_main_process():
                with (output_dir / "results.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")
                # for evaluation logs
                if coco_evaluator is not None:
                    (output_dir / 'eval').mkdir(exist_ok=True)
                    if "bbox" in coco_evaluator.coco_eval:
                        filenames = ['latest.pth']
                        filenames.append(f'{epoch:03}.pth')
                        for name in filenames:
                            torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                       output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
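WarmupMultiStepLR above is a project-specific scheduler, not part of torch.optim. Assuming it performs linear warmup to the base LR followed by a gamma decay at each milestone, the same schedule can be sketched with the stock LambdaLR:

import bisect
import torch

def make_warmup_multistep(optimizer, milestones, gamma=0.1,
                          warmup_epochs=1, warmup_factor=0.01):
    milestones = sorted(milestones)

    def factor(epoch):
        if epoch < warmup_epochs:
            alpha = epoch / warmup_epochs
            return warmup_factor * (1 - alpha) + alpha  # linear ramp to 1.0
        return gamma ** bisect.bisect_right(milestones, epoch)  # step decay

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=factor)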
Esempio n. 16
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
    if args.det_val:
        assert args.eval, 'detector-only validation (det_val) is only supported in eval mode'
        model, criterion, postprocessors = build_model(args)
    elif args.eval:
        model, criterion, postprocessors = build_tracktest_model(args)
    else:
        model, criterion, postprocessors = build_tracktrain_model(args)

    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set=args.track_train_split, args=args)
    dataset_val = build_dataset(image_set=args.track_eval_split, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.'
                )
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        # if not args.eval:
        #     test_stats, coco_evaluator, _ = evaluate(
        #         model, criterion, postprocessors, data_loader_val, base_ds,
        #         device, args.output_dir)

    if args.eval:
        assert args.batch_size == 1, "evaluation currently only supports batch_size == 1"
        tracker = Tracker(score_thresh=args.track_thresh)
        test_stats, coco_evaluator, res_tracks = evaluate(model,
                                                          criterion,
                                                          postprocessors,
                                                          data_loader_val,
                                                          base_ds,
                                                          device,
                                                          args.output_dir,
                                                          tracker=tracker,
                                                          phase='eval',
                                                          det_val=args.det_val,
                                                          fp16=args.fp16)
        if args.output_dir:
            # utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
            #                      output_dir / "eval.pth")
            if res_tracks is not None:
                print("Creating video index for {}.".format(args.dataset_file))
                video_to_images = defaultdict(list)
                video_names = defaultdict()
                for _, info in dataset_val.coco.imgs.items():
                    video_to_images[info["video_id"]].append({
                        "image_id":
                        info["id"],
                        "frame_id":
                        info["frame_id"]
                    })
                    video_name = info["file_name"].split("/")[0]
                    if video_name not in video_names:
                        video_names[info["video_id"]] = video_name
                assert len(video_to_images) == len(video_names)
                # save mot results.
                save_track(res_tracks, args.output_dir, video_to_images,
                           video_names, args.track_eval_split)

        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model,
                                      criterion,
                                      data_loader_train,
                                      optimizer,
                                      device,
                                      scaler,
                                      epoch,
                                      args.clip_max_norm,
                                      fp16=args.fp16)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if epoch % 10 == 0 or epoch > args.epochs - 5:
            test_stats, coco_evaluator, _ = evaluate(model,
                                                     criterion,
                                                     postprocessors,
                                                     data_loader_val,
                                                     base_ds,
                                                     device,
                                                     args.output_dir,
                                                     fp16=args.fp16)
            log_test_stats = {
                **{f'test_{k}': v
                   for k, v in test_stats.items()}
            }
            log_stats.update(log_test_stats)

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            # if coco_evaluator is not None:
            #     (output_dir / 'eval').mkdir(exist_ok=True)
            #     if "bbox" in coco_evaluator.coco_eval:
            #         filenames = ['latest.pth']
            #         if epoch % 50 == 0:
            #             filenames.append(f'{epoch:03}.pth')
            #         for name in filenames:
            #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
            #                        output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
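The scaler and fp16 flags threaded through train_one_epoch and evaluate in this example indicate automatic mixed precision. A minimal sketch of one AMP optimizer step with gradient clipping, following the standard torch.cuda.amp recipe (the model and criterion calls are placeholders):

import torch

def amp_step(model, criterion, samples, targets, optimizer, scaler,
             max_norm=0.1, fp16=True):
    with torch.cuda.amp.autocast(enabled=fp16):
        outputs = model(samples)             # placeholder forward pass
        loss = criterion(outputs, targets)   # placeholder loss
    optimizer.zero_grad()
    scaler.scale(loss).backward()            # backward on the scaled loss
    if max_norm > 0:
        scaler.unscale_(optimizer)           # unscale before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)                   # skips the step on inf/nan grads
    scaler.update()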
Esempio n. 17
0
def main(args, exp_cfg):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    # device = torch.device('cuda')
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # model, criterion, postprocessors = build_model(args)
    model = SMPLXNet(exp_cfg)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    # for n, p in model_without_ddp.named_parameters():
    #     print(n)
    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    print('start build dataset')
    datasets = make_all_datasets(exp_cfg, split='train')
    # dataset_train = ConcatDataset(datasets['body'])
    dataset_train = ConcatDataset(datasets['body'] + datasets['hand'] +
                                  datasets['head'])

    print('finish build dataset')

    sample_weight = [
        child_dataset.sample_weight for child_dataset in dataset_train.datasets
    ]
    sample_weight = np.concatenate(sample_weight, axis=0)
    sampler_train = torch.utils.data.sampler.WeightedRandomSampler(
        sample_weight, len(dataset_train))
    # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    if args.distributed:
        sampler_train = samplers.DistributedSampler(sampler_train)
        # sampler_val = samplers.DistributedSampler(sampler_val, shuffle=False)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    collate_fn = functools.partial(collate_batch,
                                   use_shared_memory=args.num_workers > 0,
                                   return_full_imgs=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)

    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
    #                              pin_memory=True)

    optim_cfg = exp_cfg.get('optim', {})
    optimizer = build_optimizer(model, optim_cfg)
    lr_scheduler = build_scheduler(optimizer, optim_cfg['scheduler'])

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.pretrain:
        checkpoint = torch.load(args.pretrain, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    output_dir = Path(args.output_dir)
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, data_loader_train, optimizer,
                                      device, epoch)
        lr_scheduler.step()

        if args.output_dir:
            if not os.path.exists(args.output_dir) and utils.is_main_process():
                os.makedirs(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint every args.save_freq epochs
            if (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
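The per-dataset sample_weight arrays in this example are concatenated so a single WeightedRandomSampler draws across every child of the ConcatDataset. A self-contained sketch of that pattern with toy datasets and made-up weights:

import numpy as np
import torch
from torch.utils.data import ConcatDataset, TensorDataset, WeightedRandomSampler

# two toy datasets; the second is rarer, so it gets a higher per-sample weight
ds_a = TensorDataset(torch.zeros(100, 3))
ds_b = TensorDataset(torch.ones(20, 3))
dataset = ConcatDataset([ds_a, ds_b])

# one weight per sample, in concatenation order (5x is an arbitrary choice)
weights = np.concatenate([np.full(len(ds_a), 1.0), np.full(len(ds_b), 5.0)])
sampler = WeightedRandomSampler(weights, num_samples=len(dataset),
                                replacement=True)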
Esempio n. 18
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    Dataset = get_dataset(args.dataset, args.task)
    with open(args.data_cfg) as f:
        data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]

    transforms = T.Compose([
        T.RandomHorizontalFlip(),
        T.RandomSelect(
            T.RandomResize(scales, max_size=1333),
            T.Compose([
                T.RandomResize([400, 500, 600]),
                T.RandomSizeCrop(384, 600),
                # T.RandomSizeCrop_MOT(384, 600),
                T.RandomResize(scales, max_size=1333),
            ])),
        normalize,
    ])
    dataset_train = Dataset(args,
                            dataset_root,
                            trainset_paths, (1088, 608),
                            augment=True,
                            transforms=transforms)
    args.nID = dataset_train.nID

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            # sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            # sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)

    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
    #                              pin_memory=True)

    # data_loader_train = torch.utils.data.DataLoader(
    #     dataset_train,
    #     batch_size=args.batch_size,
    #     shuffle=True,
    #     num_workers=args.num_workers,
    #     pin_memory=True,
    #     drop_last=True
    # )

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # used to freeze the classifier so its parameters are not updated
    # for name,p in model_without_ddp.named_parameters():
    #     if name.startswith('classifier'):
    #         p.requires_grad = False

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    # optimizer.add_param_group({'params': criterion.parameters()})

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # current model parameters; the classification heads are dropped from
        # the checkpoint because their shapes depend on the number of classes
        model_dict = model_without_ddp.state_dict()
        pretrained_dict = {
            k: v
            for k, v in checkpoint['model'].items() if k not in [
                "class_embed.0.weight", "class_embed.0.bias",
                "class_embed.1.weight", "class_embed.1.bias",
                "class_embed.2.weight", "class_embed.2.bias",
                "class_embed.3.weight", "class_embed.3.bias",
                "class_embed.4.weight", "class_embed.4.bias",
                "class_embed.5.weight", "class_embed.5.bias"
            ]
        }
        model_dict.update(pretrained_dict)

        # missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            model_dict, strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            args.start_epoch = checkpoint['epoch'] + 1
            # optimizer.load_state_dict(checkpoint['optimizer'])
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     # optimizer.load_state_dict(checkpoint['optimizer'])
        # for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #     pg['lr'] = pg_old['lr']
        #     pg['initial_lr'] = pg_old['initial_lr']
        # # print(optimizer.param_groups)
        # lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        # # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        # args.override_resumed_lr_drop = True
        # if args.override_resumed_lr_drop:
        #     print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #     lr_scheduler.step_size = args.lr_drop
        #     lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        # lr_scheduler.step(lr_scheduler.last_epoch)

    # model.add_module('id')

    # [p for p in model.named_parameters() if not p[1].requires_grad]
    # used to freeze the classifier so its parameters are not updated
    # optimizer = torch.optim.SGD(filter(lambda x: "classifier" not in x[0], model.parameters()), lr=args.lr,
    #                 momentum=0.9, weight_decay=1e-4)
    # model.classifier.training = False
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion,
                                      data_loader_train, optimizer, device,
                                      epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
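Several commented-out fragments in this example circle around freezing the classifier so it receives no updates. A minimal sketch of that idea, disabling gradients by name prefix and optimizing only what remains ('classifier' is taken from those comments; the learning rate is an arbitrary example value):

import torch

def freeze_and_build_optimizer(model, prefixes=('classifier',), lr=2e-4):
    # turn off gradients for every parameter whose name matches a prefix
    for name, param in model.named_parameters():
        if name.startswith(tuple(prefixes)):
            param.requires_grad = False
    # the optimizer then only sees the parameters left trainable
    return torch.optim.AdamW(
        (p for p in model.parameters() if p.requires_grad), lr=lr)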
Esempio n. 19
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    # align with DETR format
    args.dataset_file = 'ImageNet'
    args.masks = None
    # freeze cnn weights
    args.lr_backbone = 0 if args.fre_cnn else args.lr
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.updetr_collate_fn,
                                   num_workers=args.num_workers)

    # total number of training iterations across all epochs
    print(len(data_loader_train) * args.epochs)

    output_dir = Path(args.output_dir)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            if lr_scheduler.step_size != args.lr_drop:
                lr_scheduler.step_size = args.lr_drop
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 20 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 20 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
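Every distributed loop in these examples calls sampler_train.set_epoch(epoch) before iterating. Without that call, DistributedSampler reuses the same shuffling seed every epoch, so each rank sees an identical sample order for the whole run. A sketch of the pattern in isolation (assumes torch.distributed is already initialized):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

def epoch_loop(dataset, epochs, batch_size=2):
    sampler = DistributedSampler(dataset)  # shards indices across ranks
    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    for epoch in range(epochs):
        sampler.set_epoch(epoch)  # re-seeds the shuffle with the epoch index
        for batch in loader:
            pass  # training step would go here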
Esempio n. 20
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='test', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
            sampler_test = samplers.NodeDistributedSampler(dataset_test,
                                                           shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
            sampler_test = samplers.DistributedSampler(dataset_test,
                                                       shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)
    data_loader_test = DataLoader(dataset_test,
                                  args.batch_size,
                                  sampler=sampler_test,
                                  drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters() if
            match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr":
        args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr":
        args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts,
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        del checkpoint["model"]["transformer.decoder.class_embed.0.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.0.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.1.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.1.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.2.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.2.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.3.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.3.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.4.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.4.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.5.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.5.bias"]
        del checkpoint["model"]["transformer.decoder.class_embed.6.weight"]
        del checkpoint["model"]["transformer.decoder.class_embed.6.bias"]
        del checkpoint["model"]["class_embed.0.weight"]
        del checkpoint["model"]["class_embed.0.bias"]
        del checkpoint["model"]["class_embed.1.weight"]
        del checkpoint["model"]["class_embed.1.bias"]
        del checkpoint["model"]["class_embed.2.weight"]
        del checkpoint["model"]["class_embed.2.bias"]
        del checkpoint["model"]["class_embed.3.weight"]
        del checkpoint["model"]["class_embed.3.bias"]
        del checkpoint["model"]["class_embed.4.weight"]
        del checkpoint["model"]["class_embed.4.bias"]
        del checkpoint["model"]["class_embed.5.weight"]
        del checkpoint["model"]["class_embed.5.bias"]
        del checkpoint["model"]["class_embed.6.weight"]
        del checkpoint["model"]["class_embed.6.bias"]
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
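        # drop profiler bookkeeping buffers (e.g. the total_ops/total_params
        # counters added by FLOP profilers such as thop) from the report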
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        # if len(missing_keys) > 0:
        #     print('Missing Keys: {}'.format(missing_keys))
        # if len(unexpected_keys) > 0:
        #     print('Unexpected Keys: {}'.format(unexpected_keys))
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     optimizer.load_state_dict(checkpoint['optimizer'])
        #     for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #         pg['lr'] = pg_old['lr']
        #         pg['initial_lr'] = pg_old['initial_lr']
        #     #print(optimizer.param_groups)
        #     lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        #     # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        #     args.override_resumed_lr_drop = True
        #     if args.override_resumed_lr_drop:
        #         print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #         lr_scheduler.step_size = args.lr_drop
        #         lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        #     lr_scheduler.step(lr_scheduler.last_epoch)
        #     args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return
    if args.test:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example no. 21
File: train.py Project: zepingz/EBM
def main(args):
    print(args)

    device = args.device
    if args.output_dir:
        save_dir = os.path.join(args.output_dir, f"ebm_{args.counter}")
        os.makedirs(save_dir, exist_ok=True)
    else:
        save_dir = None

    # Build dataloader
    train_dataset, val_dataset, train_linpred_dataset, val_linpred_dataset = build_dataset(
        args)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        # drop_last=True,
        pin_memory=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        # drop_last=True,
        pin_memory=True,
    )
    args.ptp_size = train_dataset._ptp_size

    train_linpred_dataloader = DataLoader(
        train_linpred_dataset,
        batch_size=args.linpred_batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        # drop_last=True,
        pin_memory=True,
    )
    val_linpred_dataloader = DataLoader(
        val_linpred_dataset,
        batch_size=args.linpred_batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        # drop_last=True,
        pin_memory=True,
    )

    # Fix the seed for reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
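    # NOTE: the seeds are set only after the datasets and loaders are built,
    # so any randomness inside build_dataset is not covered.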

    # Initialize model
    model = build_model(args)
    model.to(device)

    model_without_ddp = model
    # if args.distributed:
    #     model = nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    #     model_without_ddp = model.module

    encoder_n_parameters = sum(
        p.numel() for p in model_without_ddp.frame_encoder.parameters())
    decoder_n_parameters = sum(
        p.numel() for p in model_without_ddp.frame_decoder.parameters())
    predictor_n_parameters = sum(
        p.numel() for p in model_without_ddp.hidden_predictor.parameters())
    print((f"Number of params\n"
           f"encoder: {encoder_n_parameters}\n"
           f"decoder: {decoder_n_parameters}\n"
           f"predictor: {predictor_n_parameters}"))

    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    # Setup comet ml
    api_key = os.environ.get("COMET_API_KEY")
    project_name = os.environ.get("COMET_PROJECT_NAME")
    workspace = os.environ.get("COMET_WORKSPACE")
    do_log = (api_key is not None and project_name is not None
              and workspace is not None)
    if do_log:
        experiment = Experiment(
            api_key=api_key,
            project_name=project_name,
            workspace=workspace,
        )
        experiment.set_name(f"ebm_{args.counter}")
    else:
        experiment = None

    print("Start training")
    for epoch in range(args.epochs):
        # Train
        train_stats = train_one_epoch(
            model,
            train_dataloader,
            optimizer,
            1 if args.no_latent else args.batch_repeat_step,
            device,
            epoch,
            args.clip_max_norm,
            experiment,
        )
        # lr_scheduler.step()

        # Save model
        if save_dir:
            checkpoint_path = os.path.join(save_dir, f"{epoch}epoch.pth")
            utils.save_on_master(
                {
                    "model": model_without_ddp.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    # "lr_scheduler": lr_scheduler.state_dict(),
                    "epoch": epoch,
                    "args": args,
                },
                checkpoint_path)

        # Val
        val_stats = evaluate(model, val_dataloader, device, epoch, experiment)

        # Encoder val
        if not args.no_linpred_eval and (
                epoch % args.linpred_interval == 0
                # range(args.epochs) never yields args.epochs itself
                or epoch == args.epochs - 1):
            linear_predictor = LinearPredictor(
                args.embedding_size,
                10 if args.dataset == "moving_mnist" else 8,  # hard-coded class count per dataset
                copy.deepcopy(model.frame_encoder),
            ).to(device)
            linear_optimizer = optim.Adam(linear_predictor.parameters(),
                                          lr=args.linpred_lr)
            linpred_acc = encoder_evaluate(
                linear_predictor,
                linear_optimizer,
                train_linpred_dataloader,
                val_linpred_dataloader,
                args.linpred_epochs,
                device,
                epoch,
                experiment,
            )
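            # linpred_acc is not consumed further in this excerpt; presumably
            # encoder_evaluate logs it via `experiment` internally.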
Example no. 22
def main(args):
    # args = parser.parse_args()
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    if args.seed is not None:
        # random.seed(args.seed)
        # torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

        # fix the seed for reproducibility
        seed = args.seed + utils.get_rank()
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    ##################################
    # Logging setting
    ##################################
    if args.output_dir and utils.is_main_process():
        logging.basicConfig(
            filename=os.path.join(args.output_dir, args.log_name),
            filemode='w',
            format=
            '%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
            level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Save to logging
    ##################################
    if utils.is_main_process():
        logging.info(str(args))

    ##################################
    # Initialize dataset
    ##################################

    if not args.evaluate:
        # build_vocab_flag=True, # Takes a long time to build a vocab
        train_dataset = GQATorchDataset(split='train_unbiased',
                                        build_vocab_flag=False,
                                        load_vocab_flag=False)

        if args.distributed:
            sampler_train = torch.utils.data.DistributedSampler(train_dataset)
        else:
            sampler_train = torch.utils.data.RandomSampler(train_dataset)

        batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                            args.batch_size,
                                                            drop_last=True)

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_sampler=batch_sampler_train,
            collate_fn=GQATorchDataset_collate_fn,
            num_workers=args.workers)

        # Old version
        # train_loader = torch.utils.data.DataLoader(
        #     train_dataset, batch_size=args.batch_size, shuffle=True,
        #     collate_fn=GQATorchDataset_collate_fn,
        #     num_workers=args.workers, pin_memory=True)

    val_dataset_list = []
    for eval_split in args.evaluate_sets:
        val_dataset_list.append(
            GQATorchDataset(split=eval_split,
                            build_vocab_flag=False,
                            load_vocab_flag=args.evaluate))
    val_dataset = torch.utils.data.ConcatDataset(val_dataset_list)

    if args.distributed:
        sampler_val = torch.utils.data.DistributedSampler(val_dataset,
                                                          shuffle=False)
    else:
        sampler_val = torch.utils.data.SequentialSampler(val_dataset)

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=GQATorchDataset_collate_fn,
        num_workers=args.workers)

    # Old version
    # val_loader = torch.utils.data.DataLoader(
    #     val_dataset,
    #     batch_size=args.batch_size, shuffle=False,
    #     collate_fn=GQATorchDataset_collate_fn,
    #     num_workers=args.workers, pin_memory=True)

    ##################################
    # Initialize model
    # - note: must init dataset first. Since we will use the vocab from the dataset
    ##################################
    model = PipelineModel()

    ##################################
    # Deploy model on GPU
    ##################################
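    # NOTE: `cuda` below is assumed to be a module-level torch.device defined
    # elsewhere in this file.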
    model = model.to(device=cuda)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    ##################################
    # define optimizer (and scheduler)
    ##################################

    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,  #  weight_decay=args.weight_decay
        amsgrad=False,
    )
    # optimizer = torch.optim.AdamW(
    #     params=model.parameters(),
    #     lr=args.lr,
    #     weight_decay=args.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model_without_ddp.load_state_dict(checkpoint['model'])
            if not args.evaluate:
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'lr_scheduler' in checkpoint:
                    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch'] + 1

            # checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            # model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            # print("=> loaded checkpoint '{}' (epoch {})"
            #       .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # cudnn.benchmark = True

    ##################################
    # Define loss functions (criterion)
    ##################################
    # criterion = torch.nn.CrossEntropyLoss().cuda()

    text_pad_idx = GQATorchDataset.TEXT.vocab.stoi[
        GQATorchDataset.TEXT.pad_token]
    criterion = {
        "program":
        torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "full_answer":
        torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "short_answer":
        torch.nn.CrossEntropyLoss().to(device=cuda),
        # "short_answer": torch.nn.BCEWithLogitsLoss().to(device=cuda), # sigmoid
        "execution_bitmap":
        torch.nn.BCELoss().to(device=cuda),
    }

    ##################################
    # If Evaluate Only
    ##################################

    if args.evaluate:
        validate(val_loader, model, criterion, args, DUMP_RESULT=True)
        return

    ##################################
    # Main Training Loop
    ##################################

    # best_acc1 = 0
    for epoch in range(args.start_epoch, args.epochs):

        if args.distributed:
            ##################################
            # In distributed mode, calling the :meth`set_epoch(epoch) <set_epoch>` method
            # at the beginning of each epoch before creating the DataLoader iterator is necessary
            # to make shuffling work properly across multiple epochs.
            # Otherwise, the same ordering will be always used.
            ##################################
            sampler_train.set_epoch(epoch)

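        # NOTE: stepping the scheduler before training follows the legacy
        # (pre-1.1) PyTorch ordering; since PyTorch 1.1 the step is expected
        # after the epoch's optimizer updates.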
        lr_scheduler.step()

        # adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        if (epoch + 1) % 5 == 0:
            validate(val_loader,
                     model,
                     criterion,
                     args,
                     FAST_VALIDATE_FLAG=False)

        # # remember best acc@1 and save checkpoint
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     # 'arch': args.arch,
        #     'state_dict': model.state_dict(),
        #     # 'best_acc1': best_acc1,
        #     'optimizer' : optimizer.state_dict(),
        # }, is_best)

        if args.output_dir:
            output_dir = pathlib.Path(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
Example no. 23
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(split='train', args=args)
    dataset_val = build_dataset(split='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file == "coco":
        base_ds = get_coco_api_from_dataset(dataset_val)
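    # NOTE: base_ds stays undefined for any other dataset_file; evaluate()
    # below is only reached when 'coco' is in args.dataset_file.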

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     if 'coco' in args.dataset_file:
    #         test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
    #                                             data_loader_val, base_ds, device, args.output_dir)
    #         if args.output_dir:
    #             utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
    #     elif 'anet' == args.dataset_file:
    #         evaluate3d(model, postprocessors, data_loader_val, device, epoch=0)
    #     return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if epoch % args.eval_freq == 0:
            if 'coco' in args.dataset_file:
                test_stats, coco_evaluator = evaluate(model, criterion,
                                                      postprocessors,
                                                      data_loader_val, base_ds,
                                                      device, args.output_dir)
            elif 'anet' == args.dataset_file:
                evaluate3d(model, postprocessors, data_loader_val, device,
                           epoch)
Example no. 24
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=0.9)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    output_dir = output_dir / f"{args.backbone}_{args.transformer_type}"
    if args.output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')

        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / f'checkpoint_{epoch}.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch}_extra.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example no. 25
def main(args):
    utils.init_distributed_mode(args)
    print('git:\n  {}\n'.format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, 'Frozen training is meant for segmentation only'
    print(args)
    device = args.device
    device = device.replace('cuda', 'gpu')
    device = paddle.set_device(device)
    seed = args.seed + utils.get_rank()
    paddle.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = paddle.DataParallel(model)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    param_dicts = [{
        'params': [
            p for n, p in model_without_ddp.named_parameters()
            if 'backbone' not in n and p.requires_grad
        ]
    }, {
        'params': [
            p for n, p in model_without_ddp.named_parameters()
            if 'backbone' in n and p.requires_grad
        ],
        'lr':
        args.lr_backbone
    }]
    optimizer = torch2paddle.AdamW(param_dicts,
                                   lr=args.lr,
                                   weight_decay=args.weight_decay)
    # start the schedule from args.lr; the auto-converted code hard-coded
    # 0.01 here, which silently overrode the AdamW learning rate
    lr_scheduler = paddle.optimizer.lr.StepDecay(learning_rate=args.lr,
                                                 step_size=args.lr_drop)
    # attach the scheduler through the optimizer's private attribute, mirroring
    # how the torch2paddle shim wires torch-style StepLR
    optimizer._learning_rate = lr_scheduler
    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = paddle.io.RandomSampler(dataset_train)
        sampler_val = paddle.io.SequenceSampler(dataset_val)
    # paddle.io.BatchSampler takes `dataset` as its first positional argument,
    # so the sampler and batch size must be passed by keyword
    batch_sampler_train = paddle.io.BatchSampler(sampler=sampler_train,
                                                 batch_size=args.batch_size,
                                                 drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    if args.dataset_file == 'coco_panoptic':
        coco_val = datasets.coco.build('val', args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)
    if args.frozen_weights is not None:
        checkpoint = paddle.load(args.frozen_weights)
        model_without_ddp.detr.load_state_dict(checkpoint['model'])
    output_dir = Path(args.output_dir)
    if args.resume:
        # both branches of the original torch code collapse into the same
        # call: paddle.load has no torch.hub-style remote checkpoint loading
        checkpoint = paddle.load(args.resume)
        model_without_ddp.load_state_dict(checkpoint['model'])
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval['bbox'].eval,
                                 output_dir / 'eval.pdiparams')
        return
    print('Start training')
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pdiparams']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pdiparams')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args
                    }, checkpoint_path)
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / 'log.txt').open('a') as f:
                f.write(json.dumps(log_stats) + '\n')
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if 'bbox' in coco_evaluator.coco_eval:
                    filenames = ['latest.pdiparams']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pdiparams')
                    for name in filenames:
                        # paddle.save expects a string path
                        paddle.save(coco_evaluator.coco_eval['bbox'].eval,
                                    str(output_dir / 'eval' / name))
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))