Example #1
def test_evaluate_rule_returns_None():
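    # a rule whose evaluate() returns None should leave the initial score untouched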
    class MyRule():
        def evaluate(self, ctx):
            return None
    expected_score = eng.Score(16)
    score = eng.evaluate(None, [MyRule()], init_score=expected_score)
    assert score == expected_score
Example #2
def run_test(program_rules=None,
             channel_airline=eng.Airline('dy'),
             purchase_currency='USD',
             program_volume=1,
             program_scheme="mc",
             channel_rules=None):
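    # build a minimal Program/Channel/Context fixture and evaluate the given rules against it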
    airline = eng.Airline("dy")
    enett = eng.Program(
        code="enett",
        scheme=program_scheme,
        currencies=['USD'],
        volume=program_volume,
        rules=program_rules
    )
    dy_agency = eng.Channel(
        code="dy_agency",
        airlines=[channel_airline],
        rules=channel_rules
    )
    ctx = eng.Context(program=enett,
                      channel=dy_agency,
                      amount=100,
                      airline=airline,
                      purchase_currency=purchase_currency)
    return eng.evaluate(ctx, [enett, dy_agency])
Example #3
def main(args):
    utils.init_distributed_mode(args)

    print(args)

    if args.distillation_type != 'none' and args.finetune and not args.eval:
        raise NotImplementedError(
            "Finetuning with distillation not yet supported")

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)

    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  sampler=sampler_val,
                                                  batch_size=int(
                                                      1.5 * args.batch_size),
                                                  num_workers=args.num_workers,
                                                  pin_memory=args.pin_mem,
                                                  drop_last=False)

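    # mixup/cutmix produce soft targets; when active, label smoothing is handled inside the Mixup transform (see the criterion choice below)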
    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        num_classes=args.nb_classes,
        distillation=(args.distillation_type != 'none'),
        pretrained=args.eval,
        fuse=args.eval,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        for k in ['head.weight', 'head.bias', 'head_dist.weight', 'head_dist.bias']:
            if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

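    # linear LR scaling rule: scale the base LR by the effective global batch size (batch_size * world_size) relative to 512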
    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    lr_scheduler, _ = create_scheduler(args, optimizer)

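    # choose the training loss: soft-target CE under mixup, label-smoothing CE if smoothing is set, plain CE otherwise (the default below is always overridden)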
    criterion = LabelSmoothingCrossEntropy()

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    teacher_model = None
    if args.distillation_type != 'none':
        assert args.teacher_path, 'need to specify teacher-path when using distillation'
        print(f"Creating teacher model: {args.teacher_model}")
        teacher_model = create_model(
            args.teacher_model,
            pretrained=False,
            num_classes=args.nb_classes,
            global_pool='avg',
        )
        if args.teacher_path.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.teacher_path,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.teacher_path, map_location='cpu')
        teacher_model.load_state_dict(checkpoint['model'])
        teacher_model.to(device)
        teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which
    # just dispatches to the original criterion if args.distillation_type is
    # 'none'
    criterion = DistillationLoss(criterion, teacher_model,
                                 args.distillation_type,
                                 args.distillation_alpha,
                                 args.distillation_tau)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])
    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            args.clip_mode,
            model_ema,
            mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)

        if epoch % 20 == 19:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')
            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()},
                **{f'test_{k}': v
                   for k, v in test_stats.items()}, 'epoch': epoch,
                'n_parameters': n_parameters
            }
        else:
            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()}, 'epoch': epoch,
                'n_parameters': n_parameters
            }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #4
                            lr=0.005,
                            momentum=0.9,
                            weight_decay=0.0005)

# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

###########################################################################################

# And now let's train the model, evaluating at the end of every epoch.

num_epochs = 2  # number of epochs

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

    torch.save(model.state_dict(), dir_checkpoint + f'CP_epoch{epoch + 1}.pth')
# and a learning rate scheduler which decreases the learning rate by 10x every 15 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=15,
                                               gamma=0.1)

num_epochs = 20

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    epoch_loss = train_one_epoch(model,
                                 optimizer,
                                 data_loader,
                                 device,
                                 epoch,
                                 print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the validation dataset
    mAP, AP = evaluate(model, data_loader_val, dataset_val, device)

    writer.add_scalar('training loss', epoch_loss, epoch)
    writer.add_scalar('mAP', mAP, epoch)

    # save model per epoch
    file_name_model_epoch = SAVE_MODEL + '_' + str(epoch) + '.pth'
    torch.save(model, file_name_model_epoch)
print(f'Testseq to remember: {testing_seq_indices}')
writer.close()
torch.save(model, SAVE_MODEL)
Example #6
            targets=train_targets,
            resize=(227, 227),
            augmentation=aug
        )
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=16, shuffle=True, num_workers=4
        )
        
        valid_dataset = dataset.ClassificationDataset(
            image_paths=valid_images,
            targets=valid_targets,  # assumes valid_targets parallels valid_images (not shown in this truncated snippet)
            resize=(227, 227),
            augmentation=aug
        )
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset, batch_size=16, shuffle=False, num_workers=4
        )

        # Simple Adam optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

        # train and print auc score for all epochs
        for epoch in range(epochs):
            engine.train(train_loader, model, optimizer, device=device)
            predictions, valid_targets = engine.evaluate(
                valid_loader, model, device=device
            )
            roc_auc = metrics.roc_auc_score(valid_targets, predictions)
            print(
                f"Epoch={epoch}, Valid ROC AUC={roc_auc}"
            )
Example #7
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(train=True),
                                       args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

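    # group images with similar aspect ratios into the same batches to reduce padding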
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
    if "rcnn" in args.model:
        if args.rpn_score_thresh is not None:
            kwargs["rpn_score_thresh"] = args.rpn_score_thresh
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained, **kwargs)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        if args.ms:
            evaluate_ms(model, data_loader_test, device=device)
        else:
            evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                }, os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #8
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    wandb.init(project="qpic-project",
               entity="sangbaeklee",
               group="experiment_qpic")
    wandb.config.update({
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    })

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

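    # two parameter groups: backbone parameters get their own (typically lower) learning rate, args.lr_backbone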
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

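    # build the COCO-style ground-truth API the detection evaluator needs (not used for HOI-only runs)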
    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        #import pdb; pdb.set_trace()
        if args.dataset_file == 'hico' or args.dataset_file == 'hico_second':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],\
                "loss" : train_stats['loss']
            })
        else:
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #9
    print(f"Validation set size: {len(data_loader_val.dataset)}, n_batches: {len(data_loader_val)}")

    # model
    model = load_model(pretrained=True, aux_loss=False)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = torch.nn.DataParallel(model).to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=9,
                                                   gamma=0.1)

    # let's train it for 100 epochs
    num_epochs = 100
    
    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        engine.train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the validation dataset
        engine.evaluate(model, data_loader_val, device, epoch, print_freq=10)


    # for i in range(len(dataset)):
    #     it = dataset.__getitem__(i)
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset_train, num_classes = get_dataset(args.dataset, 'train',
                                             get_transform(is_train=True),
                                             args.root_path, args.use_channel)
    # iter_data = iter(dataset_train)
    # next_data = next(iter_data)
    # pdb.set_trace()
    dataset_valid, _ = get_dataset(args.dataset, 'valid',
                                   get_transform(is_train=False),
                                   args.root_path, args.use_channel)
    dataset_test, _ = get_dataset(args.dataset, 'test',
                                  get_transform(is_train=False),
                                  args.root_path, args.use_channel)

    print("Creating data loaders")
    if args.distributed:
        sampler_train = torch.utils.data.distributed.DistributedSampler(
            dataset_train)
        sampler_valid = torch.utils.data.distributed.DistributedSampler(
            dataset_valid)
        sampler_test = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_valid = torch.utils.data.SequentialSampler(dataset_valid)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batchsampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                       args.batch_size,
                                                       drop_last=True)

    dataloader_train = torch.utils.data.DataLoader(
        dataset_train,
        batch_sampler=batchsampler_train,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    dataloader_valid = torch.utils.data.DataLoader(dataset_valid,
                                                   batch_size=1,
                                                   sampler=sampler_valid,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                                  batch_size=1,
                                                  sampler=sampler_test,
                                                  num_workers=args.workers,
                                                  collate_fn=utils.collate_fn)

    print("Creating model")
    # maskrcnn_resnet50_fpn
    model = maskrcnn_resnet50_fpn(num_classes=num_classes,
                                  pretrained=args.pretrained)
    # set the IoU threshold for NMS between boxes: 0.3
    model.roi_heads.nms_thresh = 0.3
    # set the max num of rois: 1000
    model.roi_heads.detections_per_img = 1000
    # default: 0.05, 0.5
    # model.roi_heads.score_thresh = 0.5
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model,
                 dataloader_test,
                 device,
                 is_vis=args.vis,
                 draw_bbox=False,
                 vis_dir=args.vis_dir)
        return

    print("Start training")
    start_time = time.time()
    best_score = 0
    iter_count = 0
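    # warm the learning rate up from lr/1000 over the first 1000 iterations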
    warmup_factor = 1. / 1000
    warmup_iters = 1000
    warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        iter_count, _ = train_one_epoch(model, optimizer, warmup_scheduler,
                                        dataloader_train, device, epoch,
                                        iter_count, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            if ((epoch + 1) % 100) == 0:
                # evaluate after every epoch
                mAP_scores = evaluate(model, dataloader_valid, device=device)
                if best_score < mAP_scores['segm']:
                    best_score = mAP_scores['segm']
                    utils.save_on_master(
                        {
                            'model': model_without_ddp.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'lr_scheduler': lr_scheduler.state_dict(),
                            'args': args,
                            'epoch': epoch
                        },
                        os.path.join(args.output_dir + '_' + args.use_channel,
                                     'model_{}.pth'.format(epoch + 1)))
        # print(iter_count)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def train(args):
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = PennFudanDataset(args.data_dir, get_transform(train=True))
    dataset_test = PennFudanDataset(args.data_dir, get_transform(train=False))

    # split the dataset in train and test set
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=0,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=args.test_batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)

    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # train for the requested number of epochs
    num_epochs = args.epochs
    max_map = 0
    best_model = model

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        metric_logger = train_one_epoch(model,
                                        optimizer,
                                        data_loader,
                                        device,
                                        epoch,
                                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        coco_evaluator = evaluate(model, data_loader_test, device=device)

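        # keep the model with the best COCO bbox AP@[0.5:0.95] (stats[0])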
        if max_map < coco_evaluator.coco_eval['bbox'].stats[0]:
            max_map = coco_evaluator.coco_eval['bbox'].stats[0]
            best_model = model

    save_model(best_model, args.model_dir)
    print("That's it!")
    return best_model, max_map
        print("Reusing last checkpoint from phase:")
        print(classifier_ckpt)
        load_tbs = utils.load_checkpoint(classifier_ckpt)
        core_model.load_state_dict(load_tbs['state_dict'])
        dataset_test = VOC('07',
                           'edgeboxes',
                           'test',
                           included=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        base_val_loader = torch.utils.data.DataLoader(
            dataset_test,
            batch_size=3,
            shuffle=False,
            num_workers=1,
            collate_fn=utils.collate_fn)
        core_model.to(device)
        evaluate(core_model, base_val_loader, device=device)
        # evaluate the checkpoint to verify

    model = ModelWrapper(core_model,
                         output_layer_names=[args.extract_features_from],
                         return_single=True)

    model.eval()
    model.to(device)

    dataset = VOC('07', 'selective_search', 'trainval')
    dataset_test = VOC('07', 'selective_search', 'test')

    # define training and validation data loaders
    base_train_loader = torch.utils.data.DataLoader(
        dataset,
Example #13
def train():
    ## Train the model
    import utils
    from engine import train_one_epoch, evaluate

    os.environ['TORCH_HOME'] = './'
    device = torch.device('cpu')
    num_classes = 44  # 44 classes = 43 + background
    dataset = MyDataset(image_dic=image_dict,
                        transform=transform_data(train=True))
    dataset_test = MyDataset(image_dic=image_dict,
                             transform=transform_data(train=False))

    # split data 400:100 for train:test with dataset ~500 images
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-100])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-100:])

    data_loader_train = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=2,
        shuffle=False,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset=dataset_test,
                                                   batch_size=2,
                                                   shuffle=False,
                                                   collate_fn=utils.collate_fn)

    # model: be able to explain the parameters passed into the model
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        pretrained=False,
        progress=True,
        num_classes=num_classes,
        pretrained_backbone=True)

    model.to(device)

    # start train
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.0005,
                                momentum=0.9,
                                weight_decay=0.0005)
    learning_rate_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=1, T_mult=2)

    epochs = 1000
    losses = []
    loss_box_reg = []
    loss_rpn_box_reg = []
    loss_classifier = []
    loss_objectness = []
    stat0 = []
    stat1 = []
    stat2 = []
    stat3 = []
    stat4 = []
    stat5 = []
    stat6 = []
    stat7 = []
    stat8 = []
    stat9 = []
    stat10 = []
    stat11 = []
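    # training runs on CPU here, so cap PyTorch's intra-op thread count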
    torch.set_num_threads(4)
    for epoch in range(epochs):
        metrics = train_one_epoch(model,
                                  optimizer,
                                  data_loader_train,
                                  device,
                                  epoch,
                                  print_freq=50)
        np.save(str(epoch) + 'metric.h5', metrics)

        losses.append(float(str(metrics.meters['loss']).split(" ")[0]))
        loss_box_reg.append(
            float(str(metrics.meters['loss_box_reg']).split(" ")[0]))
        loss_rpn_box_reg.append(
            float(str(metrics.meters['loss_rpn_box_reg']).split(" ")[0]))
        loss_classifier.append(
            float(str(metrics.meters['loss_classifier']).split(" ")[0]))
        loss_objectness.append(
            float(str(metrics.meters['loss_objectness']).split(" ")[0]))

        learning_rate_scheduler.step()
        # Evaluate on the test dataset
        # evaluate() returns the CocoEvaluator object (from coco_eval.py) and a metric logger
        coco_evaluator, metric_logger = evaluate(model, data_loader_test, device=device)
        # Stat object is from pycocotools' self.stats in summarize()
        # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
        stat = coco_evaluator.coco_eval['bbox'].stats
        # Append all stats
        stat0.append(stat[0])
        stat1.append(stat[1])
        stat2.append(stat[2])
        stat3.append(stat[3])
        stat4.append(stat[4])
        stat5.append(stat[5])
        stat6.append(stat[6])
        stat7.append(stat[7])
        stat8.append(stat[8])
        stat9.append(stat[9])
        stat10.append(stat[10])
        stat11.append(stat[11])

        print('')
        print('==================================================')
        print('')
        print('')
        print('')

    print("Done!")
    print('stat0', stat0)
    print('stat1', stat1)
    print('stat2', stat2)
    print('stat3', stat3)
    print('stat4', stat4)
    print('stat5', stat5)
    print('stat6', stat6)
    print('stat7', stat7)
    print('stat8', stat8)
    print('stat9', stat9)
    print('stat10', stat10)
    print('stat11', stat11)
    print('losses', losses)
    print('loss_box_reg', loss_box_reg)
    print('loss_rpn_box_reg', loss_rpn_box_reg)
    print('loss_classifier', loss_classifier)
    print('loss_objectness', loss_objectness)
    print("Save!")
    np.save('stat0', stat0)
    np.save('stat1', stat1)
    np.save('stat2', stat2)
    np.save('stat3', stat3)
    np.save('stat4', stat4)
    np.save('stat5', stat5)
    np.save('stat6', stat6)
    np.save('stat7', stat7)
    np.save('stat8', stat8)
    np.save('stat9', stat9)
    np.save('stat10', stat10)
    np.save('stat11', stat11)
    np.save('losses', losses)
    np.save('loss_box_reg', loss_box_reg)
    np.save('loss_rpn_box_reg', loss_rpn_box_reg)
    np.save('loss_classifier', loss_classifier)
    np.save('loss_objectness', loss_objectness)

    # save
    torch.save(model, r'./train1000.pkl')

    torch.save(model.state_dict(), 'train1000.pth')
    torch.save(
        {
            'epoch': epoch,
            "model_state_dict": model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, 'ckpt1000.pth')
Example #14
def main(args):
    utils.init_distributed_mode(args)
    print('git:\n  {}\n'.format(utils.get_sha()))

    print(args)

    device = torch.device(args.device)
    print(device)
    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    if args.stage == 1:
        for name, value in model_without_ddp.named_parameters():
            if 'iou' in name:
                value.requires_grad = False
        learned_params = filter(lambda p: p.requires_grad,
                                model_without_ddp.parameters())
    elif args.stage == 2:
        for name, value in model_without_ddp.named_parameters():
            if 'class_embed' not in name:
                value.requires_grad = False
        head_params = filter(lambda p: p.requires_grad,
                             model_without_ddp.parameters())
        learned_params = list(head_params)
    else:
        for name, value in model_without_ddp.named_parameters():
            if 'iou' not in name:
                value.requires_grad = False
        head_params = filter(lambda p: p.requires_grad,
                             model_without_ddp.parameters())
        learned_params = list(head_params)

    optimizer = torch.optim.AdamW(learned_params,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=thumos.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=thumos.collate_fn,
                                 num_workers=args.num_workers)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.rtd.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            pretrained_dict = checkpoint['model']
            # only resume part of model parameter
            model_dict = model_without_ddp.state_dict()
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items() if k in model_dict
            }
            model_dict.update(pretrained_dict)
            model_without_ddp.load_state_dict(model_dict)
            # main_model.load_state_dict(checkpoint['state_dict'])
            print(("=> loaded '{}' (epoch {})".format(args.resume,
                                                      checkpoint['epoch'])))

    if args.load:
        checkpoint = torch.load(args.load, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        evaluator, eval_loss_dict = evaluate(model, criterion, postprocessors,
                                             data_loader_val, device, args)
        res = evaluator.summarize()

        test_stats, results_pd = eval_props(res)
        print('test_stats', test_stats)

        if args.output_dir:
            results_pd.to_csv(args.output_dir + 'results_eval.csv')
        return

    print('Start training')
    start_time = time.time()

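    # live loss curves: one subplot per tracked loss term, for both train and eval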
    fig1 = plt.figure('train', figsize=(18.5, 10.5))
    ax1_train = fig1.add_subplot(231)
    ax2_train = fig1.add_subplot(232)
    ax3_train = fig1.add_subplot(233)
    ax4_train = fig1.add_subplot(234)
    ax5_train = fig1.add_subplot(235)
    ax6_train = fig1.add_subplot(236)

    axs_train = {
        'loss_ce': ax1_train,
        'loss_bbox': ax2_train,
        'loss_giou': ax3_train,
        'cardinality_error': ax4_train,
        'class_error': ax5_train,
        'loss_iou': ax6_train
    }

    fig2 = plt.figure('eval', figsize=(18.5, 10.5))
    ax1_eval = fig2.add_subplot(231)
    ax2_eval = fig2.add_subplot(232)
    ax3_eval = fig2.add_subplot(233)
    ax4_eval = fig2.add_subplot(234)
    ax5_eval = fig2.add_subplot(235)
    ax6_eval = fig2.add_subplot(236)
    axs_eval = {
        'loss_ce': ax1_eval,
        'loss_bbox': ax2_eval,
        'loss_giou': ax3_eval,
        'cardinality_error': ax4_eval,
        'class_error': ax5_eval,
        'loss_iou': ax6_eval
    }

    colordict = {
        '50': 'g',
        '100': 'b',
        '200': 'purple',
        '500': 'orange',
        '1000': 'brown'
    }

    fig3 = plt.figure('test_AR')
    axs_test = fig3.add_subplot(111)

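    # per-epoch bookkeeping used for plotting and for tracking the best checkpoints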
    epoch_list = []
    train_loss_list = {}
    eval_loss_list = {}
    test_stats_list = {}
    best_ar50 = 0
    best_sum_ar = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        train_stats, train_loss_dict = train_one_epoch(model, criterion,
                                                       data_loader_train,
                                                       optimizer, device,
                                                       epoch, args)

        for key, value in train_loss_dict.items():
            if key in [
                    'loss_ce', 'loss_bbox', 'loss_giou', 'cardinality_error',
                    'class_error', 'loss_iou'
            ]:
                try:
                    train_loss_list[key].append(value.mean())
                except KeyError:
                    train_loss_list[key] = [value.mean()]

        lr_scheduler.step()
        if epoch % 50 == 0 and args.output_dir:
            checkpoint_path = output_dir / 'checkpoint_epoch{}.pth'.format(
                epoch)
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        evaluator, eval_loss_dict = evaluate(model, criterion, postprocessors,
                                             data_loader_val, device, args)
        res = evaluator.summarize()

        test_stats, results_pd = eval_props(res)
        for k, v in test_stats.items():
            try:
                test_stats_list[k].append(float(v) * 100)
            except KeyError:
                test_stats_list[k] = [float(v) * 100]

        for key, value in eval_loss_dict.items():
            if key in [
                    'loss_ce', 'loss_bbox', 'loss_giou', 'cardinality_error',
                    'class_error', 'loss_iou'
            ]:
                try:
                    eval_loss_list[key].append(value.mean())
                except KeyError:
                    eval_loss_list[key] = [value.mean()]

        print('test_stats', test_stats)

        # debug
        # if args.output_dir:
        #     results_pd.to_csv(args.output_dir+'results_epoch_{}.csv'.format(epoch))

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_AR@{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if (float(test_stats['50']) > best_ar50):
            best_ar50 = float(test_stats['50'])
            with (output_dir / 'log_best_ar50.txt').open('w') as f:
                f.write(json.dumps(log_stats) + '\n')
            checkpoint_path = output_dir / 'checkpoint_best_ar50.pth'
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        current_sum_ar = float(test_stats['50']) + float(
            test_stats['100']) + float(test_stats['200'])
        if (current_sum_ar > best_sum_ar):
            best_sum_ar = current_sum_ar
            with (output_dir / 'log_best_sum_ar.txt').open('w') as f:
                f.write(json.dumps(log_stats) + '\n')
            checkpoint_path = output_dir / 'checkpoint_best_sum_ar.pth'
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        if args.output_dir and utils.is_main_process():
            with (output_dir / 'log.txt').open('a') as f:
                f.write(json.dumps(log_stats) + '\n')
        epoch_list.append(epoch)
        if epoch % 2 == 0:
            # refresh the training/eval loss curves and the AR curves every other epoch
            # (draw expects: split, loss_dict, axs, epoch, color)

            draw_stats(axs_test, test_stats_list, epoch_list, colordict)
            axs_test.legend()
            draw('train', train_loss_list, axs_train, epoch, 'b')
            draw('eval', eval_loss_list, axs_eval, epoch, 'g')
            fig1.savefig('train_loss_curve.jpg', dpi=300)
            fig2.savefig('eval_loss_curve.jpg', dpi=300)
            fig3.savefig('test_ar.jpg')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
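
The loop above relies on two small plotting helpers, draw and draw_stats, that are not part of this listing. A minimal sketch of what they might look like, assuming matplotlib Axes objects and per-key metric lists (names and signatures inferred from the call sites, not taken from the original project):

def draw(split, loss_dict, axs, epoch, color):
    # hypothetical helper: one curve per loss key on the given Axes
    # (the color argument is accepted for call-site compatibility; matplotlib's
    # default color cycle is used here)
    axs.clear()
    for key, values in loss_dict.items():
        axs.plot(range(len(values)), values, label='{}_{}'.format(split, key))
    axs.set_xlabel('epoch')
    axs.set_ylabel('loss')
    axs.set_title('{} losses up to epoch {}'.format(split, epoch))
    axs.legend()


def draw_stats(axs, stats_dict, epoch_list, color_dict):
    # hypothetical helper: AR@k curves, one per proposal budget
    axs.clear()
    for key, values in stats_dict.items():
        axs.plot(epoch_list, values, color=color_dict.get(key), label='AR@{}'.format(key))
    axs.set_xlabel('epoch')
    axs.set_ylabel('average recall (%)')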
Exemple #15
0
def main(args):
    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    if args.use_deterministic_algorithms:
        torch.use_deterministic_algorithms(True)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size, drop_last=True)

    train_collate_fn = utils.collate_fn
    if args.use_copypaste:
        if args.data_augmentation != "lsj":
            raise RuntimeError("SimpleCopyPaste algorithm currently only supports the 'lsj' data augmentation policies")

        train_collate_fn = copypaste_collate_fn

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers, collate_fn=train_collate_fn
    )

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
    )

    print("Creating model")
    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
    if args.data_augmentation in ["multiscale", "lsj"]:
        kwargs["_skip_resize"] = True
    if "rcnn" in args.model:
        if args.rpn_score_thresh is not None:
            kwargs["rpn_score_thresh"] = args.rpn_score_thresh
    model = torchvision.models.detection.__dict__[args.model](
        weights=args.weights, weights_backbone=args.weights_backbone, num_classes=num_classes, **kwargs
    )
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.norm_weight_decay is None:
        parameters = [p for p in model.parameters() if p.requires_grad]
    else:
        param_groups = torchvision.ops._utils.split_normalization_params(model)
        wd_groups = [args.norm_weight_decay, args.weight_decay]
        parameters = [{"params": p, "weight_decay": w} for p, w in zip(param_groups, wd_groups) if p]

    opt_name = args.opt.lower()
    if opt_name.startswith("sgd"):
        optimizer = torch.optim.SGD(
            parameters,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            nesterov="nesterov" in opt_name,
        )
    elif opt_name == "adamw":
        optimizer = torch.optim.AdamW(parameters, lr=args.lr, weight_decay=args.weight_decay)
    else:
        raise RuntimeError(f"Invalid optimizer {args.opt}. Only SGD and AdamW are supported.")

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    args.lr_scheduler = args.lr_scheduler.lower()
    if args.lr_scheduler == "multisteplr":
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
    elif args.lr_scheduler == "cosineannealinglr":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs)
    else:
        raise RuntimeError(
            f"Invalid lr scheduler '{args.lr_scheduler}'. Only MultiStepLR and CosineAnnealingLR are supported."
        )

    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model_without_ddp.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        args.start_epoch = checkpoint["epoch"] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])

    if args.test_only:
        torch.backends.cudnn.deterministic = True
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq, scaler)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "args": args,
                "epoch": epoch,
            }
            if args.amp:
                checkpoint["scaler"] = scaler.state_dict()
            utils.save_on_master(checkpoint, os.path.join(args.output_dir, f"model_{epoch}.pth"))
            utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def trainModel():
    # use our dataset and defined transformations
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # split the dataset in train and test set
    torch.manual_seed(1)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=2,
                                              shuffle=True,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   collate_fn=utils.collate_fn)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2

    # get the model using our helper function
    model, in_feat, hidd_layer = get_instance_segmentation_model(num_classes)
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    # and a learning rate scheduler which decreases the learning rate by
    # 10x every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    num_epochs = 10

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model,
                        optimizer,
                        data_loader,
                        device,
                        epoch,
                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    return model
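
trainModel() assumes a get_instance_segmentation_model() helper that returns the model together with the two head sizes it was built with. The helper itself is not shown; a plausible sketch in the spirit of the torchvision instance-segmentation tutorial (the extra return values simply mirror the call site above) is:

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


def get_instance_segmentation_model(num_classes):
    # hypothetical sketch: load a Mask R-CNN pre-trained on COCO and swap both heads
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    # replace the box predictor
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # replace the mask predictor
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer, num_classes)
    return model, in_features, hidden_layer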
        #torch.save(mask_rcnn.state_dict(), save_param)
        # train for one epoch, printing every 10 iterations
        print(save_param)
        train_one_epoch(mask_rcnn,
                        optimizer,
                        data_loader,
                        device,
                        epoch,
                        print_freq=100)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        #print('\n')
        #print("trained_param_4/epoch_00%02d.param" % epoch)
        #mask_rcnn.load_state_dict(torch.load("trained_param_4/epoch_00%02d.param" % epoch))
        evaluate(mask_rcnn, data_loader_test, device=device)

        #save_param = "trained_param_8_fresh/epoch_{:04d}.param".format(epoch)
        torch.save(mask_rcnn.state_dict(), save_param)

    for epoch in range(init_epoch, init_epoch + num_epochs):
        #save_param = "trained_param_3_fresh/epoch_{:04d}.param".format(epoch)
        #torch.save(mask_rcnn.state_dict(), save_param)
        # train for one epoch, printing every 10 iterations
        #train_one_epoch(mask_rcnn, optimizer, data_loader, device, epoch, print_freq=100)
        # update the learning rate
        #lr_scheduler.step()
        # evaluate on the test dataset
        print('\n')
        name = "trained_param_8/epoch_00%02d.param" % epoch
Exemple #18
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', duplication_factor=200, args=args)
    dataset_val = build_dataset(image_set='val', duplication_factor=10, args=args)

    dataset_train_size = len(dataset_train)
    dataset_val_size = len(dataset_val)
    print("training dataset size: ", dataset_train_size)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=True)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds, device,
            args.output_dir, WandbEvaluator(args.start_epoch), args.start_epoch,
            num_batches=(dataset_val_size // args.batch_size))
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
        return

    # wandb monitor model during training
    wandb.config.update(args)
    # wandb.watch(model)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch, num_batches=(dataset_train_size // args.batch_size),
            max_norm=args.clip_max_norm,
            postprocessors=postprocessors,
            wandb_evaluator=WandbEvaluator(epoch)
            )
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, WandbEvaluator(epoch), epoch, num_batches=(dataset_val_size // args.batch_size)
        )

        log_stats = {**{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        wandb.log(log_stats)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
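
Both train_one_epoch and evaluate above receive a WandbEvaluator(epoch) object whose class is not included in this example. A minimal sketch, assuming it does nothing more than forward metric dicts to wandb tagged with the current epoch (the method name log is an assumption):

import wandb


class WandbEvaluator:
    # hypothetical sketch of the wrapper passed into train_one_epoch / evaluate above
    def __init__(self, epoch):
        self.epoch = epoch

    def log(self, metrics):
        # attach the epoch so wandb charts line up with the training loop
        wandb.log({**metrics, 'epoch': self.epoch})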
Exemple #19
0
def main():
    print('___main____---')
    # train on the GPU if one is available, otherwise fall back to the CPU
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and the defined transformations
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # split the dataset into train and test sets
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=1,
                                              shuffle=True,
                                              num_workers=1,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=1,
                                                   collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)

    # move the model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # number of training epochs (just one here)
    num_epochs = 1

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model,
                        optimizer,
                        data_loader,
                        device,
                        epoch,
                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")
Exemple #20
0
def train(model, trainloader, testloader, criterion, optimizer, scheduler,
          num_epochs, device, mode, detection):
    """
    Train and evaluate a model with CPU or GPU.

    :param model: classifier to train on top of mobilenet
    :param trainloader: dataloader for trainset
    :param testloader: dataloader for testset
    :param criterion: loss function
    :param optimizer: optimization method
    :param scheduler: learning rate scheduler for adaptive learning
    :param num_epochs: number of epochs
    :param device: device to train on (CPU or GPU)
    :param mode: training mode, e.g. 'faster_rcnn' for detection
    :param detection: whether to train object detection (vs. classification)
    :return: None
    """

    print("Training on:", device)

    if detection and mode == 'faster_rcnn':
        for epoch in range(num_epochs):
            train_one_epoch(model,
                            optimizer,
                            trainloader,
                            device,
                            epoch + 1,
                            print_freq=10)
            scheduler.step()
            print("Evaluation on trainset:")
            evaluate(model, trainloader, device=device)
            print("Evaluation on testset:")
            evaluate(model, testloader, device=device)
    else:
        # log
        loss_hist = []
        train_acc_hist = []
        test_acc_hist = []

        for epoch in range(num_epochs):
            # training
            start = time.time()
            model.train()

            for i, (img, label) in enumerate(trainloader):
                img = img.to(device)
                label = label.to(device)
                optimizer.zero_grad()
                out = model(img)
                loss = criterion(out, label)
                loss.backward()
                optimizer.step()

                # record (running_loss is reset at the start of every epoch)
                curr_loss = torch.mean(loss).item()
                running_loss = curr_loss if i == 0 else running_loss + curr_loss

            scheduler.step()

            # evaluation
            model.eval()
            running_loss /= len(trainloader)
            train_acc = evaluate_acc(model, trainloader, device)
            test_acc = evaluate_acc(model, testloader, device)
            loss_hist.append(running_loss)
            train_acc_hist.append(train_acc)
            test_acc_hist.append(test_acc)
            print(
                'epoch {} \t loss {:.5f} \t train acc {:.3f} \t test acc {:.3f} \t time {:.1f} sec'
                .format(epoch + 1, running_loss, train_acc, test_acc,
                        time.time() - start))

            # create directory
            if not os.path.exists('./plots'):
                os.mkdir('./plots')
            if not os.path.exists('./models'):
                os.mkdir('./models')
            if not os.path.exists('./logs'):
                os.mkdir('./logs')

            # create name extension
            name = '_' + mode

            # save loss plot
            plt.figure(num=None, figsize=(8, 6))
            plt.plot(loss_hist)
            plt.grid(True, which="both")
            plt.xlabel('epoch', fontsize=14)
            plt.ylabel('average loss', fontsize=14)
            plt.savefig(os.path.join('./plots', 'loss' + name + '.png'))

            # save train accuracy plot
            plt.figure(num=None, figsize=(8, 6))
            plt.plot(train_acc_hist)
            plt.grid(True, which='both')
            plt.xlabel('epoch', fontsize=14)
            plt.ylabel('accuracy', fontsize=14)
            plt.savefig(os.path.join('./plots', 'train_acc' + name + '.png'))

            # save test accuracy plot
            plt.figure(num=None, figsize=(8, 6))
            plt.plot(test_acc_hist)
            plt.grid(True, which='both')
            plt.xlabel('epoch', fontsize=14)
            plt.ylabel('accuracy', fontsize=14)
            plt.savefig(os.path.join('./plots', 'test_acc' + name + '.png'))

            # close all figures
            plt.close("all")

            # save model weights as CPU tensors without moving the live model off the device
            torch.save(
                {k: v.cpu() for k, v in model.state_dict().items()},
                os.path.join('./models', 'net' + name + '.pt'))

            # save logs (use a context manager so the file is always closed)
            with open(os.path.join('./logs', 'log' + name + '.txt'), 'w') as log_file:
                print('Final Loss:', loss_hist[-1], file=log_file)
                print('Final Train Accuracy:', train_acc_hist[-1], file=log_file)
                print('Final Test Accuracy:', test_acc_hist[-1], file=log_file)

            # save variables
            with open(os.path.join('./logs', 'log' + name + '.pkl'),
                      'wb') as f:
                pickle.dump([loss_hist, train_acc_hist, test_acc_hist], f)
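
The classification branch above calls an evaluate_acc() helper that is not shown. A minimal sketch, assuming it simply computes top-1 accuracy over a dataloader:

import torch


@torch.no_grad()
def evaluate_acc(model, dataloader, device):
    # hypothetical sketch: plain top-1 accuracy
    model.eval()
    correct, total = 0, 0
    for img, label in dataloader:
        img, label = img.to(device), label.to(device)
        pred = model(img).argmax(dim=1)
        correct += (pred == label).sum().item()
        total += label.size(0)
    return correct / max(total, 1)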
Exemple #21
0
def main(args):
    utils.init_distributed_mode(args)
    print(args)
    # if args.distillation_type != 'none' and args.finetune and not args.eval:
    #     raise NotImplementedError("Finetuning with distillation not yet supported")

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)

    cudnn.benchmark = True

    print('dataset build init....')
    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)
    print('dataset build finish....')

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    print('data loader init....')

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  sampler=sampler_val,
                                                  batch_size=int(
                                                      1.5 * args.batch_size),
                                                  num_workers=args.num_workers,
                                                  pin_memory=args.pin_mem,
                                                  drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        if 'model' in checkpoint:
            checkpoint_model = checkpoint['model']
        else:
            checkpoint_model = checkpoint
        state_dict = model.state_dict()
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias'
        ]:
            if k in checkpoint_model and checkpoint_model[
                    k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        # # interpolate position embedding
        # pos_embed_checkpoint = checkpoint_model['pos_embed']
        # embedding_size = pos_embed_checkpoint.shape[-1]
        # num_patches = model.patch_embed.num_patches
        # num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # # height (== width) for the checkpoint position embedding
        # orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # # height (== width) for the new position embedding
        # new_size = int(num_patches ** 0.5)
        # # class_token and dist_token are kept unchanged
        # extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # # only the position tokens are interpolated
        # pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        # pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
        # pos_tokens = torch.nn.functional.interpolate(
        #     pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
        # pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        # new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        # checkpoint_model['pos_embed'] = new_pos_embed

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    # if args.model_ema:
    #     # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
    #     model_ema = ModelEma(
    #         model,
    #         decay=args.model_ema_decay,
    #         device='cpu' if args.model_ema_force_cpu else '',
    #         resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size(
    ) / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    # teacher_model = None
    # if args.distillation_type != 'none':
    #     assert args.teacher_path, 'need to specify teacher-path when using distillation'
    #     print(f"Creating teacher model: {args.teacher_model}")
    #     teacher_model = create_model(
    #         args.teacher_model,
    #         pretrained=False,
    #         num_classes=args.nb_classes,
    #         global_pool='avg',
    #     )
    #     if args.teacher_path.startswith('https'):
    #         checkpoint = torch.hub.load_state_dict_from_url(
    #             args.teacher_path, map_location='cpu', check_hash=True)
    #     else:
    #         checkpoint = torch.load(args.teacher_path, map_location='cpu')
    #     teacher_model.load_state_dict(checkpoint['model'])
    #     teacher_model.to(device)
    #     teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which
    # just dispatches to the original criterion if args.distillation_type is 'none'
    # criterion = DistillationLoss(
    #     criterion, teacher_model, args.distillation_type, args.distillation_alpha, args.distillation_tau
    # )
    criterion = DistillationLoss(criterion, None, 'none', 0, 0)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        if 'model' in checkpoint:
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint)

        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            # if args.model_ema:
            #     utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    max_epoch_dp_warm_up = 100
    if 'pvt_tiny' in args.model or 'pvt_small' in args.model:
        max_epoch_dp_warm_up = 0
    if args.start_epoch < max_epoch_dp_warm_up:
        model_without_ddp.reset_drop_path(0.0)
    for epoch in range(args.start_epoch, args.epochs):
        if args.fp32_resume and epoch > args.start_epoch + 1:
            args.fp32_resume = False
        loss_scaler._scaler = torch.cuda.amp.GradScaler(
            enabled=not args.fp32_resume)

        if epoch == max_epoch_dp_warm_up:
            model_without_ddp.reset_drop_path(args.drop_path)

        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            set_training_mode=args.finetune ==
            '',  # keep in eval mode during finetuning
            fp32=args.fp32_resume)

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        # 'model_ema': get_state_dict(model_ema),
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    },
                    checkpoint_path)

        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
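
For context, the NativeScaler and DistillationLoss objects created above are typically consumed inside train_one_epoch roughly as follows. This is only a sketch under the usual timm/DeiT conventions (NativeScaler's call signature, DistillationLoss taking (inputs, outputs, labels)); the project's actual train_one_epoch is defined elsewhere:

import torch


def train_step(model, criterion, samples, targets, optimizer, loss_scaler,
               clip_grad=None, fp32=False):
    # hedged sketch of one optimisation step as train_one_epoch presumably performs it
    with torch.cuda.amp.autocast(enabled=not fp32):
        outputs = model(samples)
        loss = criterion(samples, outputs, targets)  # DistillationLoss(inputs, outputs, labels)
    optimizer.zero_grad()
    # NativeScaler scales the loss, backpropagates, optionally clips, and steps the optimizer
    loss_scaler(loss, optimizer, clip_grad=clip_grad, parameters=model.parameters())
    return loss.item()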
def output_activation(output, loss_fn, dim=-1):
    # NOTE: this fragment's original function header (and the condition guarding the
    # first return) were lost; the name, signature and "ce"/"softmax" branch are assumptions.
    if loss_fn in ["ce", "softmax"]:
        return F.softmax(output, dim=dim)
    if loss_fn == "nll":
        return F.log_softmax(output, dim=dim)
    if loss_fn in ["bce", "wbce", "wbce1"]:
        return torch.sigmoid(output)


# In[ ]:

for epoch in range(0, 20):
  # train for one epoch, printing every 10 iterations
  train_one_epoch(model, optimizerNew, TrainLoaderNew, device, epoch, print_freq=10)
  # update the learning rate
  lr_scheduler.step()
  # evaluate on the test dataset
  evaluate(model, valLoaderNew, device=device)

  model_file = 'modelobj/fasterrcnn_model_' + str(epoch) + '.pth'
  torch.save({'modelObjectDetection_state_dict': model.state_dict()},model_file)
  print('\nSaved model to ' + model_file )

!zip -r /content/modelobj.zip /content/modelobj

from google.colab import files
files.download("/content/modelobj.zip")

'''from google.colab import drive
drive.mount('/data/')
from pathlib import Path
base_dir = ('/data/My Drive')'''
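
To reuse one of the checkpoints written by the Colab loop above, the weights have to be loaded back into a model of the same architecture. A hedged sketch (the architecture is not shown in this fragment, so a torchvision Faster R-CNN with two classes is assumed here):

import torch
import torchvision

# hypothetical reload of the last checkpoint saved by the loop above
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(num_classes=2)  # architecture/num_classes assumed
checkpoint = torch.load('modelobj/fasterrcnn_model_19.pth', map_location='cpu')
model.load_state_dict(checkpoint['modelObjectDetection_state_dict'])
model.eval()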
def main(args):
    utils.init_distributed_mode(args)

    print("args: {}".format(args))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # special process to control whether freeze backbone
    args.model.train_backbone = args.lr_backbone > 0

    model, criterion, postprocessors = build_model(args.model)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=args.lr_gamma)

    dataset_train = build_dataset(
        image_set='train',
        args=args.dataset,
        model_stride=model_without_ddp.backbone.stride)
    dataset_val = build_dataset(image_set='val',
                                args=args.dataset,
                                model_stride=model_without_ddp.backbone.stride)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    benchmark_test_parser = benchmark_test.get_args_parser()
    benchmark_test_args = benchmark_test_parser.get_defaults()
    benchmark_test_args.tracker.model = args.model  # overwrite the parameters about network model
    benchmark_test_args.result_path = Path(
        os.path.join(args.output_dir, 'benchmark'))
    benchmark_test_args.dataset_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'benchmark')

    benchmark_eval_parser = benchmark_eval.get_args_parser()
    benchmark_eval_args = benchmark_eval_parser.get_defaults()
    benchmark_eval_args.tracker_path = benchmark_test_args.result_path
    best_eao = 0
    best_ar = [0, 10]  # accuracy & robustness

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        # training
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every args.model_save_step epochs
            if (epoch + 1) % args.lr_drop == 0 or (
                    epoch + 1) % args.model_save_step == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

            # hack: only inference model
            utils.save_on_master({'model': model_without_ddp.state_dict()},
                                 output_dir / 'checkpoint_only_inference.pth')

        # evaluate
        val_stats = evaluate(model, criterion, postprocessors, data_loader_val,
                             device, args.output_dir)

        log_stats = {
            'epoch': epoch,
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'val_{k}': v
               for k, v in val_stats.items()}, 'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

        # evaluate with benchmark
        if utils.is_main_process():
            if (
                    epoch + 1
            ) % args.benchmark_test_step == 0 and epoch > args.benchmark_start_epoch:

                tracker = build_tracker(benchmark_test_args.tracker,
                                        model=model_without_ddp,
                                        postprocessors=postprocessors)
                benchmark_test_args.model_name = "epoch" + str(epoch)
                benchmark_start_time = time.time()
                benchmark_test.main(benchmark_test_args, tracker)
                benchmark_time = time.time() - benchmark_start_time

                benchmark_eval_args.model_name = "epoch" + str(epoch)
                benchmark_eval_args.tracker_prefix = "epoch" + str(epoch)
                eval_results = benchmark_eval.main(benchmark_eval_args)
                eval_result = list(eval_results.values())[0]

                if benchmark_test_args.dataset in ['VOT2018', 'VOT2019']:
                    if args.output_dir:
                        with (output_dir /
                              str("benchmark_" + benchmark_test_args.dataset +
                                  ".txt")).open("a") as f:
                            f.write("epoch: " + str(epoch) + ", best EAO: " +
                                    str(best_eao) + ", " +
                                    json.dumps(eval_result) + "\n")

                    if best_eao < eval_result['EAO']:

                        best_eao = eval_result['EAO']

                        if args.output_dir:
                            best_eao_int = int(best_eao * 1000)

                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_eao_{best_eao_int:03}_only_inference.pth'
                            )

                    if best_ar[0] < eval_result['accuracy'] and best_ar[
                            1] > eval_result['robustness']:

                        best_ar[0] = eval_result['accuracy']
                        best_ar[1] = eval_result['robustness']

                        if args.output_dir:
                            best_accuracy_int = int(best_ar[0] * 1000)
                            best_robustness_int = int(best_ar[1] * 1000)

                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_ar_{best_accuracy_int:03}_{best_robustness_int:03}_only_inference.pth'
                            )

                print("benchmark time: {}".format(benchmark_time))

        if args.distributed:
            torch.distributed.barrier()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Exemple #24
0
def main(args):
    utils.init_distributed_mode(args)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)

    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size=int(
                                                      3.0 * args.batch_size),
                                                  shuffle=False,
                                                  num_workers=args.num_workers,
                                                  pin_memory=args.pin_mem,
                                                  drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')
        if 'model' in checkpoint.keys():
            checkpoint_model = checkpoint['model']
        else:
            checkpoint_model = checkpoint
        state_dict = model.state_dict()
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias', 'trans_cls_head.weight',
                'trans_cls_head.bias', 'conv_cls_head.weight',
                'conv_cls_head.bias'
        ]:
            if k in checkpoint_model and checkpoint_model[
                    k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]
        if 'pos_embed' in checkpoint_model.keys():
            # interpolate position embedding
            pos_embed_checkpoint = checkpoint_model['pos_embed']
            embedding_size = pos_embed_checkpoint.shape[-1]
            num_patches = model.patch_embed.num_patches
            num_extra_tokens = model.pos_embed.shape[-2] - num_patches
            # height (== width) for the checkpoint position embedding
            orig_size = int(
                (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5)
            # height (== width) for the new position embedding
            new_size = int(num_patches**0.5)
            # class_token and dist_token are kept unchanged
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                            embedding_size).permute(
                                                0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(pos_tokens,
                                                         size=(new_size,
                                                               new_size),
                                                         mode='bicubic',
                                                         align_corners=False)
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model['pos_embed'] = new_pos_embed

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    # args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model)
    loss_scaler = NativeScaler()

    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # pdb.set_trace()
        if 'model' in checkpoint.keys():
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print("Start training")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'args': args,
                    }, checkpoint_path)
        if epoch % args.evaluate_freq == 0:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')

            log_stats = {
                **{f'train_{k}': v
                   for k, v in train_stats.items()},
                **{f'test_{k}': v
                   for k, v in test_stats.items()}, 'epoch': epoch,
                'n_parameters': n_parameters
            }

            if args.output_dir and utils.is_main_process():
                with (output_dir / "log.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Exemple #25
0
    indices = [random.randint(0, len(DS_eval) - 1) for _ in range(5)]
    img_arr = [DS_eval.get_img(i) for i in indices]
    DS_eval = data.Subset(DS_eval, indices)

    img, label = [], []
    for i, l in DS_eval:
        img.append(i)
        label.append(l)
    DS_eval = EvalSet(img, label, img_arr)
    DL_eval = data.DataLoader(DS_eval,
                              batch_size=5,
                              shuffle=False,
                              collate_fn=utils.collate_fn,
                              num_workers=12)

    evaluate(net, DL_eval, device)

    predictions = []
    net.to(device)
    net.eval()
    with torch.no_grad():
        image, targets = next(iter(DL_eval))
        print(image[0].shape)
        image = list(img.to(device) for img in image)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        predictions = net(image)  # in eval mode the model returns only detections (boxes, labels, scores)
        final_img = []
        ans = []
        for i in range(len(image)):
            xy_lst = predictions[i]['boxes'].cpu().numpy()
            print(xy_lst)
Exemple #26
0
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                          factor=0.3,
                                                          patience=5)

num_epochs = 100
for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_metric_logger = train_one_epoch(model,
                                          optimizer,
                                          data_loader,
                                          device,
                                          epoch,
                                          print_freq=10)
    content = f"epoch: {epoch:2d}; loss = {train_metric_logger.meters['loss']}; loss_mask = {train_metric_logger.meters['loss_mask']}"
    print(content)
    logger.info(content)
    # update the learning rate (ReduceLROnPlateau expects the monitored metric,
    # e.g. lr_scheduler.step(validation_loss))
    # lr_scheduler.step()
    # evaluate on the test dataset
    coco_evaluator, test_metric_logger = evaluate(model,
                                                  data_loader_test,
                                                  device=device)
    content = f"epoch: {epoch:2d}; model_time = {test_metric_logger.meters['model_time']}; evaluator_time = {test_metric_logger.meters['evaluator_time']}"
    print(content)
    logger.info(content)
    torch.save(
        model.state_dict(),
        '/root/code/model_state/mask_rcnn/mask_rcnn_0409_%d.pth' % epoch)
Exemple #27
0
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

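    # three parameter groups: the base lr for most weights, a dedicated lr for the
    # backbone, and args.lr scaled by args.lr_linear_proj_mult for the linear
    # projection layers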
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if not match_name_keywords(n, args.lr_backbone_names)
                and not match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if
                       match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if
                       match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        }
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(
                model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
            )

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
        )

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Exemple #28
0
def main():
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('data/PennFudanPed',
                                    get_transform(train=False))

    # split the dataset in train and test set
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=2,
                                              shuffle=True,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)

    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    num_epochs = 10

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model,
                        optimizer,
                        data_loader,
                        device,
                        epoch,
                        print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")
    torch.save(model.state_dict(), 'data/PennFudanPed/weight.pt')
    print('saved weights of the model')
Exemple #29
0
def main(args):
    writer = SummaryWriter(log_dir=f'logs/{args.comment}', filename_suffix='')
    test_cuda()

    writer.add_hparams(args.__dict__, {})

    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

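    # two parameter groups: non-backbone weights at the base lr,
    # backbone weights at args.lr_backbone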
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model,
                                              criterion,
                                              postprocessors,
                                              data_loader_val,
                                              base_ds,
                                              device,
                                              args.output_dir,
                                              epoch=0,
                                              writer=writer,
                                              args=args)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model,
                                      criterion,
                                      data_loader_train,
                                      optimizer,
                                      device,
                                      epoch,
                                      args.clip_max_norm,
                                      writer=writer,
                                      args=args)
        for key, val in train_stats.items():
            writer.add_scalar('train/' + key, val, epoch)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / f'checkpoint_{args.comment}.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(
                    output_dir / f'checkpoint{epoch:04}_{args.comment}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model,
                                              criterion,
                                              postprocessors,
                                              data_loader_val,
                                              base_ds,
                                              device,
                                              args.output_dir,
                                              epoch=epoch,
                                              writer=writer,
                                              args=args)
        for key, val in test_stats.items():
            writer.add_scalar('test/' + key, val, epoch)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Exemple #30
0
def train_mydata():
    # use the PennFudan dataset and defined transformations
    dataset = PennFudanDataset(dataset_dir, get_transform(train=True))
    dataset_test = PennFudanDataset(dataset_dir, get_transform(train=False))

    # split the dataset in train and test set
    torch.manual_seed(1)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])
    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # the dataset has two classes only - background and person
    num_classes = 2

    # Option 1: only change the number of output classes
    # model = get_instance_segmentation_model(num_classes)

    # Option 2: swap in a ResNet-18 FPN backbone instead
    model = maskrcnn_resnet18_fpn(num_classes)
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    # the learning rate scheduler decreases the learning rate by 10x every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    # training
    num_epochs = 10
    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    # test
    # put the model in evaluation mode
    model.eval()
    for i in range(10):
        img, _ = dataset_test[i]
        with torch.no_grad():
            prediction = model([img.to(device)])
        src = img.mul(255).permute(1, 2, 0).byte().numpy()
        result = prediction[0]['masks'][0, 0].mul(255).byte().cpu().numpy()
        result = np.expand_dims(result, -1).repeat(3, axis=-1)
        result = cv2.addWeighted(src, 0.5, result, 0.5, 0)
        cv2.imshow("result", result)
        cv2.waitKey(0)
Exemple #31
0
def train(fold):
    training_data_path = "../input/jpeg/train/"
    model_path = "../input/"
    df = pd.read_csv("../input/train_folds.csv")
    device = "cpu"
    epochs = 50
    train_bs = 32
    valid_bs = 16
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True),
            albumentations.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15),
            albumentations.Flip(p=0.5)
        ]
    )

    valid_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True)
        ]
    )

    train_images = df_train.image_name.values.tolist()
    train_images = [os.path.join(training_data_path, i + ".jpg") for i in train_images]
    train_targets = df_train.target.values

    valid_images = df_valid.image_name.values.tolist()
    valid_images = [os.path.join(training_data_path, i + ".jpg") for i in valid_images]
    valid_targets = df_valid.target.values

    train_dataset = ClassificationDataset(
        image_paths=train_images,
        targets=train_targets,
        resize=None,
        augmentations=train_aug,
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=train_bs, shuffle=True, num_workers=4
    )

    valid_dataset = ClassificationDataset(
        image_paths=valid_images,
        targets=valid_targets,
        resize=None,
        augmentations=valid_aug,
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=valid_bs, shuffle=False, num_workers=4
    )

    model = SEResNext50_32x4d(pretrained="imagenet")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=3,
        threshold=0.001,
        mode="max"
    )

    es = EarlyStopping(patience=5, mode="max")

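    # per epoch: train, score the validation fold by AUC, let ReduceLROnPlateau
    # react to the AUC, and stop early once it plateaus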
    for epoch in range(epochs):
        train_loss = engine.train(train_loader, model, optimizer, device=device)
        # validate on the hold-out fold; AUC is computed against valid_targets
        predictions, valid_loss = engine.evaluate(
            valid_loader, model, device=device
        )
        predictions = np.vstack(predictions).ravel()
        auc = metrics.roc_auc_score(valid_targets, predictions)
        scheduler.step(auc)
        print(f"Epoch = {epoch}, AUC = {auc}")

        es(auc, model, model_path=f"model_fold_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
Exemple #32
0
def main(args):

    print(args)
    # batch_size = 48 #72
    # num_workers = 8
    # num_classes = None
    cudnn.benchmark = True

    index_dataset = GLDDataset(root='../DATA/train',
                               input_size=224,
                               subset='index')
    test_dataset = GLDDataset(root='../DATA/test_1k_final/test_1k_final',
                              input_size=224,
                              subset='test')
    train_dataset = GLDDataset(root='../../data/train',
                               input_size=224,
                               subset='train')
    val_dataset = GLDDataset(root='../../data/train',
                             input_size=224,
                             subset='val')
    train_sample_list = train_dataset.gen_train_sample_list()
    sampler = WeightedRandomSampler(weights=train_sample_list,
                                    num_samples=1300000,
                                    replacement=False)
    index_dataloader = DataLoader(index_dataset,
                                  batch_size=128,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  drop_last=False,
                                  pin_memory=True
                                  # worker_init_fn=_worker_init_fn_()
                                  )

    test_dataloader = DataLoader(test_dataset,
                                 batch_size=128,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 drop_last=False,
                                 pin_memory=True
                                 # worker_init_fn=_worker_init_fn_()
                                 )
    train_dataloader = DataLoader(
        train_dataset,
        # sampler=sampler,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        drop_last=False,
        pin_memory=True
        # worker_init_fn=_worker_init_fn_()
    )
    val_dataloader = DataLoader(
        val_dataset,
        # sampler=test_sampler,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        drop_last=False,
        pin_memory=True
        # worker_init_fn=_worker_init_fn_()
    )
    # model = create_model(
    #     args.model,
    #     pretrained=True,
    #     num_classes=num_classes,
    #     drop_rate=args.drop,
    #     drop_path_rate=args.drop_path,
    #     drop_block_rate=None,
    # )
    model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=81313)

    if args.resume:
        checkpoint = torch.load(args.model_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        args.start_epoch = checkpoint['epoch'] + 1


    # model = nn.DataParallel(model)
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.opt_eps)

    criterion_contra = CosContrastiveLoss(margin=0.4)
    criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)  #.mkdir(parents=True, exist_ok=True)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0

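    # each epoch: train with the cross-entropy + cosine contrastive objective,
    # measure classification accuracy on the val split, run the retrieval test,
    # and save a checkpoint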
    for epoch in range(args.start_epoch, args.epochs):
        train_stats = train_one_epoch_contra(model, criterion,
                                             criterion_contra,
                                             train_dataloader, optimizer,
                                             epoch, args.epochs)
        val_stats = evaluate(val_dataloader, model)
        test_stats = test_retrieval(index_dataloader, test_dataloader, model)
        print(
            f"Accuracy of the network on the {len(val_dataset)} test images: {val_stats['acc1']:.2f}%"
        )
        #         if test_stats["acc1"] > max_accuracy:
        if True:
            if args.output_dir:
                checkpoint_paths = [
                    output_dir /
                    'checkpoint_e{}_{}.pth'.format(epoch, val_stats["acc1"])
                ]
                for checkpoint_path in checkpoint_paths:
                    #                     torch.save(model.state_dict(), checkpoint_path)
                    torch.save(
                        {
                            'model': model.state_dict(),
                            #                         'model': model.module.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            # 'lr_scheduler': lr_scheduler.state_dict(),
                            'epoch': epoch,
                            'args': args,
                        },
                        checkpoint_path)
        max_accuracy = max(max_accuracy, val_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'val_{k}': v
               for k, v in val_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch
        }

        if args.output_dir:
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")