def test_evaluate_rule_returns_None():
    """A rule whose evaluate() returns None must leave the initial score unchanged."""

    class _NoOpRule:
        def evaluate(self, ctx):
            return None

    initial = eng.Score(16)
    result = eng.evaluate(None, [_NoOpRule()], init_score=initial)
    assert result == initial
def run_test(program_rules=None,
             channel_airline=eng.Airline('dy'),
             purchase_currency='USD',
             program_volume=1,
             program_scheme="mc",
             channel_rules=None):
    """Build a single-program / single-channel fixture and run the rules engine.

    Constructs an "enett" program and a "dy_agency" channel, wires them into a
    Context for a 100-unit purchase, and returns whatever eng.evaluate yields.
    """
    base_airline = eng.Airline("dy")
    program = eng.Program(
        code="enett",
        scheme=program_scheme,
        currencies=['USD'],
        volume=program_volume,
        rules=program_rules,
    )
    channel = eng.Channel(
        code="dy_agency",
        airlines=[channel_airline],
        rules=channel_rules,
    )
    context = eng.Context(
        program=program,
        channel=channel,
        amount=100,
        airline=base_airline,
        purchase_currency=purchase_currency,
    )
    return eng.evaluate(context, [program, channel])
def main(args):
    """Full train/eval entry point for an image-classification model.

    Builds datasets + (distributed) samplers, constructs the model (optionally
    loading finetune weights), sets up mixup, EMA, optimizer/scheduler and an
    optional distillation teacher, then runs the epoch loop with periodic
    checkpointing and evaluation every 20 epochs.
    """
    utils.init_distributed_mode(args)
    print(args)
    # Finetune + distillation is an unsupported combination (unless eval-only).
    if args.distillation_type != 'none' and args.finetune and not args.eval:
        raise NotImplementedError(
            "Finetuning with distillation not yet supported")
    device = torch.device(args.device)

    # fix the seed for reproducibility (rank-offset so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    # NOTE(review): the distributed-sampler branch is force-enabled via
    # `if True:`; the RandomSampler fallback below is dead code.
    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    # Eval batches are 1.5x larger: no backward pass, so more memory headroom.
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  sampler=sampler_val,
                                                  batch_size=int(
                                                      1.5 * args.batch_size),
                                                  num_workers=args.num_workers,
                                                  pin_memory=args.pin_mem,
                                                  drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    # pretrained/fuse are tied to eval mode: only load published weights and
    # fuse layers when running inference-only.
    model = create_model(
        args.model,
        num_classes=args.nb_classes,
        distillation=(args.distillation_type != 'none'),
        pretrained=args.eval,
        fuse=args.eval,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')
        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        # Drop classifier heads whose shape no longer matches (e.g. a
        # different number of classes); the rest loads with strict=False.
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias'
        ]:
            if k in checkpoint_model and checkpoint_model[
                    k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]
        model.load_state_dict(checkpoint_model, strict=False)
    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but
        # before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    # Keep a handle to the bare model; DDP wraps it under .module.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Linear LR scaling with the global batch size (reference batch = 512).
    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size(
    ) / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()
    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    teacher_model = None
    if args.distillation_type != 'none':
        assert args.teacher_path, 'need to specify teacher-path when using distillation'
        print(f"Creating teacher model: {args.teacher_model}")
        teacher_model = create_model(
            args.teacher_model,
            pretrained=False,
            num_classes=args.nb_classes,
            global_pool='avg',
        )
        if args.teacher_path.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.teacher_path,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.teacher_path, map_location='cpu')
        teacher_model.load_state_dict(checkpoint['model'])
        teacher_model.to(device)
        teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which
    # just dispatches to the original criterion if args.distillation_type is
    # 'none'
    criterion = DistillationLoss(criterion, teacher_model,
                                 args.distillation_type,
                                 args.distillation_alpha,
                                 args.distillation_tau)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Only restore optimizer/scheduler state for a true training resume.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train,
            optimizer, device, epoch, loss_scaler,
            args.clip_grad, args.clip_mode, model_ema, mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )
        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'model_ema': get_state_dict(model_ema),
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)
        # Evaluate every 20th epoch (epochs 19, 39, ...); other epochs log
        # train stats only.
        if epoch % 20 == 19:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')
            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                **{f'test_{k}': v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }
        else:
            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
# NOTE(review): the line below closes an optimizer constructor (SGD, by the
# hyperparameter names) that begins above this chunk.
                            lr=0.005, momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

###########################################################################################
# And now let's train the model, evaluating at the end of every epoch.
num_epochs = 2  ######################################## No. of epochs

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)
    # checkpoint weights every epoch (1-based file names)
    torch.save(model.state_dict(),
               dir_checkpoint + f'CP_epoch{epoch + 1}.pth')
# and a learning rate scheduler which decreases the learning rate lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1) num_epochs = 20 for epoch in range(num_epochs): # train for one epoch, printing every 10 iterations epoch_loss = train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) # update the learning rate lr_scheduler.step() # evaluate on the validation dataset mAP, AP = evaluate(model, data_loader_val, dataset_val, device) writer.add_scalar('training loss', epoch_loss, epoch) writer.add_scalar('mAP', mAP, epoch) # save model per epoch file_name_model_epoch = SAVE_MODEL + '_' + str(epoch) + '.pth' torch.save(model, file_name_model_epoch) print(f'Testseq to remember: {testing_seq_indices}') writer.close() torch.save(model, SAVE_MODEL)
targets=train_targets, resize=(227, 227), augmentation=aug ) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=16, shuffle=True, num_workers=4 ) valid_dataset = dataset.ClassificationDataset( image_paths=valid_images, targets=train_targets, resize=(227, 227), augmentation=aug ) valid_loader = torch.utils.data.DataLoader( train_dataset, batch_size=16, shuffle=False, num_workers=4 ) # Simple Adam optimizer optimizer = torch.optim.Adam(model.parameters(), lr=5e-4) # train and print auc score for all epochs for epoch in range(epochs): engine.train(train_loader, model, optimizer, device=device) predictions, valid_targets = engine.evaluate( valid_loader, model, device=device ) roc_auc = metrics.roc_auc_score(valid_targets, predictions) print( f"Epoch={epoch}, Valid ROC AUC={roc_auc}" )
def main(args):
    """Train (or test-only evaluate) a torchvision detection model.

    Loads train/val datasets, builds (optionally distributed) loaders with an
    aspect-ratio grouped batch sampler, constructs the model by name from
    torchvision.models.detection, then runs SGD training with a MultiStepLR
    schedule, checkpointing and evaluating after every epoch.
    """
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(train=True),
                                       args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    # Group images of similar aspect ratio into the same batch to reduce
    # padding; a negative factor disables grouping.
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
    if "rcnn" in args.model:
        # rpn_score_thresh only exists on the R-CNN family.
        if args.rpn_score_thresh is not None:
            kwargs["rpn_score_thresh"] = args.rpn_score_thresh
    # Model class is looked up by name in torchvision.models.detection.
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes, pretrained=args.pretrained, **kwargs)
    model.to(device)

    # Keep a handle to the bare model; DDP wraps it under .module.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        # args.ms selects the multi-scale evaluation path.
        if args.ms:
            evaluate_ms(model, data_loader_test, device=device)
        else:
            evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                },
                os.path.join(args.output_dir,
                             'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train/evaluate a DETR-style HOI (human-object interaction) model.

    Sets up wandb logging, builds the model with separate LR for the backbone,
    builds train/val loaders, optionally resumes or loads pretrained weights,
    then runs the epoch loop: train, checkpoint, evaluate (HOI or COCO path),
    and push per-dataset metrics to wandb.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    wandb.init(project="qpic-project",
               entity="sangbaeklee",
               group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (rank-offset so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    # Keep a handle to the bare model; DDP wraps it under .module.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get their own (typically smaller) learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # base_ds (a COCO API handle) is only needed for the non-HOI eval path.
    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Only restore optimizer/scheduler state for a true training resume.
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            # NOTE(review): the returned test_stats are discarded here —
            # evaluate_hoi presumably prints its own results; confirm.
            test_stats = evaluate_hoi(args.dataset_file, model,
                                      postprocessors, data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model,
                                      postprocessors, data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        #import pdb; pdb.set_trace()
        # Per-dataset wandb metric push; key names come from the evaluator.
        if args.dataset_file == 'hico' or args.dataset_file == 'hico_second':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],
                "loss": train_stats['loss']
            })
        else:
            # NOTE(review): for any other dataset this `continue` also skips
            # the log.txt / eval dump below — confirm that is intentional.
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
print(f"Validation set size: {len(data_loader_val.dataset)}, n_batches: {len(data_loader_val)}") # model model = load_model(pretrained=True, aux_loss=False) device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') model = torch.nn.DataParallel(model).to(device) # construct an optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # and a learning rate scheduler lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=9, gamma=0.1) # let's train it for 10 epochs num_epochs = 100 for epoch in range(num_epochs): # train for one epoch, printing every 10 iterations engine.train_one_epoch(model, optimizer, data_loader_train, device, epoch, print_freq=10) # update the learning rate lr_scheduler.step() # evaluate on the test dataset engine.evaluate(model, data_loader_val, device, epoch, print_freq=10) # for i in range(len(dataset)):| # it = dataset.__getitem__(i)
def main(args):
    """Train (or test-only evaluate) a Mask R-CNN on train/valid/test splits.

    Builds the three loaders, tunes ROI-head thresholds, trains with SGD +
    MultiStepLR plus an iteration warmup, and every 100th epoch evaluates on
    the validation split, checkpointing when the segm mAP improves.
    """
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # Data loading code
    print("Loading data")
    dataset_train, num_classes = get_dataset(args.dataset, 'train',
                                             get_transform(is_train=True),
                                             args.root_path,
                                             args.use_channel)
    # iter_data = iter(dataset_train)
    # next_data = next(iter_data)
    # pdb.set_trace()
    dataset_valid, _ = get_dataset(args.dataset, 'valid',
                                   get_transform(is_train=False),
                                   args.root_path, args.use_channel)
    dataset_test, _ = get_dataset(args.dataset, 'test',
                                  get_transform(is_train=False),
                                  args.root_path, args.use_channel)

    print("Creating data loaders")
    if args.distributed:
        sampler_train = torch.utils.data.distributed.DistributedSampler(
            dataset_train)
        sampler_valid = torch.utils.data.distributed.DistributedSampler(
            dataset_valid)
        sampler_test = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_valid = torch.utils.data.SequentialSampler(dataset_valid)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batchsampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                       args.batch_size,
                                                       drop_last=True)
    dataloader_train = torch.utils.data.DataLoader(
        dataset_train,
        batch_sampler=batchsampler_train,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)
    # valid/test run one image at a time
    dataloader_valid = torch.utils.data.DataLoader(dataset_valid,
                                                   batch_size=1,
                                                   sampler=sampler_valid,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)
    dataloader_test = torch.utils.data.DataLoader(dataset_test,
                                                  batch_size=1,
                                                  sampler=sampler_test,
                                                  num_workers=args.workers,
                                                  collate_fn=utils.collate_fn)

    print("Creating model")
    # maskrcnn_resnet50_fpn
    model = maskrcnn_resnet50_fpn(num_classes=num_classes,
                                  pretrained=args.pretrained)
    # NOTE(review): comment said 0.7 but NMS IoU is actually set to 0.3
    model.roi_heads.nms_thresh = 0.3
    # set the max num of rois: 1000
    model.roi_heads.detections_per_img = 1000
    # default: 0.05, 0.5
    # model.roi_heads.score_thresh = 0.5
    model.to(device)

    # Keep a handle to the bare model; DDP wraps it under .module.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, dataloader_test, device,
                 is_vis=args.vis, draw_bbox=False, vis_dir=args.vis_dir)
        return

    print("Start training")
    start_time = time.time()
    best_score = 0
    iter_count = 0
    # linear warmup over the first 1000 iterations
    warmup_factor = 1. / 1000
    warmup_iters = 1000
    warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        iter_count, _ = train_one_epoch(model, optimizer, warmup_scheduler,
                                        dataloader_train, device, epoch,
                                        iter_count, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            # evaluate/checkpoint only every 100th epoch
            if ((epoch + 1) % 100) == 0:
                mAP_scores = evaluate(model, dataloader_valid, device=device)
                # assumes the checkpoint save belongs inside the
                # best-score branch (save best only) — reconstructed from
                # collapsed source; TODO confirm
                if best_score < mAP_scores['segm']:
                    best_score = mAP_scores['segm']
                    utils.save_on_master(
                        {
                            'model': model_without_ddp.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'lr_scheduler': lr_scheduler.state_dict(),
                            'args': args,
                            'epoch': epoch
                        },
                        os.path.join(args.output_dir + '_' + args.use_channel,
                                     'model_{}.pth'.format(epoch + 1)))
    # print(iter_count)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def train(args):
    """Train an instance-segmentation model on PennFudan and return the best one.

    Splits the dataset into train / last-50 test, trains for args.epochs with
    SGD + StepLR, tracks the best bbox AP seen, saves that model, and returns
    (best_model, max_map).
    """
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = PennFudanDataset(args.data_dir, get_transform(train=True))
    dataset_test = PennFudanDataset(args.data_dir, get_transform(train=False))

    # split the dataset in train and test set (last 50 samples held out)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=0,
                                              collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=args.test_batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    num_epochs = args.epochs
    max_map = 0
    best_model = model
    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        metric_logger = train_one_epoch(model, optimizer, data_loader, device,
                                        epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        coco_evaluator = evaluate(model, data_loader_test, device=device)
        # stats[0] is bbox AP@[.5:.95]; keep the best-performing model.
        # NOTE(review): `best_model = model` aliases the live model, so both
        # names point at the same (still-training) object.
        if max_map < coco_evaluator.coco_eval['bbox'].stats[0]:
            max_map = coco_evaluator.coco_eval['bbox'].stats[0]
            best_model = model
    # assumes the final save happens once, after the loop — reconstructed
    # from collapsed source; TODO confirm it was not per-epoch
    save_model(best_model, args.model_dir)
    print("That's it!")
    return best_model, max_map
# Reload the classifier checkpoint from the previous phase and sanity-check it
# on the edgeboxes test split before extracting features.
print("Reusing last checkpoint from phase:")
print(classifier_ckpt)
load_tbs = utils.load_checkpoint(classifier_ckpt)
core_model.load_state_dict(load_tbs['state_dict'])

dataset_test = VOC('07', 'edgeboxes', 'test',
                   included=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
base_val_loader = torch.utils.data.DataLoader(
    dataset_test, batch_size=3, shuffle=False, num_workers=1,
    collate_fn=utils.collate_fn)
core_model.to(device)
evaluate(core_model, base_val_loader, device=device)  # eval the checkpoint to verify

# Wrap the model so a forward pass returns the requested intermediate layer's
# activations (single tensor, inference mode).
model = ModelWrapper(core_model,
                     output_layer_names=[args.extract_features_from],
                     return_single=True)
model.eval()
model.to(device)

# switch to selective-search proposals for feature extraction
dataset = VOC('07', 'selective_search', 'trainval')
dataset_test = VOC('07', 'selective_search', 'test')

# define training and validation data loaders
# NOTE(review): this call is truncated here — its remaining arguments
# continue beyond this chunk.
base_train_loader = torch.utils.data.DataLoader(
    dataset,
def train():
    """Train a Faster R-CNN (44-class traffic-sign setup) on CPU.

    Splits ~500 images into 400 train / 100 test, trains for 1000 epochs,
    records per-epoch losses and the 12 pycocotools summary stats, then dumps
    everything with np.save and saves the model three ways.
    """
    ## Train the Model
    import utils as utils
    from engine import train_one_epoch, evaluate
    os.environ['TORCH_HOME'] = './'
    device = torch.device('cpu')
    num_classes = 44  # 44 classes = 43 + background

    dataset = MyDataset(image_dic=image_dict,
                        transform=transform_data(train=True))
    dataset_test = MyDataset(image_dic=image_dict,
                             transform=transform_data(train=False))
    # split data 400:100 for train:test with dataset ~500 images
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-100])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-100:])

    data_loader_train = torch.utils.data.DataLoader(
        dataset=dataset, batch_size=2, shuffle=False,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset=dataset_test, batch_size=2, shuffle=False,
        collate_fn=utils.collate_fn)

    # model — each constructor argument here should be understood/justified
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        pretrained=False,
        progress=True,
        num_classes=num_classes,
        pretrained_backbone=True)
    model.to(device)

    # start train
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.0005,
                                momentum=0.9,
                                weight_decay=0.0005)
    learning_rate_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=1, T_mult=2)
    epochs = 1000

    # per-epoch loss curves
    losses = []
    loss_box_reg = []
    loss_rpn_box_reg = []
    loss_classifier = []
    loss_objectness = []
    # per-epoch COCO summary stats (one list per stats index)
    stat0 = []
    stat1 = []
    stat2 = []
    stat3 = []
    stat4 = []
    stat5 = []
    stat6 = []
    stat7 = []
    stat8 = []
    stat9 = []
    stat10 = []
    stat11 = []
    torch.set_num_threads(4)

    for epoch in range(epochs):
        metrics = train_one_epoch(model, optimizer, data_loader_train, device,
                                  epoch, print_freq=50)
        np.save(str(epoch) + 'metric.h5', metrics)
        # SmoothedValue.__str__ prints "median (avg)"; take the first token.
        losses.append(float(str(metrics.meters['loss']).split(" ")[0]))
        loss_box_reg.append(
            float(str(metrics.meters['loss_box_reg']).split(" ")[0]))
        loss_rpn_box_reg.append(
            float(str(metrics.meters['loss_rpn_box_reg']).split(" ")[0]))
        loss_classifier.append(
            float(str(metrics.meters['loss_classifier']).split(" ")[0]))
        loss_objectness.append(
            float(str(metrics.meters['loss_objectness']).split(" ")[0]))
        learning_rate_scheduler.step()

        # Evaluate on the test dataset
        # _ gives coco_evaL obj from coco_eval.py from CocoEvaluator()
        _, metric_logger = evaluate(model, data_loader_test, device=device)
        # Stat object is from pycocotools' self.stats in summarize()
        # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
        stat = _.coco_eval['bbox'].stats
        # Append all stats
        stat0.append(stat[0])
        stat1.append(stat[1])
        stat2.append(stat[2])
        stat3.append(stat[3])
        stat4.append(stat[4])
        stat5.append(stat[5])
        stat6.append(stat[6])
        stat7.append(stat[7])
        stat8.append(stat[8])
        stat9.append(stat[9])
        stat10.append(stat[10])
        stat11.append(stat[11])
        print('')
        print('==================================================')
        print('')
        print('')
        print('')

    print("Done!")
    print('star0', stat0)
    print('star1', stat1)
    print('star2', stat2)
    print('star3', stat3)
    print('star4', stat4)
    print('star5', stat5)
    print('star6', stat6)
    print('star7', stat7)
    print('star8', stat8)
    print('star9', stat9)
    print('star10', stat10)
    print('star11', stat11)
    print('losses', losses)
    print('loss_box_reg', loss_box_reg)
    print('loss_rpn_box_reg', loss_rpn_box_reg)
    print('loss_classifier', loss_classifier)
    print('loss_objectness', loss_objectness)

    print("Save!")
    np.save('star0', stat0)
    np.save('star1', stat1)
    np.save('star2', stat2)
    np.save('star3', stat3)
    np.save('star4', stat4)
    np.save('star5', stat5)
    np.save('star6', stat6)
    np.save('star7', stat7)
    np.save('star8', stat8)
    np.save('star9', stat9)
    np.save('star10', stat10)
    np.save('star11', stat11)
    np.save('losses', losses)
    np.save('loss_box_reg', loss_box_reg)
    np.save('loss_rpn_box_reg', loss_rpn_box_reg)
    np.save('loss_classifier', loss_classifier)
    np.save('loss_objectness', loss_objectness)

    # save: whole pickled model, bare weights, and a resume checkpoint
    torch.save(model, r'./train1000.pkl')
    torch.save(model.state_dict(), 'train1000.pth')
    torch.save(
        {
            'epoch': epoch,
            "model_state_dict": model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, 'ckpt1000.pth')
def main(args):
    """Train/evaluate a temporal-proposal DETR-style model (THUMOS) end to end.

    Builds model/criterion/postprocessors, freezes parameter subsets by
    ``args.stage``, trains with AdamW + StepLR, tracks best AR@50 and best
    summed AR (AR@50+100+200) checkpoints, and plots loss/AR curves with
    matplotlib every other epoch.
    """
    utils.init_distributed_mode(args)
    print('git:\n {}\n'.format(utils.get_sha()))
    print(args)
    device = torch.device(args.device)
    print(device)
    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)
    # Stage-dependent freezing: stage 1 freezes 'iou' params, stage 2 trains
    # only 'class_embed', any other stage trains only 'iou' params.
    if args.stage == 1:
        for name, value in model_without_ddp.named_parameters():
            if 'iou' in name:
                value.requires_grad = False
        learned_params = filter(lambda p: p.requires_grad,
                                model_without_ddp.parameters())
    elif args.stage == 2:
        for name, value in model_without_ddp.named_parameters():
            if 'class_embed' not in name:
                value.requires_grad = False
        head_params = filter(lambda p: p.requires_grad,
                             model_without_ddp.parameters())
        learned_params = list(head_params)
    else:
        for name, value in model_without_ddp.named_parameters():
            if 'iou' not in name:
                value.requires_grad = False
        head_params = filter(lambda p: p.requires_grad,
                             model_without_ddp.parameters())
        learned_params = list(head_params)
    optimizer = torch.optim.AdamW(learned_params,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=thumos.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=thumos.collate_fn,
                                 num_workers=args.num_workers)
    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.rtd.load_state_dict(checkpoint['model'])
    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            print(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)
        # NOTE(review): start_epoch is set to checkpoint['epoch'] (not +1),
        # so the checkpointed epoch is re-run — confirm that is intended.
        args.start_epoch = checkpoint['epoch']
        pretrained_dict = checkpoint['model']
        # only resume part of model parameter: keep only keys that exist in
        # the current model, then merge into its state dict.
        model_dict = model_without_ddp.state_dict()
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        model_dict.update(pretrained_dict)
        model_without_ddp.load_state_dict(model_dict)
        # main_model.load_state_dict(checkpoint['state_dict'])
        print(("=> loaded '{}' (epoch {})".format(args.resume,
                                                  checkpoint['epoch'])))
    if args.load:
        checkpoint = torch.load(args.load, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
    if args.eval:
        # Evaluation-only path: summarize proposals, dump CSV, and exit.
        evaluator, eval_loss_dict = evaluate(model, criterion, postprocessors,
                                             data_loader_val, device, args)
        res = evaluator.summarize()
        test_stats, results_pd = eval_props(res)
        print('test_stats', test_stats)
        if args.output_dir:
            results_pd.to_csv(args.output_dir + 'results_eval.csv')
        return
    print('Start training')
    start_time = time.time()
    # One subplot per tracked loss term, for train and eval curves.
    fig1 = plt.figure('train', figsize=(18.5, 10.5))
    ax1_train = fig1.add_subplot(231)
    ax2_train = fig1.add_subplot(232)
    ax3_train = fig1.add_subplot(233)
    ax4_train = fig1.add_subplot(234)
    ax5_train = fig1.add_subplot(235)
    ax6_train = fig1.add_subplot(236)
    axs_train = {
        'loss_ce': ax1_train,
        'loss_bbox': ax2_train,
        'loss_giou': ax3_train,
        'cardinality_error': ax4_train,
        'class_error': ax5_train,
        'loss_iou': ax6_train
    }
    fig2 = plt.figure('eval', figsize=(18.5, 10.5))
    ax1_eval = fig2.add_subplot(231)
    ax2_eval = fig2.add_subplot(232)
    ax3_eval = fig2.add_subplot(233)
    ax4_eval = fig2.add_subplot(234)
    ax5_eval = fig2.add_subplot(235)
    ax6_eval = fig2.add_subplot(236)
    axs_eval = {
        'loss_ce': ax1_eval,
        'loss_bbox': ax2_eval,
        'loss_giou': ax3_eval,
        'cardinality_error': ax4_eval,
        'class_error': ax5_eval,
        'loss_iou': ax6_eval
    }
    # Colors keyed by AR@N proposal counts for the test-AR plot.
    colordict = {
        '50': 'g',
        '100': 'b',
        '200': 'purple',
        '500': 'orange',
        '1000': 'brown'
    }
    fig3 = plt.figure('test_AR')
    axs_test = fig3.add_subplot(111)
    epoch_list = []
    train_loss_list = {}
    eval_loss_list = {}
    test_stats_list = {}
    best_ar50 = 0
    best_sum_ar = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats, train_loss_dict = train_one_epoch(model, criterion,
                                                       data_loader_train,
                                                       optimizer, device,
                                                       epoch, args)
        # Accumulate per-epoch means of tracked loss terms; the KeyError
        # branch creates the list on first use.
        for key, value in train_loss_dict.items():
            if key in [
                    'loss_ce', 'loss_bbox', 'loss_giou', 'cardinality_error',
                    'class_error', 'loss_iou'
            ]:
                try:
                    train_loss_list[key].append(value.mean())
                except KeyError:
                    train_loss_list[key] = [value.mean()]
        lr_scheduler.step()
        # Periodic named checkpoint every 50 epochs.
        if epoch % 50 == 0 and args.output_dir:
            checkpoint_path = output_dir / 'checkpoint_epoch{}.pth'.format(
                epoch)
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        evaluator, eval_loss_dict = evaluate(model, criterion, postprocessors,
                                             data_loader_val, device, args)
        res = evaluator.summarize()
        test_stats, results_pd = eval_props(res)
        # AR values are fractions; store as percentages for plotting.
        for k, v in test_stats.items():
            try:
                test_stats_list[k].append(float(v) * 100)
            except KeyError:
                test_stats_list[k] = [float(v) * 100]
        for key, value in eval_loss_dict.items():
            if key in [
                    'loss_ce', 'loss_bbox', 'loss_giou', 'cardinality_error',
                    'class_error', 'loss_iou'
            ]:
                try:
                    eval_loss_list[key].append(value.mean())
                except KeyError:
                    eval_loss_list[key] = [value.mean()]
        print('test_stats', test_stats)
        # debug
        # if args.output_dir:
        #     results_pd.to_csv(args.output_dir+'results_epoch_{}.csv'.format(epoch))
        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_AR@{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        # Track and checkpoint the best AR@50 seen so far.
        if (float(test_stats['50']) > best_ar50):
            best_ar50 = float(test_stats['50'])
            with (output_dir / 'log_best_ar50.txt').open('w') as f:
                f.write(json.dumps(log_stats) + '\n')
            checkpoint_path = output_dir / 'checkpoint_best_ar50.pth'
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        # Track and checkpoint the best combined AR@50+AR@100+AR@200.
        current_sum_ar = float(test_stats['50']) + float(
            test_stats['100']) + float(test_stats['200'])
        if (current_sum_ar > best_sum_ar):
            best_sum_ar = current_sum_ar
            with (output_dir / 'log_best_sum_ar.txt').open('w') as f:
                f.write(json.dumps(log_stats) + '\n')
            checkpoint_path = output_dir / 'checkpoint_best_sum_ar.pth'
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        if args.output_dir and utils.is_main_process():
            with (output_dir / 'log.txt').open('a') as f:
                f.write(json.dumps(log_stats) + '\n')
        epoch_list.append(epoch)
        # Refresh plots every other epoch.
        if epoch % 2 == 0:
            # split, loss_dict, axs, epoch, color_dict
            draw_stats(axs_test, test_stats_list, epoch_list, colordict)
            axs_test.legend()
            draw('train', train_loss_list, axs_train, epoch, 'b')
            draw('eval', eval_loss_list, axs_eval, epoch, 'g')
            fig1.savefig('train_loss_curve.jpg', dpi=300)
            fig2.savefig('eval_loss_curve.jpg', dpi=300)
            fig3.savefig('test_ar.jpg')
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train/evaluate a torchvision detection model (reference-style script).

    Builds datasets and (optionally grouped/distributed) loaders, constructs
    the model named by ``args.model`` from ``torchvision.models.detection``,
    sets up SGD/AdamW + MultiStepLR/CosineAnnealingLR, optionally resumes,
    then trains for ``args.epochs`` epochs evaluating after each one.
    """
    if args.output_dir:
        utils.mkdir(args.output_dir)
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)
    if args.use_deterministic_algorithms:
        torch.use_deterministic_algorithms(True)
    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train",
                                       get_transform(True, args),
                                       args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val",
                                  get_transform(False, args), args.data_path)
    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test, shuffle=False)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)
    # Group images of similar aspect ratio into batches to reduce padding.
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(
            dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    train_collate_fn = utils.collate_fn
    # SimpleCopyPaste augmentation requires the "lsj" policy.
    if args.use_copypaste:
        if args.data_augmentation != "lsj":
            raise RuntimeError("SimpleCopyPaste algorithm currently only supports the 'lsj' data augmentation policies")
        train_collate_fn = copypaste_collate_fn
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=train_collate_fn
    )
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn
    )
    print("Creating model")
    kwargs = {"trainable_backbone_layers": args.trainable_backbone_layers}
    if args.data_augmentation in ["multiscale", "lsj"]:
        # Skip the model's internal resize; the transform pipeline handles it.
        kwargs["_skip_resize"] = True
    if "rcnn" in args.model:
        if args.rpn_score_thresh is not None:
            kwargs["rpn_score_thresh"] = args.rpn_score_thresh
    # Look up the model constructor by name in torchvision's detection module.
    model = torchvision.models.detection.__dict__[args.model](
        weights=args.weights, weights_backbone=args.weights_backbone,
        num_classes=num_classes, **kwargs
    )
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    # Optionally give normalization layers their own weight decay.
    if args.norm_weight_decay is None:
        parameters = [p for p in model.parameters() if p.requires_grad]
    else:
        param_groups = torchvision.ops._utils.split_normalization_params(model)
        wd_groups = [args.norm_weight_decay, args.weight_decay]
        parameters = [{"params": p, "weight_decay": w}
                      for p, w in zip(param_groups, wd_groups) if p]
    opt_name = args.opt.lower()
    if opt_name.startswith("sgd"):
        optimizer = torch.optim.SGD(
            parameters,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            nesterov="nesterov" in opt_name,
        )
    elif opt_name == "adamw":
        optimizer = torch.optim.AdamW(parameters, lr=args.lr,
                                      weight_decay=args.weight_decay)
    else:
        raise RuntimeError(f"Invalid optimizer {args.opt}. Only SGD and AdamW are supported.")
    # AMP gradient scaler only when mixed precision is requested.
    scaler = torch.cuda.amp.GradScaler() if args.amp else None
    args.lr_scheduler = args.lr_scheduler.lower()
    if args.lr_scheduler == "multisteplr":
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
    elif args.lr_scheduler == "cosineannealinglr":
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    else:
        raise RuntimeError(
            f"Invalid lr scheduler '{args.lr_scheduler}'. Only MultiStepLR and CosineAnnealingLR are supported."
        )
    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model_without_ddp.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        args.start_epoch = checkpoint["epoch"] + 1
        if args.amp:
            scaler.load_state_dict(checkpoint["scaler"])
    if args.test_only:
        # We disable the cudnn benchmarking because it can noticeably affect
        # the accuracy; evaluate once and exit.
        torch.backends.cudnn.deterministic = True
        evaluate(model, data_loader_test, device=device)
        return
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch,
                        args.print_freq, scaler)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "args": args,
                "epoch": epoch,
            }
            if args.amp:
                checkpoint["scaler"] = scaler.state_dict()
            utils.save_on_master(checkpoint,
                                 os.path.join(args.output_dir,
                                              f"model_{epoch}.pth"))
            utils.save_on_master(checkpoint,
                                 os.path.join(args.output_dir,
                                              "checkpoint.pth"))
        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def trainModel():
    """Train an instance-segmentation model on PennFudanPed for 10 epochs.

    Splits the dataset into train/test (last 50 samples held out), trains
    with SGD + StepLR, evaluates on the test split after every epoch, and
    returns the trained model.
    """
    # Build two views of the dataset: one with training transforms, one without.
    ds_train = PennFudanDataset('PennFudanPed', get_transform(train=True))
    ds_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # Deterministic shuffle, then hold out the final 50 samples for testing.
    torch.manual_seed(1)
    order = torch.randperm(len(ds_train)).tolist()
    ds_train = torch.utils.data.Subset(ds_train, order[:-50])
    ds_test = torch.utils.data.Subset(ds_test, order[-50:])

    # Data loaders for training (shuffled) and evaluation (sequential).
    loader_train = torch.utils.data.DataLoader(ds_train,
                                               batch_size=2,
                                               shuffle=True,
                                               num_workers=4,
                                               collate_fn=utils.collate_fn)
    loader_test = torch.utils.data.DataLoader(ds_test,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    # Prefer GPU when one is available.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Two classes only: background and person.
    num_classes = 2
    model, in_feat, hidd_layer = get_instance_segmentation_model(num_classes)
    model.to(device)

    # Optimize only parameters that require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # Decay the learning rate by 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    num_epochs = 10
    for epoch in range(num_epochs):
        # One pass over the training data, logging every 10 iterations.
        train_one_epoch(model, optimizer, loader_train, device, epoch,
                        print_freq=10)
        lr_scheduler.step()
        # Evaluate on the held-out split.
        evaluate(model, loader_test, device=device)

    return model
#torch.save(mask_rcnn.state_dict(), save_param) # train for one epoch, printing every 10 iterations print(save_param) train_one_epoch(mask_rcnn, optimizer, data_loader, device, epoch, print_freq=100) # update the learning rate lr_scheduler.step() # evaluate on the test dataset #print('\n') #print("trained_param_4/epoch_00%02d.param" % epoch) #mask_rcnn.load_state_dict(torch.load("trained_param_4/epoch_00%02d.param" % epoch)) evaluate(mask_rcnn, data_loader_test, device=device) #save_param = "trained_param_8_fresh/epoch_{:04d}.param".format(epoch) torch.save(mask_rcnn.state_dict(), save_param) ''' for epoch in range(init_epoch, init_epoch + num_epochs): #save_param = "trained_param_3_fresh/epoch_{:04d}.param".format(epoch) #torch.save(mask_rcnn.state_dict(), save_param) # train for one epoch, printing every 10 iterations #train_one_epoch(mask_rcnn, optimizer, data_loader, device, epoch, print_freq=100) # update the learning rate #lr_scheduler.step() # evaluate on the test dataset print('\n') name = "trained_param_8/epoch_00%02d.param" % epoch
def main(args):
    """Train or evaluate a DETR-style detector, logging metrics to wandb.

    Builds the model/criterion/postprocessors, creates train/val loaders
    (with per-split duplication factors), optionally resumes from a
    checkpoint, and either runs a single evaluation (``args.eval``) or the
    full training loop with per-epoch COCO evaluation and checkpointing.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone gets its own (usually lower) learning rate.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', duplication_factor=200,
                                  args=args)
    dataset_val = build_dataset(image_set='val', duplication_factor=10,
                                args=args)
    dataset_train_size = len(dataset_train)
    # BUG FIX: was `len(dataset_train)`, which made every eval `num_batches`
    # computation use the training-set size.
    dataset_val_size = len(dataset_val)
    print("training dataset size: ", dataset_train_size)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        # NOTE(review): shuffle=True on the validation sampler is unusual for
        # evaluation — confirm this is intentional.
        sampler_val = DistributedSampler(dataset_val, shuffle=True)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        # BUG FIX: `epoch` was referenced here before assignment (the loop
        # below had not run). Use the resumed start epoch for the evaluator.
        # Also size num_batches by the *validation* set being iterated.
        epoch = args.start_epoch
        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds,
            device, args.output_dir, WandbEvaluator(epoch), epoch,
            num_batches=(dataset_val_size // args.batch_size))
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    # wandb monitor model during training
    wandb.config.update(args)
    # wandb.watch(model)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            num_batches=(dataset_train_size // args.batch_size),
            max_norm=args.clip_max_norm,
            postprocessors=postprocessors,
            wandb_evaluator=WandbEvaluator(epoch))
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds,
            device, args.output_dir, WandbEvaluator(epoch), epoch,
            num_batches=(dataset_val_size // args.batch_size))
        log_stats = {**{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        wandb.log(log_stats)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(): print('___main____---') # 在GPU上训练,若无GPU,可选择在CPU上训练 device = torch.device( 'cuda') if torch.cuda.is_available() else torch.device('cpu') # 我们的数据集只有两个类 - 背景和人 num_classes = 2 # 使用我们的数据集和定义的转换 dataset = PennFudanDataset('PennFudanPed', get_transform(train=True)) dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False)) # 在训练和测试集中拆分数据集 indices = torch.randperm(len(dataset)).tolist() dataset = torch.utils.data.Subset(dataset, indices[:-50]) dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:]) # 定义训练和验证数据加载器 data_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=1, collate_fn=utils.collate_fn) data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1, shuffle=False, num_workers=1, collate_fn=utils.collate_fn) # 使用我们的辅助函数获取模型 model = get_model_instance_segmentation(num_classes) # 将我们的模型迁移到合适的设备 model.to(device) # 构造一个优化器 params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005) # 和学习率调度程序 lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1) # 训练10个epochs num_epochs = 1 for epoch in range(num_epochs): # 训练一个epoch,每10次迭代打印一次 train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) # 更新学习速率 lr_scheduler.step() # 在测试集上评价 evaluate(model, data_loader_test, device=device) print("That's it!")
def train(model, trainloader, testloader, criterion, optimizer, scheduler,
          num_epochs, device, mode, detection):
    """
    Train and evaluate a model with CPU or GPU.

    :param model: classifier to train on top of mobilenet
    :param trainloader: dataloader for trainset
    :param testloader: dataloader for testset
    :param criterion: loss function
    :param optimizer: optimization method
    :param scheduler: learning rate scheduler for adaptive learning
    :param num_epochs: number of epochs
    :param device: device to train on (CPU or GPU)
    :param mode: trainings mode (used to name output files)
    :param detection: train object detection or classification
    :return: None
    """
    print("Training on:", device)
    if detection and mode == 'faster_rcnn':
        # Detection path: delegate to the reference training/eval helpers.
        for epoch in range(num_epochs):
            train_one_epoch(model, optimizer, trainloader, device, epoch + 1,
                            print_freq=10)
            scheduler.step()
        print("Evaluation on trainset:")
        evaluate(model, trainloader, device=device)
        print("Evaluation on testset:")
        evaluate(model, testloader, device=device)
    else:
        # Classification path with manual loop, logging and plotting.
        loss_hist = []
        train_acc_hist = []
        test_acc_hist = []
        for epoch in range(num_epochs):
            # training
            start = time.time()
            model.train()
            # BUG FIX: running_loss must be reset each epoch; previously it
            # carried over the prior epoch's average, contaminating the loss
            # history from the second epoch onward.
            running_loss = 0.0
            for img, label in trainloader:
                img = img.to(device)
                label = label.to(device)
                optimizer.zero_grad()
                out = model(img)
                loss = criterion(out, label)
                loss.backward()
                optimizer.step()
                # record
                running_loss += torch.mean(loss).item()
            scheduler.step()
            # evaluation
            model.eval()
            running_loss /= len(trainloader)
            train_acc = evaluate_acc(model, trainloader, device)
            test_acc = evaluate_acc(model, testloader, device)
            loss_hist.append(running_loss)
            train_acc_hist.append(train_acc)
            test_acc_hist.append(test_acc)
            print(
                'epoch {} \t loss {:.5f} \t train acc {:.3f} \t test acc {:.3f} \t time {:.1f} sec'
                .format(epoch + 1, running_loss, train_acc, test_acc,
                        time.time() - start))
        # create output directories if missing
        if not os.path.exists('./plots'):
            os.mkdir('./plots')
        if not os.path.exists('./models'):
            os.mkdir('./models')
        if not os.path.exists('./logs'):
            os.mkdir('./logs')
        # create name extension
        name = '_' + mode
        # save loss plot
        plt.figure(num=None, figsize=(8, 6))
        plt.plot(loss_hist)
        plt.grid(True, which="both")
        plt.xlabel('epoch', fontsize=14)
        plt.ylabel('average loss', fontsize=14)
        plt.savefig(os.path.join('./plots', 'loss' + name + '.png'))
        # save train accuracy plot
        plt.figure(num=None, figsize=(8, 6))
        plt.plot(train_acc_hist)
        plt.grid(True, which='both')
        plt.xlabel('epoch', fontsize=14)
        plt.ylabel('accuracy', fontsize=14)
        plt.savefig(os.path.join('./plots', 'train_acc' + name + '.png'))
        # save test accuracy plot
        plt.figure(num=None, figsize=(8, 6))
        plt.plot(test_acc_hist)
        plt.grid(True, which='both')
        plt.xlabel('epoch', fontsize=14)
        plt.ylabel('accuracy', fontsize=14)
        plt.savefig(os.path.join('./plots', 'test_acc' + name + '.png'))
        # close all figures
        plt.close("all")
        # save model weights (NOTE: this moves the model to CPU as a side
        # effect, matching the original behavior)
        torch.save(
            model.to('cpu').state_dict(),
            os.path.join('./models', 'net' + name + '.pt'))
        # save logs
        file = open(os.path.join('./logs', 'log' + name + '.txt'), 'w')
        print('Final Loss:', loss_hist[-1], file=file)
        print('Final Train Accuracy:', train_acc_hist[-1], file=file)
        print('Final Test Accuracy:', test_acc_hist[-1], file=file)
        # save variables
        with open(os.path.join('./logs', 'log' + name + '.pkl'), 'wb') as f:
            pickle.dump([loss_hist, train_acc_hist, test_acc_hist], f)
def main(args):
    """Train/evaluate a PVT-style image classifier with optional DDP,
    mixup/cutmix augmentation, LR scaling, and checkpointing.

    The upstream script's distillation and model-EMA paths are disabled here
    (criterion is wrapped in a no-op DistillationLoss; model_ema is None).
    """
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so workers differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    cudnn.benchmark = True

    print('dataset build init....')
    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)
    print('dataset build finish....')

    if True:  # args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            # BUG FIX: num_replicas must be the world size; the previous
            # hard-coded num_replicas=0 is invalid (DistributedSampler
            # divides dataset length by num_replicas).
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            # BUG FIX: same num_replicas=0 problem as the train sampler.
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    print('data loader init....')
    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    # Larger eval batches are safe: no gradients are held.
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        sampler=sampler_val,
        batch_size=int(1.5 * args.batch_size),
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')
        # BUG FIX: the original referenced `model_without_ddp` (defined only
        # later) and an undefined `checkpoint_model` here, raising NameError.
        checkpoint_model = checkpoint['model'] if 'model' in checkpoint else checkpoint
        state_dict = model.state_dict()
        # Drop classification heads whose shape no longer matches the model.
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias'
        ]:
            if k in checkpoint_model and checkpoint_model[
                    k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]
        # strict=False: the removed head keys (and any extras) are tolerated.
        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    # Model EMA is disabled in this variant.
    model_ema = None

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Scale the base LR linearly with the global batch size (reference 512).
    linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size(
    ) / 512.0
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    # Loss selection: mixup handles smoothing itself via soft targets.
    if args.mixup > 0.:
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()
    # Distillation disabled: with type 'none' this wrapper just dispatches
    # to the base criterion.
    criterion = DistillationLoss(criterion, None, 'none', 0, 0)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        if 'model' in checkpoint:
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    # Drop-path warm-up: keep drop path at 0 for the first N epochs
    # (disabled entirely for pvt_tiny / pvt_small).
    max_epoch_dp_warm_up = 100
    if 'pvt_tiny' in args.model or 'pvt_small' in args.model:
        max_epoch_dp_warm_up = 0
    if args.start_epoch < max_epoch_dp_warm_up:
        model_without_ddp.reset_drop_path(0.0)
    for epoch in range(args.start_epoch, args.epochs):
        # fp32_resume forces full precision briefly after resuming.
        if args.fp32_resume and epoch > args.start_epoch + 1:
            args.fp32_resume = False
        loss_scaler._scaler = torch.cuda.amp.GradScaler(
            enabled=not args.fp32_resume)

        if epoch == max_epoch_dp_warm_up:
            model_without_ddp.reset_drop_path(args.drop_path)

        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            set_training_mode=args.finetune == '',  # keep in eval mode during finetuning
            fp32=args.fp32_resume)

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)

        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
return F.softmax(output, dim=dim) if loss_fn == "nll": return F.log_softmax(output, dim=dim) if loss_fn in ["bce", "wbce", "wbce1"]: return torch.sigmoid(output) # In[ ]: for epoch in range(0, 20): # train for one epoch, printing every 10 iterations train_one_epoch(model, optimizerNew, TrainLoaderNew, device, epoch, print_freq=10) # update the learning rate lr_scheduler.step() # evaluate on the test dataset evaluate(model, valLoaderNew, device=device) model_file = 'modelobj/fasterrcnn_model_' + str(epoch) + '.pth' torch.save({'modelObjectDetection_state_dict': model.state_dict()},model_file) print('\nSaved model to ' + model_file ) !zip -r /content/modelobj.zip /content/modelobj from google.colab import files files.download("/content/modelobj.zip") '''from google.colab import drive drive.mount('/data/') from pathlib import Path base_dir = ('/data/My Drive')'''
def main(args):
    """Train a tracking model and periodically score it on VOT-style benchmarks.

    Sets up (distributed) training, AdamW with a separate backbone LR group,
    checkpoints every epoch, and — on the main process — runs an external
    benchmark test/eval pipeline every ``args.benchmark_test_step`` epochs,
    snapshotting inference-only checkpoints whenever EAO or accuracy/robustness
    improves.
    """
    utils.init_distributed_mode(args)
    print("args: {}".format(args))
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # special process to control whether freeze backbone
    args.model.train_backbone = args.lr_backbone > 0
    model, criterion, postprocessors = build_model(args.model)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Two parameter groups: backbone parameters get their own (usually
    # smaller) learning rate ``args.lr_backbone``.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   args.lr_drop,
                                                   gamma=args.lr_gamma)

    # Datasets need the backbone stride to build matching label maps.
    dataset_train = build_dataset(
        image_set='train',
        args=args.dataset,
        model_stride=model_without_ddp.backbone.stride)
    dataset_val = build_dataset(
        image_set='val',
        args=args.dataset,
        model_stride=model_without_ddp.backbone.stride)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Restore training state only when the checkpoint carries it.
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # Configure the external benchmark test/eval scripts from their own
    # argument-parser defaults, then point them at this run's model/output.
    benchmark_test_parser = benchmark_test.get_args_parser()
    benchmark_test_args = benchmark_test_parser.get_defaults()
    benchmark_test_args.tracker.model = args.model  # overwrite the parameters about network model
    benchmark_test_args.result_path = Path(
        os.path.join(args.output_dir, 'benchmark'))
    benchmark_test_args.dataset_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'benchmark')
    benchmark_eval_parser = benchmark_eval.get_args_parser()
    benchmark_eval_args = benchmark_eval_parser.get_defaults()
    benchmark_eval_args.tracker_path = benchmark_test_args.result_path

    best_eao = 0
    best_ar = [0, 10]  # accuracy & robustness (robustness: lower is better)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        # training
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every args.model_save_step epochs
            if (epoch + 1) % args.lr_drop == 0 or (
                    epoch + 1) % args.model_save_step == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
            # hack: only inference model (weights only, no optimizer state)
            utils.save_on_master({'model': model_without_ddp.state_dict()},
                                 output_dir / 'checkpoint_only_inference.pth')

        # evaluate on the validation split
        val_stats = evaluate(model, criterion, postprocessors,
                             data_loader_val, device, args.output_dir)
        log_stats = {
            'epoch': epoch,
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'val_{k}': v for k, v in val_stats.items()},
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

        # evaluate with the external benchmark (main process only)
        if utils.is_main_process():
            if (epoch + 1) % args.benchmark_test_step == 0 and epoch > args.benchmark_start_epoch:
                tracker = build_tracker(benchmark_test_args.tracker,
                                        model=model_without_ddp,
                                        postprocessors=postprocessors)
                benchmark_test_args.model_name = "epoch" + str(epoch)
                benchmark_start_time = time.time()
                benchmark_test.main(benchmark_test_args, tracker)
                benchmark_time = time.time() - benchmark_start_time
                benchmark_eval_args.model_name = "epoch" + str(epoch)
                benchmark_eval_args.tracker_prefix = "epoch" + str(epoch)
                eval_results = benchmark_eval.main(benchmark_eval_args)
                # eval_results maps tracker name -> metrics; only one tracker runs here
                eval_result = list(eval_results.values())[0]
                if benchmark_test_args.dataset in ['VOT2018', 'VOT2019']:
                    if args.output_dir:
                        with (output_dir /
                              str("benchmark_" + benchmark_test_args.dataset +
                                  ".txt")).open("a") as f:
                            f.write("epoch: " + str(epoch) + ", best EAO: " +
                                    str(best_eao) + ", " +
                                    json.dumps(eval_result) + "\n")
                    # Snapshot inference-only weights on a new best EAO.
                    if best_eao < eval_result['EAO']:
                        best_eao = eval_result['EAO']
                        if args.output_dir:
                            best_eao_int = int(best_eao * 1000)
                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_eao_{best_eao_int:03}_only_inference.pth'
                            )
                    # Snapshot when BOTH accuracy improves and robustness drops.
                    if best_ar[0] < eval_result['accuracy'] and best_ar[
                            1] > eval_result['robustness']:
                        best_ar[0] = eval_result['accuracy']
                        best_ar[1] = eval_result['robustness']
                        if args.output_dir:
                            best_accuracy_int = int(best_ar[0] * 1000)
                            best_robustness_int = int(best_ar[1] * 1000)
                            # record: only inference model
                            utils.save_on_master(
                                {'model': model_without_ddp.state_dict()},
                                output_dir /
                                f'checkpoint{epoch:04}_best_ar_{best_accuracy_int:03}_{best_robustness_int:03}_only_inference.pth'
                            )
                print("benchmark time: {}".format(benchmark_time))
        if args.distributed:
            # keep ranks in lockstep while rank 0 runs the benchmark
            torch.distributed.barrier()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train an image-classification transformer (DeiT/PVT-style pipeline).

    Builds datasets and (always-distributed) samplers, optionally loads a
    finetuning checkpoint with position-embedding interpolation, supports
    EMA weights, mixup/cutmix, resume, eval-only mode, and periodic
    evaluation every ``args.evaluate_freq`` epochs.
    """
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if True:  # args.distributed:  (distributed path forced on)
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(dataset_train,
                                      num_replicas=num_tasks,
                                      rank=global_rank,
                                      shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train,
                num_replicas=num_tasks,
                rank=global_rank,
                shuffle=True)
    else:
        # dead branch while the condition above is hard-coded to True
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    # Larger eval batch: no gradients/activations are kept during eval.
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=int(3.0 * args.batch_size),
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False)

    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.finetune,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')
        if 'model' in checkpoint.keys():
            checkpoint_model = checkpoint['model']
        else:
            checkpoint_model = checkpoint
        state_dict = model.state_dict()
        # Drop classifier heads whose shape no longer matches (e.g. a
        # different number of classes than the pretraining task).
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias', 'trans_cls_head.weight',
                'trans_cls_head.bias', 'conv_cls_head.weight',
                'conv_cls_head.bias'
        ]:
            if k in checkpoint_model and checkpoint_model[
                    k].shape != state_dict[k].shape:
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]
        if 'pos_embed' in checkpoint_model.keys():
            # interpolate position embedding to the new patch-grid size
            pos_embed_checkpoint = checkpoint_model['pos_embed']
            embedding_size = pos_embed_checkpoint.shape[-1]
            num_patches = model.patch_embed.num_patches
            num_extra_tokens = model.pos_embed.shape[-2] - num_patches
            # height (== width) for the checkpoint position embedding
            orig_size = int(
                (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5)
            # height (== width) for the new position embedding
            new_size = int(num_patches**0.5)
            # class_token and dist_token are kept unchanged
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                            embedding_size).permute(
                                                0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens,
                size=(new_size, new_size),
                mode='bicubic',
                align_corners=False)
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model['pos_embed'] = new_pos_embed
        model.load_state_dict(checkpoint_model, strict=False)
    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP
        # but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # LR scaling by world batch size intentionally disabled here.
    # linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0
    # args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    criterion = LabelSmoothingCrossEntropy()
    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # pdb.set_trace()
        if 'model' in checkpoint.keys():
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print("Start training")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            loss_scaler, args.clip_grad, model_ema, mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )
        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        # NOTE(review): called even when args.model_ema is
                        # False and model_ema is None — looks like it would
                        # raise; confirm get_state_dict tolerates None.
                        'model_ema': get_state_dict(model_ema),
                        'args': args,
                    }, checkpoint_path)

        if epoch % args.evaluate_freq == 0:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')
            log_stats = {
                **{f'train_{k}': v for k, v in train_stats.items()},
                **{f'test_{k}': v for k, v in test_stats.items()},
                'epoch': epoch,
                'n_parameters': n_parameters
            }
            if args.output_dir and utils.is_main_process():
                with (output_dir / "log.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
# Evaluation/visualization snippet: sample 5 random items from DS_eval,
# run the detector on them, and print the predicted boxes.
# Relies on globals defined elsewhere: DS_eval, EvalSet, net, device, utils.

# Pick 5 random indices (with possible repeats) and keep the raw images.
indices = [random.randint(0, len(DS_eval) - 1) for _ in range(5)]
img_arr = [DS_eval.get_img(_) for _ in indices]
DS_eval = data.Subset(DS_eval, indices)

# Materialize (image, label) pairs and rewrap them in a small eval dataset.
img, label = [], []
for i, l in DS_eval:
    img.append(i)
    label.append(l)
DS_eval = EvalSet(img, label, img_arr)
DL_eval = data.DataLoader(DS_eval,
                          batch_size=5,
                          shuffle=False,
                          collate_fn=utils.collate_fn,
                          num_workers=12)

evaluate(net, DL_eval, device)

predictions = []
net.to(device)
net.eval()
with torch.no_grad():
    # Single batch: batch_size == len(DS_eval) == 5.
    image, targets = next(iter(DL_eval))
    print(image[0].shape)
    image = list(img.to(device) for img in image)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    predictions = net(image)  # Returns losses and detections

final_img = []
ans = []
for i in range(len(image)):
    # Predicted boxes for image i, moved to CPU for printing.
    xy_lst = predictions[i]['boxes'].to(torch.device("cpu")).numpy()
    print(xy_lst)
# gamma=0.1) optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3) lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.3, patience=5) num_epochs = 100 for epoch in range(num_epochs): # train for one epoch, printing every 10 iterations train_metric_logger = train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10) content = f"epoch: {epoch:2d}; loss = {train_metric_logger.meters['loss']}; loss_mask = {train_metric_logger.meters['loss_mask']}" print(content) logger.info(content) # update the learning rate # lr_scheduler.step() # evaluate on the test dataset coco_evaluator, test_metric_logger = evaluate(model, data_loader_test, device=device) content = f"epoch: {epoch:2d}; loss = {test_metric_logger.meters['model_time']}; loss_mask = {test_metric_logger.meters['evaluator_time']}" print(content) logger.info(content) torch.save( model.state_dict(), '/root/code/model_state/mask_rcnn/mask_rcnn_0409_%d.pth' % epoch)
def main(args):
    """Train/evaluate a Deformable-DETR-style detector on COCO.

    Three optimizer parameter groups (base, backbone, linear-projection),
    optional SGD, node-aware distributed samplers with cache mode, a resume
    path that rebuilds LR state (with a deliberate lr_drop override hack),
    and COCO evaluation + checkpointing every epoch.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            # node-local caching variant of the distributed sampler
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val,
                                                      shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        # True if any keyword is a substring of parameter name n.
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # Parameter groups: base LR, backbone LR, and scaled LR for the
    # deformable-attention linear projections.
    param_dicts = [
        {
            "params":
                [p for n, p in model_without_ddp.named_parameters()
                 if not match_name_keywords(n, args.lr_backbone_names)
                 and not match_name_keywords(n, args.lr_linear_proj_names)
                 and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params":
                [p for n, p in model_without_ddp.named_parameters()
                 if match_name_keywords(n, args.lr_backbone_names)
                 and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params":
                [p for n, p in model_without_ddp.named_parameters()
                 if match_name_keywords(n, args.lr_linear_proj_names)
                 and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        }
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        # Ignore profiler bookkeeping keys injected by FLOPs counters.
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            # Keep the current run's LRs instead of the checkpointed ones.
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from
            # checkpoint and also modify lr scheduler (e.g., decrease lr in
            # advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print(
                    'Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'],
                        optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(
                model, criterion, postprocessors, data_loader_val, base_ds,
                device, args.output_dir
            )

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds,
            device, args.output_dir
        )

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Train Mask R-CNN on the PennFudan pedestrian dataset, then save weights.

    Runs 10 epochs of SGD with a StepLR schedule, evaluating on a 50-image
    hold-out after every epoch, and writes the final state dict to
    ``data/PennFudanPed/weight.pt``.
    """
    # Prefer the GPU; fall back to CPU when CUDA is unavailable.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Two classes only: background and person.
    num_classes = 2

    # Same underlying data, different augmentation pipelines.
    train_ds = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
    test_ds = PennFudanDataset('data/PennFudanPed', get_transform(train=False))

    # Random split: everything but the last 50 samples trains; the rest tests.
    order = torch.randperm(len(train_ds)).tolist()
    train_ds = torch.utils.data.Subset(train_ds, order[:-50])
    test_ds = torch.utils.data.Subset(test_ds, order[-50:])

    train_loader = torch.utils.data.DataLoader(train_ds,
                                               batch_size=2,
                                               shuffle=True,
                                               num_workers=4,
                                               collate_fn=utils.collate_fn)
    test_loader = torch.utils.data.DataLoader(test_ds,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Optimize only the trainable parameters.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # Decay the learning rate by 10x every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    for epoch in range(10):
        # Train one epoch, printing every 10 iterations.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=10)
        lr_scheduler.step()
        # Score on the held-out test set.
        evaluate(model, test_loader, device=device)

    print("That's it!")
    torch.save(model.state_dict(), 'data/PennFudanPed/weight.pt')
    print('saved weights of the model')
def main(args):
    """Train/evaluate a DETR-style detector with TensorBoard logging.

    Identical in shape to the upstream DETR driver but threads a
    SummaryWriter (keyed by ``args.comment``) through training and
    evaluation and logs per-epoch scalars under ``train/`` and ``test/``.
    """
    # One log directory per experiment, keyed by args.comment.
    writer = SummaryWriter(log_dir=f'logs/{args.comment}', filename_suffix='')
    test_cuda()
    writer.add_hparams(args.__dict__, {})
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Backbone parameters get a separate learning rate.
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir,
                                              epoch=0, writer=writer,
                                              args=args)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm, writer=writer,
                                      args=args)
        # NOTE(review): assumes every value in train_stats is a scalar
        # add_scalar can accept — confirm against train_one_epoch's output.
        for key, val in train_stats.items():
            writer.add_scalar('train/' + key, val, epoch)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / f'checkpoint_{args.comment}.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(
                    output_dir / f'checkpoint{epoch:04}_{args.comment}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion,
                                              postprocessors,
                                              data_loader_val, base_ds,
                                              device, args.output_dir,
                                              epoch=epoch, writer=writer,
                                              args=args)
        for key, val in test_stats.items():
            writer.add_scalar('test/' + key, val, epoch)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def train_mydata():
    """Fine-tune a Mask R-CNN (ResNet-18 FPN backbone) on the PennFudan
    pedestrian dataset, then overlay predicted masks on held-out images."""
    # Two views of the same data: one with train-time transforms, one without.
    train_ds = PennFudanDataset(dataset_dir, get_transform(train=True))
    test_ds = PennFudanDataset(dataset_dir, get_transform(train=False))

    # Deterministic shuffle; the last 50 samples are held out for evaluation.
    torch.manual_seed(1)
    order = torch.randperm(len(train_ds)).tolist()
    train_ds = torch.utils.data.Subset(train_ds, order[:-50])
    test_ds = torch.utils.data.Subset(test_ds, order[-50:])

    train_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    test_loader = torch.utils.data.DataLoader(
        test_ds, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The dataset has only two classes: background and person.
    num_classes = 2
    # Head-only alternative: get_instance_segmentation_model(num_classes);
    # here the whole backbone is swapped for ResNet-18 FPN instead.
    model = maskrcnn_resnet18_fpn(num_classes)
    model.to(device)

    # Optimize only the parameters that require gradients.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(trainable, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # Decay the learning rate by 10x every 3 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=3, gamma=0.1)

    for epoch in range(10):
        # Train one epoch, logging every 10 iterations, then step LR and eval.
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=10)
        scheduler.step()
        evaluate(model, test_loader, device=device)

    # Qualitative check: blend the top predicted mask over each source image.
    model.eval()
    for sample_idx in range(10):
        img, _ = test_ds[sample_idx]
        with torch.no_grad():
            outputs = model([img.to(device)])
        base_img = img.mul(255).permute(1, 2, 0).byte().numpy()
        mask = outputs[0]['masks'][0, 0].mul(255).byte().cpu().numpy()
        mask = np.expand_dims(mask, -1).repeat(3, axis=-1)
        blended = cv2.addWeighted(base_img, 0.5, mask, 0.5, 0)
        cv2.imshow("result", blended)
        cv2.waitKey(0)
def train(fold):
    """Train SEResNeXt-50 on one melanoma CV fold, early-stopping on val AUC.

    Parameters
    ----------
    fold : int
        The kfold index held out for validation; all remaining folds train.

    Side effects: saves the best model to ``model_fold_{fold}.bin`` via
    EarlyStopping and prints per-epoch AUC.
    """
    training_data_path = "../input/jpeg/train/"
    model_path = "../input/"
    df = pd.read_csv("../input/train_folds.csv")
    device = "cpu"
    epochs = 50
    train_bs = 32
    valid_bs = 16
    # ImageNet channel statistics expected by the pretrained backbone.
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Light geometric augmentation for training; normalization only for validation.
    train_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True),
            albumentations.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=15),
            albumentations.Flip(p=0.5)
        ]
    )
    valid_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True)
        ]
    )

    train_images = df_train.image_name.values.tolist()
    train_images = [os.path.join(training_data_path, i + ".jpg") for i in train_images]
    train_targets = df_train.target.values
    valid_images = df_valid.image_name.values.tolist()
    valid_images = [os.path.join(training_data_path, i + ".jpg") for i in valid_images]
    valid_targets = df_valid.target.values

    train_dataset = ClassificationDataset(
        image_paths=train_images,
        targets=train_targets,
        resize=None,
        augmentations=train_aug,
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=train_bs, shuffle=True, num_workers=4
    )
    valid_dataset = ClassificationDataset(
        image_paths=valid_images,
        targets=valid_targets,
        resize=None,
        augmentations=valid_aug,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=valid_bs, shuffle=False, num_workers=4
    )

    model = SEResNext50_32x4d(pretrained="imagenet")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # Reduce LR when validation AUC plateaus (mode="max": higher AUC is better).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=3, threshold=0.001, mode="max"
    )
    es = EarlyStopping(patience=5, mode="max")

    for epoch in range(epochs):
        train_loss = engine.train(train_loader, model, optimizer, device=device)
        # BUG FIX: evaluate on valid_loader (was train_loader) so that
        # `predictions` line up with `valid_targets` in the AUC below.
        predictions, valid_loss = engine.evaluate(
            valid_loader, model, device=device
        )
        predictions = np.vstack(predictions).ravel()
        auc = metrics.roc_auc_score(valid_targets, predictions)
        # BUG FIX: step the plateau scheduler exactly once per epoch
        # (it was previously called twice, halving the effective patience).
        scheduler.step(auc)
        print(f"Epoch = {epoch}, AUC = {auc}")
        es(auc, model, model_path=f"model_fold_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
def main(args):
    """Train EfficientNet-b0 for landmark classification with a contrastive
    auxiliary loss, evaluating classification accuracy and retrieval per epoch.

    Expects ``args`` to provide: num_workers, batch_size, resume, model_path,
    lr, opt_eps, output_dir, start_epoch, epochs. Checkpoints and a JSONL log
    are written under ``args.output_dir``.
    """
    print(args)
    cudnn.benchmark = True

    index_dataset = GLDDataset(root='../DATA/train', input_size=224, subset='index')
    test_dataset = GLDDataset(root='../DATA/test_1k_final/test_1k_final',
                              input_size=224, subset='test')
    train_dataset = GLDDataset(root='../../data/train', input_size=224, subset='train')
    val_dataset = GLDDataset(root='../../data/train', input_size=224, subset='val')

    # Weighted sampling is currently disabled — the train loader below
    # uses shuffle=True instead of this sampler.
    train_sample_list = train_dataset.gen_train_sample_list()
    sampler = WeightedRandomSampler(weights=train_sample_list,
                                    num_samples=1300000, replacement=False)

    index_dataloader = DataLoader(index_dataset, batch_size=128, shuffle=False,
                                  num_workers=args.num_workers,
                                  drop_last=False, pin_memory=True)
    test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False,
                                 num_workers=args.num_workers,
                                 drop_last=False, pin_memory=True)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  shuffle=True, num_workers=args.num_workers,
                                  drop_last=False, pin_memory=True)
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size,
                                shuffle=False, num_workers=args.num_workers,
                                drop_last=False, pin_memory=True)

    # 81313 = number of landmark classes in the full GLDv2 train set.
    model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=81313)
    if args.resume:
        # Resume weights (CPU-mapped) and the epoch counter from a checkpoint.
        checkpoint = torch.load(args.model_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        args.start_epoch = checkpoint['epoch'] + 1
    model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), args.lr, eps=args.opt_eps)
    criterion_contra = CosContrastiveLoss(margin=0.4)
    criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    # FIX: create missing parent dirs and tolerate an existing directory.
    # The previous exists()-then-os.mkdir() was racy and raised when any
    # parent directory was absent.
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        train_stats = train_one_epoch_contra(model, criterion, criterion_contra,
                                             train_dataloader, optimizer,
                                             epoch, args.epochs)
        val_stats = evaluate(val_dataloader, model)
        test_stats = test_retrieval(index_dataloader, test_dataloader, model)
        print(
            f"Accuracy of the network on the {len(val_dataset)} test images: {val_stats['acc1']:.2f}%"
        )
        # A best-only gate (val acc1 > max_accuracy) was deliberately disabled;
        # every epoch is checkpointed.
        if args.output_dir:
            checkpoint_paths = [
                output_dir / 'checkpoint_e{}_{}.pth'.format(epoch, val_stats["acc1"])
            ]
            for checkpoint_path in checkpoint_paths:
                torch.save(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        max_accuracy = max(max_accuracy, val_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')
        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'val_{k}': v for k, v in val_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch
        }
        if args.output_dir:
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")