def train(model, device, args, *, val_interval, bn_process=False, all_iters=None):
    optimizer = args.optimizer
    loss_function = args.loss_function
    scheduler = args.scheduler
    train_dataprovider = args.train_dataprovider

    t1 = time.time()
    Top1_err, Top5_err = 0.0, 0.0
    model.train()
    for iters in range(1, val_interval + 1):
        scheduler.step()
        if bn_process:
            adjust_bn_momentum(model, iters)

        all_iters += 1
        d_st = time.time()
        data, target = train_dataprovider.next()
        target = target.type(torch.LongTensor)
        data, target = data.to(device), target.to(device)
        data_time = time.time() - d_st

        output = model(data)
        loss = loss_function(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        prec1, prec5 = accuracy(output, target, topk=(1, 5))
        Top1_err += 1 - prec1.item() / 100
        Top5_err += 1 - prec5.item() / 100

        if all_iters % args.display_interval == 0:
            printInfo = 'TRAIN Iter {}: lr = {:.6f},\tloss = {:.6f},\t'.format(all_iters, scheduler.get_lr()[0], loss.item()) + \
                        'Top-1 err = {:.6f},\t'.format(Top1_err / args.display_interval) + \
                        'Top-5 err = {:.6f},\t'.format(Top5_err / args.display_interval) + \
                        'data_time = {:.6f},\ttrain_time = {:.6f}'.format(data_time, (time.time() - t1) / args.display_interval)
            logging.info(printInfo)
            t1 = time.time()
            Top1_err, Top5_err = 0.0, 0.0

        if all_iters % args.save_interval == 0:
            save_checkpoint({'state_dict': model.state_dict()}, all_iters)

    return all_iters
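# The snippet above relies on an `accuracy(output, target, topk)` helper that is not shown
# in this excerpt. A minimal sketch of the usual top-k accuracy computation is given below;
# the exact implementation in the original repository may differ.
import torch

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k (in percent)."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res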
def fit(start_epoch, num_epochs, model, loss_func, opt, lr_scheduler, best_score,
        max_batches_per_iter_cnt, checkpoint_dir, train_loader, val_loader):
    for epoch in range(start_epoch, start_epoch + num_epochs):
        val_loss = train_one_epoch(model, loss_func, opt, lr_scheduler,
                                   max_batches_per_iter_cnt, train_loader, val_loader, epoch)
        if best_score > val_loss:
            best_score = val_loss
            save_as_best = True
        else:
            save_as_best = False
        save_checkpoint(epoch, model, opt, lr_scheduler, best_score, checkpoint_dir, save_as_best)
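# `save_checkpoint` is not defined in this excerpt. A plausible minimal version that matches
# the call signature used in `fit` above is sketched here; all file names and dictionary keys
# are assumptions, not the original implementation.
import os
import shutil
import torch

def save_checkpoint(epoch, model, opt, lr_scheduler, best_score, checkpoint_dir, save_as_best):
    state = {
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': opt.state_dict(),
        'lr_scheduler_state': lr_scheduler.state_dict(),
        'best_score': best_score,
    }
    path = os.path.join(checkpoint_dir, 'checkpoint_last.pth')
    torch.save(state, path)
    if save_as_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(path, os.path.join(checkpoint_dir, 'checkpoint_best.pth'))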
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx
    configs.device = torch.device('cpu' if configs.gpu_idx is None else 'cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx
        dist.init_process_group(backend=configs.dist_backend, init_method=configs.dist_url,
                                world_size=configs.world_size, rank=configs.rank)
        configs.subdivisions = int(64 / configs.batch_size / configs.ngpus_per_node)
    else:
        configs.subdivisions = int(64 / configs.batch_size)

    configs.is_master_node = (not configs.distributed) or (
        configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # model
    model = create_model(configs)

    # load weight from a checkpoint
    if configs.pretrained_path is not None:
        assert os.path.isfile(configs.pretrained_path), "=> no checkpoint found at '{}'".format(configs.pretrained_path)
        model.load_state_dict(torch.load(configs.pretrained_path, map_location='cpu'))
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(configs.pretrained_path))

    # resume weights of model from a checkpoint
    if configs.resume_path is not None:
        assert os.path.isfile(configs.resume_path), "=> no checkpoint found at '{}'".format(configs.resume_path)
        model.load_state_dict(torch.load(configs.resume_path, map_location='cpu'))
        if logger is not None:
            logger.info('resume training model from checkpoint {}'.format(configs.resume_path))

    # Data Parallel
    model = make_data_parallel(model, configs)

    # Make sure to create optimizer after moving the model to cuda
    optimizer = create_optimizer(configs, model)
    lr_scheduler = create_lr_scheduler(optimizer, configs)
    configs.step_lr_in_epoch = False if configs.lr_type in ['multi_step', 'cosin', 'one_cycle'] else True

    # resume optimizer, lr_scheduler from a checkpoint
    if configs.resume_path is not None:
        utils_path = configs.resume_path.replace('Model_', 'Utils_')
        assert os.path.isfile(utils_path), "=> no checkpoint found at '{}'".format(utils_path)
        utils_state_dict = torch.load(utils_path, map_location='cuda:{}'.format(configs.gpu_idx))
        optimizer.load_state_dict(utils_state_dict['optimizer'])
        lr_scheduler.load_state_dict(utils_state_dict['lr_scheduler'])
        configs.start_epoch = utils_state_dict['epoch'] + 1

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(num_parameters))

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    train_dataloader, train_sampler = create_train_dataloader(configs)
    if logger is not None:
        logger.info('number of batches in training set: {}'.format(len(train_dataloader)))

    if configs.evaluate:
        val_dataloader = create_val_dataloader(configs)
        val_loss = validate(val_dataloader, model, configs)
        print('val_loss: {:.4e}'.format(val_loss))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch, configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}]'.format(epoch, configs.num_epochs))

        if configs.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_one_epoch(train_dataloader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer)

        if (not configs.no_val) and (epoch % configs.checkpoint_freq == 0):
            val_dataloader = create_val_dataloader(configs)
            print('number of batches in val_dataloader: {}'.format(len(val_dataloader)))
            val_loss = validate(val_dataloader, model, configs)
            print('val_loss: {:.4e}'.format(val_loss))
            if tb_writer is not None:
                tb_writer.add_scalar('Val_loss', val_loss, epoch)

        # Save checkpoint
        if configs.is_master_node and ((epoch % configs.checkpoint_freq) == 0):
            model_state_dict, utils_state_dict = get_saved_state(model, optimizer, lr_scheduler, epoch, configs)
            save_checkpoint(configs.checkpoints_dir, configs.saved_fn, model_state_dict, utils_state_dict, epoch)

        if not configs.step_lr_in_epoch:
            lr_scheduler.step()
            if tb_writer is not None:
                tb_writer.add_scalar('LR', lr_scheduler.get_lr()[0], epoch)

    if tb_writer is not None:
        tb_writer.close()
    if configs.distributed:
        cleanup()
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    netG = moco.builder.MaskGenerator()
    netD = moco.builder.MoCo(models.__dict__[args.arch], args.moco_dim, args.moco_k,
                             args.moco_m, args.moco_t, args.mlp)
    print(netG)
    print(netD)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            netG.cuda(args.gpu)
            netD.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            netG = torch.nn.parallel.DistributedDataParallel(netG, device_ids=[args.gpu],
                                                             find_unused_parameters=True)
            netD = torch.nn.parallel.DistributedDataParallel(netD, device_ids=[args.gpu],
                                                             find_unused_parameters=True)
        else:
            netG.cuda()
            netD.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            netG = torch.nn.parallel.DistributedDataParallel(netG)
            netD = torch.nn.parallel.DistributedDataParallel(netD)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        netG = netG.cuda(args.gpu)
        netD = netD.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        pass
        # raise NotImplementedError("Only DistributedDataParallel is supported.")  # keep commented for debug on cpu

    # torch.cuda.synchronize()

    optimizer_g = torch.optim.SGD(netG.parameters(), args.lr,
                                  momentum=args.momentum, weight_decay=args.weight_decay)
    optimizer_d = torch.optim.SGD(netD.parameters(), args.lr,
                                  momentum=args.momentum, weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    G_criterion = nn.L1Loss().cuda(args.gpu)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netD.load_state_dict(checkpoint['state_dict'])
            # optimizer_d.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

        if os.path.isfile(args.resumeG):
            print("=> loading checkpoint '{}'".format(args.resumeG))
            if args.gpu is None:
                checkpoint = torch.load(args.resumeG)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resumeG, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            netG.load_state_dict(checkpoint['state_dict'])
            # optimizer_g.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resumeG, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resumeG))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    video_augmentation = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size, (0.2, 1)),
    ])
    audio_augmentation = moco.loader.DummyAudioTransform()
    augmentation = {'video': video_augmentation, 'audio': audio_augmentation}
    augmentation_gpu = moco.loader.MoCoAugmentV2(args.crop_size) if args.aug_plus \
        else moco.loader.MoCoAugment(args.crop_size)

    train_dataset = Kinetics400(traindir, args.frame_per_clip, args.step_between_clips,
                                extensions='mp4', transform=augmentation, num_workers=4)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 1)
    if args.distributed:
        # train_sampler = torch.utils.data.distributed.DistributedSampler(train_sampler)
        train_sampler = DistributedSampler(train_sampler)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers, pin_memory=True,
                                               sampler=train_sampler, drop_last=True,
                                               multiprocessing_context="fork")

    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir, args.batch_size, args.lr,
                                                       args.crop_size, args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer_d, epoch, args)
        adjust_learning_rate(optimizer_g, epoch, args)

        # train for one epoch
        train(train_loader, augmentation_gpu, criterion, G_criterion, netG, netD,
              optimizer_g, optimizer_d, epoch, args, writer)

        if (epoch + 1) % 10 == 0 and (not args.multiprocessing_distributed or
                                      (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0)):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.ckp_dir, args.batch_size, args.lr,
                                                           args.crop_size, args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netG.state_dict(),
            }, ckp_dir + '/netG', max_save=20, is_best=False)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': netD.state_dict(),
            }, ckp_dir + '/netD', max_save=20, is_best=False)
def train_loop(self, iterators, optimizers, run_dir, task="joint", epochs=100, min_epochs=0,
               patience=5, epoch_start=0, best_f1=None, epochs_no_improv=None, best_scores=None,
               criterion="re", mode="strict", train_key="train", dev_key="dev",
               save_all_tasks=False, gradient_accumulation=1, tensorboard_summary=True, **kwargs):
    # Validation or not
    if dev_key is not None:
        logging.info("Starting train loop: {} epochs; {} min; {} patience".format(epochs, min_epochs, patience))
    else:
        logging.info("Starting train loop without validation for {} epochs".format(epochs))
        patience = 0
        min_epochs = epochs

    tasks = iterators.keys()
    if best_f1 is None:
        best_f1 = {t: 0 for t in tasks}
    if epochs_no_improv is None:
        epochs_no_improv = {t: 0 for t in tasks}
    if best_scores is None:
        best_scores = {t: 0 for t in tasks}

    # Check for early stopping already matched (when reloading a checkpoint)
    if patience and epoch_start > min_epochs and epochs_no_improv[criterion] >= patience:
        logging.info("Early stopping after {} epochs without improvement.".format(patience))
    else:
        writer = SummaryWriter(run_dir) if TENSORBOARD and tensorboard_summary else None

        # Training loop
        for epoch in range(epoch_start, epochs):
            logging.info("Epoch {}/{} :".format(epoch + 1, epochs))
            train_losses = self.run_epoch(iterators, epoch, optimizers, writer, task=task,
                                          train_key=train_key,
                                          gradient_accumulation=gradient_accumulation)
            n_iter = (epoch + 1) * len(list(train_losses.values())[0])

            # Log train losses + evaluate on dev if not None
            if "ner" in tasks:
                logging.info("Train NER Loss : {}".format(np.mean(train_losses["ner"])))
                if dev_key is not None:
                    ner_preds, _, ner_loss, ner_scores = self.evaluate_ner(iterators["ner"][dev_key])
                    logging.info("Dev NER Loss : {}".format(ner_loss))
            if "re" in tasks:
                logging.info("Train RE Loss : {}".format(np.mean(train_losses["re"])))
                if dev_key is not None:
                    re_preds, _, re_loss, re_scores = self.evaluate_re(iterators["re"][dev_key], mode=mode)
                    logging.info("Dev RE Loss : {}".format(re_loss))

            # If validation : record current and best checkpoints + enable early stopping on dev score
            if dev_key is not None:
                # save checkpoint and scores
                scores = {}
                f1 = {}
                for t in tasks:
                    f1[t] = locals()["{}_scores".format(t)]["ALL"]["f1"]
                    scores[t] = locals()["{}_scores".format(t)]
                for t in f1.keys():
                    if f1[t] > best_f1[t] or epoch == 0:
                        logging.info("New best {} F1 score on dev : {}".format(t, f1[t]))
                        if save_all_tasks or t == criterion:
                            logging.info("Saving model...")
                        best_f1[t] = f1[t]
                        epochs_no_improv[t] = 0
                        is_best = True
                    else:
                        epochs_no_improv[t] += 1
                        is_best = False

                    state = {
                        'epoch': epoch + 1,
                        'epochs_no_improv': epochs_no_improv,
                        'model': self.state_dict(),
                        'scores': scores,
                        'optimizers': {k: optimizer.state_dict() for k, optimizer in optimizers.items()}
                    }
                    if save_all_tasks or t == criterion:
                        save_checkpoint(state, is_best,
                                        checkpoint=run_dir + '{}_checkpoint.pth.tar'.format(t),
                                        best=run_dir + '{}_best.pth.tar'.format(t))

                if TENSORBOARD and tensorboard_summary:
                    if "ner" in iterators.keys():
                        writer.add_scalars("ner_loss", {"dev": ner_loss}, n_iter)
                        add_score(writer, ner_scores, n_iter, task="ner")
                    if "re" in iterators.keys():
                        writer.add_scalars("re_loss", {"dev": re_loss}, n_iter)
                        add_score(writer, re_scores, n_iter, task="re")

                # early stopping
                if patience and epoch > min_epochs and epochs_no_improv[criterion] >= patience:
                    logging.info("Early stopping after {} epochs without improvement on {}.".format(patience, criterion))
                    break

            # Else : record current checkpoint
            else:
                state = {
                    'epoch': epoch + 1,
                    'epochs_no_improv': 0,
                    'model': self.state_dict(),
                    'optimizers': {k: optimizer.state_dict() for k, optimizer in optimizers.items()}
                }
                save_checkpoint(state, is_best=epoch == epochs - 1,
                                checkpoint=run_dir + '{}_checkpoint.pth.tar'.format(criterion),
                                best=run_dir + '{}_best.pth.tar'.format(criterion))

        if TENSORBOARD and tensorboard_summary:
            writer.close()
)
start_epoch = 1
if args.resume is not None:
    print('Resume training from %s' % args.resume)
    checkpoint = torch.load(args.resume)
    start_epoch = checkpoint['epoch'] + 1
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optimizer_state'])

columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_nll', 'te_acc', 'time']

train_utils.save_checkpoint(
    args.dir,
    start_epoch - 1,
    model_state=model.state_dict(),
    optimizer_state=optimizer.state_dict()
)

test_res = {'loss': None, 'accuracy': None, 'nll': None}
for epoch in range(start_epoch, args.epochs + 1):
    time_ep = time.time()
    lr = learning_rate_schedule(args.lr, epoch, args.epochs)
    train_utils.adjust_learning_rate(optimizer, lr)
    train_res = train_utils.train(loaders['train'], model, optimizer, criterion, regularizer, cuda=args.cuda)
    test_res = train_utils.test(loaders['test'], model, criterion, regularizer, cuda=args.cuda)
    if epoch % args.save_freq == 0:
        train_utils.save_checkpoint(
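# `learning_rate_schedule(base_lr, epoch, total_epochs)` is called above but not shown.
# The sketch below is an assumption: a common piecewise-linear decay used with this kind
# of loop (constant for the first half, linear decay, then a small constant tail). The
# actual schedule in the original repository may differ.
def learning_rate_schedule(base_lr, epoch, total_epochs):
    alpha = epoch / total_epochs
    if alpha <= 0.5:
        factor = 1.0
    elif alpha <= 0.9:
        # decay linearly from 1.0 to 0.01 between 50% and 90% of training
        factor = 1.0 - (alpha - 0.5) / 0.4 * 0.99
    else:
        factor = 0.01
    return factor * base_lr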
def PolarOffsetMain(args, cfg):
    if args.launcher is None:
        dist_train = False
    else:
        args.batch_size, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
            args.batch_size, args.tcp_port, args.local_rank, backend='nccl')
        dist_train = True
    cfg['DIST_TRAIN'] = dist_train

    output_dir = os.path.join('./output', args.tag)
    ckpt_dir = os.path.join(output_dir, 'ckpt')
    tmp_dir = os.path.join(output_dir, 'tmp')
    summary_dir = os.path.join(output_dir, 'summary')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir, exist_ok=True)
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir, exist_ok=True)

    if args.onlyval and args.saveval:
        results_dir = os.path.join(output_dir, 'test', 'sequences')
        if not os.path.exists(results_dir):
            os.makedirs(results_dir, exist_ok=True)
        for i in range(8, 9):
            sub_dir = os.path.join(results_dir, str(i).zfill(2), 'predictions')
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)

    if args.onlytest:
        results_dir = os.path.join(output_dir, 'test', 'sequences')
        if not os.path.exists(results_dir):
            os.makedirs(results_dir, exist_ok=True)
        for i in range(11, 22):
            sub_dir = os.path.join(results_dir, str(i).zfill(2), 'predictions')
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)

    log_file = os.path.join(output_dir, ('log_train_%s.txt' % datetime.datetime.now().strftime('%Y%m%d-%H%M%S')))
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)

    logger.info('**********************Start logging**********************')
    gpu_list = os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys() else 'ALL'
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        os.system('cp %s %s' % (args.config, output_dir))

    ### create dataloader
    if (not args.onlytest) and (not args.onlyval):
        train_dataset_loader = build_dataloader(args, cfg, split='train', logger=logger)
        val_dataset_loader = build_dataloader(args, cfg, split='val', logger=logger, no_shuffle=True, no_aug=True)
    elif args.onlyval:
        val_dataset_loader = build_dataloader(args, cfg, split='val', logger=logger, no_shuffle=True, no_aug=True)
    else:
        test_dataset_loader = build_dataloader(args, cfg, split='test', logger=logger, no_shuffle=True, no_aug=True)

    ### create model
    model = build_network(cfg)
    model.cuda()

    ### create optimizer
    optimizer = train_utils.build_optimizer(model, cfg)

    ### load ckpt
    ckpt_fname = os.path.join(ckpt_dir, args.ckpt_name)
    epoch = -1
    other_state = {}
    if args.pretrained_ckpt is not None and os.path.exists(ckpt_fname):
        logger.info("Now in pretrain mode and loading ckpt: {}".format(ckpt_fname))
        if not args.nofix:
            if args.fix_semantic_instance:
                logger.info("Freezing backbone, semantic and instance part of the model.")
                model.fix_semantic_instance_parameters()
            else:
                logger.info("Freezing semantic and backbone part of the model.")
                model.fix_semantic_parameters()
            optimizer = train_utils.build_optimizer(model, cfg)
        epoch, other_state = train_utils.load_params_with_optimizer_otherstate(
            model, ckpt_fname, to_cpu=dist_train, optimizer=optimizer, logger=logger)  # new feature
        logger.info("Loaded Epoch: {}".format(epoch))
    elif args.pretrained_ckpt is not None:
        train_utils.load_pretrained_model(model, args.pretrained_ckpt, to_cpu=dist_train, logger=logger)
        if not args.nofix:
            if args.fix_semantic_instance:
                logger.info("Freezing backbone, semantic and instance part of the model.")
                model.fix_semantic_instance_parameters()
            else:
                logger.info("Freezing semantic and backbone part of the model.")
                model.fix_semantic_parameters()
        else:
            logger.info("No Freeze.")
        optimizer = train_utils.build_optimizer(model, cfg)
    elif os.path.exists(ckpt_fname):
        epoch, other_state = train_utils.load_params_with_optimizer_otherstate(
            model, ckpt_fname, to_cpu=dist_train, optimizer=optimizer, logger=logger)  # new feature
        logger.info("Loaded Epoch: {}".format(epoch))
    if other_state is None:
        other_state = {}

    ### create optimizer and scheduler
    lr_scheduler = None
    if lr_scheduler is None:
        logger.info('Not using lr scheduler')

    model.train()  # before wrap to DistributedDataParallel to support fixing some parameters
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()], find_unused_parameters=True)
    logger.info(model)

    if cfg.LOCAL_RANK == 0:
        writer = SummaryWriter(log_dir=summary_dir)

    logger.info('**********************Start Training**********************')
    rank = cfg.LOCAL_RANK
    best_before_iou = -1 if 'best_before_iou' not in other_state else other_state['best_before_iou']
    best_pq = -1 if 'best_pq' not in other_state else other_state['best_pq']
    best_after_iou = -1 if 'best_after_iou' not in other_state else other_state['best_after_iou']
    global_iter = 0 if 'global_iter' not in other_state else other_state['global_iter']
    val_global_iter = 0 if 'val_global_iter' not in other_state else other_state['val_global_iter']
    best_tracking_loss = 10086 if 'best_tracking_loss' not in other_state else other_state['best_tracking_loss']

    ### test
    if args.onlytest:
        logger.info('----EPOCH {} Testing----'.format(epoch))
        model.eval()
        if rank == 0:
            vbar = tqdm(total=len(test_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(test_dataset_loader):
            with torch.no_grad():
                if cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                        cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                    ret_dict = model(inputs, is_test=True, merge_evaluator_list=None,
                                     merge_evaluator_window_k_list=None, require_cluster=True)
                else:
                    ret_dict = model(inputs, is_test=True, require_cluster=True, require_merge=True)
                common_utils.save_test_results(ret_dict, results_dir, inputs)
            if rank == 0:
                vbar.set_postfix({'fname': '/'.join(inputs['pcd_fname'][0].split('/')[-3:])})
                vbar.update(1)
        if rank == 0:
            vbar.close()
        logger.info("----Testing Finished----")
        return

    ### evaluate
    if args.onlyval:
        logger.info('----EPOCH {} Evaluating----'.format(epoch))
        model.eval()
        min_points = 50  # according to SemanticKITTI official rule
        if cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
            merge_evaluator_list = []
            merge_evaluator_window_k_list = []
            for k in [1, 5, 10, 15]:
                merge_evaluator_list.append(init_eval(min_points))
                merge_evaluator_window_k_list.append(k)
        else:
            before_merge_evaluator = init_eval(min_points=min_points)
            after_merge_evaluator = init_eval(min_points=min_points)
        if rank == 0:
            vbar = tqdm(total=len(val_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(val_dataset_loader):
            inputs['i_iter'] = i_iter
            # torch.cuda.empty_cache()
            with torch.no_grad():
                if cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                        cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                    ret_dict = model(inputs, is_test=True,
                                     merge_evaluator_list=merge_evaluator_list,
                                     merge_evaluator_window_k_list=merge_evaluator_window_k_list,
                                     require_cluster=True)
                else:
                    ret_dict = model(inputs, is_test=True,
                                     before_merge_evaluator=before_merge_evaluator,
                                     after_merge_evaluator=after_merge_evaluator,
                                     require_cluster=True)
                #########################
                # with open('./ipnb/{}_matching_list.pkl'.format(i_iter), 'wb') as fd:
                #     pickle.dump(ret_dict['matching_list'], fd)
                #########################
                if args.saveval:
                    common_utils.save_test_results(ret_dict, results_dir, inputs)
            if rank == 0:
                vbar.set_postfix({
                    'loss': ret_dict['loss'].item(),
                    'fname': '/'.join(inputs['pcd_fname'][0].split('/')[-3:]),
                    'ins_num': -1 if 'ins_num' not in ret_dict else ret_dict['ins_num']
                })
                vbar.update(1)
        if dist_train:
            if cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                    cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                pass
            else:
                before_merge_evaluator = common_utils.merge_evaluator(before_merge_evaluator, tmp_dir)
                dist.barrier()
                after_merge_evaluator = common_utils.merge_evaluator(after_merge_evaluator, tmp_dir)
        if rank == 0:
            vbar.close()
        if rank == 0:
            ## print results
            if cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                    cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                for evaluate, window_k in zip(merge_evaluator_list, merge_evaluator_window_k_list):
                    logger.info("Current Window K: {}".format(window_k))
                    printResults(evaluate, logger=logger)
            else:
                logger.info("Before Merge Semantic Scores")
                before_merge_results = printResults(before_merge_evaluator, logger=logger, sem_only=True)
                logger.info("After Merge Panoptic Scores")
                after_merge_results = printResults(after_merge_evaluator, logger=logger)
        logger.info("----Evaluating Finished----")
        return

    ### train
    while True:
        epoch += 1
        if 'MAX_EPOCH' in cfg.OPTIMIZE.keys():
            if epoch > cfg.OPTIMIZE.MAX_EPOCH:
                break

        ### train one epoch
        logger.info('----EPOCH {} Training----'.format(epoch))
        loss_acc = 0
        if rank == 0:
            pbar = tqdm(total=len(train_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(train_dataset_loader):
            # torch.cuda.empty_cache()
            torch.autograd.set_detect_anomaly(True)
            model.train()
            optimizer.zero_grad()
            inputs['i_iter'] = i_iter
            inputs['rank'] = rank
            ret_dict = model(inputs)
            if args.pretrained_ckpt is not None and not args.fix_semantic_instance:  # training offset
                if args.nofix:
                    loss = ret_dict['loss']
                elif len(ret_dict['offset_loss_list']) > 0:
                    loss = sum(ret_dict['offset_loss_list'])
                else:
                    loss = torch.tensor(0.0, requires_grad=True)  # mock pbar
                    ret_dict['offset_loss_list'] = [loss]  # mock writer
            elif args.pretrained_ckpt is not None and args.fix_semantic_instance and \
                    cfg.MODEL.NAME == 'PolarOffsetSpconvPytorchMeanshift':  # training dynamic shifting
                loss = sum(ret_dict['meanshift_loss'])
            elif cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                    cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                loss = sum(ret_dict['tracking_loss'])
                #########################
                # with open('./ipnb/{}_matching_list.pkl'.format(i_iter), 'wb') as fd:
                #     pickle.dump(ret_dict['matching_list'], fd)
                #########################
            else:
                loss = ret_dict['loss']
            loss.backward()
            optimizer.step()
            if rank == 0:
                try:
                    cur_lr = float(optimizer.lr)
                except:
                    cur_lr = optimizer.param_groups[0]['lr']
                loss_acc += loss.item()
                pbar.set_postfix({'loss': loss.item(), 'lr': cur_lr,
                                  'mean_loss': loss_acc / float(i_iter + 1)})
                pbar.update(1)
                writer.add_scalar('Train/01_Loss', ret_dict['loss'].item(), global_iter)
                writer.add_scalar('Train/02_SemLoss', ret_dict['sem_loss'].item(), global_iter)
                if 'offset_loss_list' in ret_dict and sum(ret_dict['offset_loss_list']).item() > 0:
                    writer.add_scalar('Train/03_InsLoss', sum(ret_dict['offset_loss_list']).item(), global_iter)
                writer.add_scalar('Train/04_LR', cur_lr, global_iter)
                writer_acc = 5
                if 'meanshift_loss' in ret_dict:
                    writer.add_scalar('Train/05_DSLoss', sum(ret_dict['meanshift_loss']).item(), global_iter)
                    writer_acc += 1
                if 'tracking_loss' in ret_dict:
                    writer.add_scalar('Train/06_TRLoss', sum(ret_dict['tracking_loss']).item(), global_iter)
                    writer_acc += 1
                more_keys = []
                for k, _ in ret_dict.items():
                    if k.find('summary') != -1:
                        more_keys.append(k)
                for ki, k in enumerate(more_keys):
                    if k == 'bandwidth_weight_summary':
                        continue
                    ki += writer_acc
                    writer.add_scalar('Train/{}_{}'.format(str(ki).zfill(2), k), ret_dict[k], global_iter)
                global_iter += 1
        if rank == 0:
            pbar.close()

        ### evaluate after each epoch
        logger.info('----EPOCH {} Evaluating----'.format(epoch))
        model.eval()
        min_points = 50
        before_merge_evaluator = init_eval(min_points=min_points)
        after_merge_evaluator = init_eval(min_points=min_points)
        tracking_loss = 0
        if rank == 0:
            vbar = tqdm(total=len(val_dataset_loader), dynamic_ncols=True)
        for i_iter, inputs in enumerate(val_dataset_loader):
            # torch.cuda.empty_cache()
            inputs['i_iter'] = i_iter
            inputs['rank'] = rank
            with torch.no_grad():
                if cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                        cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                    ret_dict = model(inputs, is_test=True, merge_evaluator_list=None,
                                     merge_evaluator_window_k_list=None, require_cluster=True)
                else:
                    ret_dict = model(inputs, is_test=True,
                                     before_merge_evaluator=before_merge_evaluator,
                                     after_merge_evaluator=after_merge_evaluator,
                                     require_cluster=True)
            if rank == 0:
                vbar.set_postfix({'loss': ret_dict['loss'].item()})
                vbar.update(1)
                writer.add_scalar('Val/01_Loss', ret_dict['loss'].item(), val_global_iter)
                writer.add_scalar('Val/02_SemLoss', ret_dict['sem_loss'].item(), val_global_iter)
                if 'offset_loss_list' in ret_dict and sum(ret_dict['offset_loss_list']).item() > 0:
                    writer.add_scalar('Val/03_InsLoss', sum(ret_dict['offset_loss_list']).item(), val_global_iter)
                if 'tracking_loss' in ret_dict:
                    writer.add_scalar('Val/06_TRLoss', sum(ret_dict['tracking_loss']).item(), global_iter)
                    tracking_loss += sum(ret_dict['tracking_loss']).item()
                more_keys = []
                for k, _ in ret_dict.items():
                    if k.find('summary') != -1:
                        more_keys.append(k)
                for ki, k in enumerate(more_keys):
                    if k == 'bandwidth_weight_summary':
                        continue
                    ki += 4
                    writer.add_scalar('Val/{}_{}'.format(str(ki).zfill(2), k), ret_dict[k], val_global_iter)
                val_global_iter += 1
        tracking_loss /= len(val_dataset_loader)
        if dist_train:
            try:
                before_merge_evaluator = common_utils.merge_evaluator(before_merge_evaluator, tmp_dir, prefix='before_')
                dist.barrier()
                after_merge_evaluator = common_utils.merge_evaluator(after_merge_evaluator, tmp_dir, prefix='after_')
            except:
                print("Something went wrong when merging evaluator in rank {}".format(rank))
        if rank == 0:
            vbar.close()
        if rank == 0:
            ## print results
            logger.info("Before Merge Semantic Scores")
            before_merge_results = printResults(before_merge_evaluator, logger=logger, sem_only=True)
            logger.info("After Merge Panoptic Scores")
            after_merge_results = printResults(after_merge_evaluator, logger=logger)

            ## save ckpt
            other_state = {
                'best_before_iou': best_before_iou,
                'best_pq': best_pq,
                'best_after_iou': best_after_iou,
                'global_iter': global_iter,
                'val_global_iter': val_global_iter,
                'best_tracking_loss': best_tracking_loss,
            }
            saved_flag = False
            if best_tracking_loss > tracking_loss and \
                    cfg.MODEL.NAME.startswith('PolarOffsetSpconvPytorchMeanshiftTracking') or \
                    cfg.MODEL.NAME.startswith('PolarOffsetSpconvTracking'):
                best_tracking_loss = tracking_loss
                if not saved_flag:
                    states = train_utils.checkpoint_state(model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(states, os.path.join(
                        ckpt_dir, 'checkpoint_epoch_{}_{}.pth'.format(epoch, str(tracking_loss)[:5])))
                    saved_flag = True
            if best_before_iou < before_merge_results['iou_mean']:
                best_before_iou = before_merge_results['iou_mean']
                if not saved_flag:
                    states = train_utils.checkpoint_state(model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(states, os.path.join(
                        ckpt_dir, 'checkpoint_epoch_{}_{}_{}_{}.pth'.format(
                            epoch, str(best_before_iou)[:5], str(best_pq)[:5], str(best_after_iou)[:5])))
                    saved_flag = True
            if best_pq < after_merge_results['pq_mean']:
                best_pq = after_merge_results['pq_mean']
                if not saved_flag:
                    states = train_utils.checkpoint_state(model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(states, os.path.join(
                        ckpt_dir, 'checkpoint_epoch_{}_{}_{}_{}.pth'.format(
                            epoch, str(best_before_iou)[:5], str(best_pq)[:5], str(best_after_iou)[:5])))
                    saved_flag = True
            if best_after_iou < after_merge_results['iou_mean']:
                best_after_iou = after_merge_results['iou_mean']
                if not saved_flag:
                    states = train_utils.checkpoint_state(model, optimizer, epoch, other_state)
                    train_utils.save_checkpoint(states, os.path.join(
                        ckpt_dir, 'checkpoint_epoch_{}_{}_{}_{}.pth'.format(
                            epoch, str(best_before_iou)[:5], str(best_pq)[:5], str(best_after_iou)[:5])))
                    saved_flag = True
            logger.info("Current best before IoU: {}".format(best_before_iou))
            logger.info("Current best after IoU: {}".format(best_after_iou))
            logger.info("Current best after PQ: {}".format(best_pq))
            logger.info("Current best tracking loss: {}".format(best_tracking_loss))
        if lr_scheduler is not None:
            lr_scheduler.step(epoch)  # new feature
def main_worker(gpu_idx, configs):
    configs.gpu_idx = gpu_idx

    if configs.gpu_idx is not None:
        print("Use GPU: {} for training".format(configs.gpu_idx))
        configs.device = torch.device('cuda:{}'.format(configs.gpu_idx))

    if configs.distributed:
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx
        dist.init_process_group(backend=configs.dist_backend, init_method=configs.dist_url,
                                world_size=configs.world_size, rank=configs.rank)

    configs.is_master_node = (not configs.distributed) or (
        configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        logger = Logger(configs.logs_dir, configs.saved_fn)
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(log_dir=os.path.join(configs.logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None

    # model
    model = create_model(configs)

    # load weight from a checkpoint
    if configs.pretrained_path is not None:
        assert os.path.isfile(configs.pretrained_path), \
            "=> no checkpoint found at '{}'".format(configs.pretrained_path)
        model.load_weights(weightfile=configs.pretrained_path)
        if logger is not None:
            logger.info('loaded pretrained model at {}'.format(configs.pretrained_path))

    # resume weights of model from a checkpoint
    if configs.resume_path is not None:
        assert os.path.isfile(configs.resume_path), \
            "=> no checkpoint found at '{}'".format(configs.resume_path)
        model.load_weights(weightfile=configs.resume_path)
        if logger is not None:
            logger.info('resume training model from checkpoint {}'.format(configs.resume_path))

    # Data Parallel
    model = make_data_parallel(model, configs)

    # Make sure to create optimizer after moving the model to cuda
    optimizer = create_optimizer(configs, model)
    lr_scheduler = create_lr_scheduler(optimizer, configs)

    # resume optimizer, lr_scheduler from a checkpoint
    if configs.resume_path is not None:
        utils_path = configs.resume_path.replace('Model_', 'Utils_')
        assert os.path.isfile(utils_path), "=> no checkpoint found at '{}'".format(utils_path)
        utils_state_dict = torch.load(utils_path, map_location='cuda:{}'.format(configs.gpu_idx))
        optimizer.load_state_dict(utils_state_dict['optimizer'])
        lr_scheduler.load_state_dict(utils_state_dict['lr_scheduler'])
        configs.start_epoch = utils_state_dict['epoch'] + 1

    if configs.is_master_node:
        num_parameters = get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(num_parameters))

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    train_loader, val_loader, train_sampler = create_train_val_dataloader(configs)
    if logger is not None:
        logger.info('number of batches in train set: {}'.format(len(train_loader)))
        if val_loader is not None:
            logger.info('number of batches in val set: {}'.format(len(val_loader)))

    if configs.evaluate:
        assert val_loader is not None, "The validation dataloader should not be None"
        eval_metrics = evaluate_one_epoch(val_loader, model, configs.start_epoch - 1, configs, logger)
        precision, recall, AP, f1, ap_class = eval_metrics
        print('Evaluate - precision: {}, recall: {}, AP: {}, f1: {}, ap_class: {}'.format(
            precision, recall, AP, f1, ap_class))
        return

    for epoch in range(configs.start_epoch, configs.num_epochs + 1):
        if logger is not None:
            logger.info('{}'.format('*-' * 40))
            logger.info('{} {}/{} {}'.format('=' * 35, epoch, configs.num_epochs, '=' * 35))
            logger.info('{}'.format('*-' * 40))
            logger.info('>>> Epoch: [{}/{}]'.format(epoch, configs.num_epochs))

        if configs.distributed:
            train_sampler.set_epoch(epoch)
        # train for one epoch
        train_one_epoch(train_loader, model, optimizer, lr_scheduler, epoch, configs, logger, tb_writer)

        if not configs.no_val:
            precision, recall, AP, f1, ap_class = evaluate_one_epoch(val_loader, model, epoch, configs, logger)
            val_metrics_dict = {
                'precision': precision,
                'recall': recall,
                'AP': AP,
                'f1': f1,
                'ap_class': ap_class
            }
            if tb_writer is not None:
                tb_writer.add_scalars('Validation', val_metrics_dict, epoch)

        # Save checkpoint
        if configs.is_master_node and ((epoch % configs.checkpoint_freq) == 0):
            model_state_dict, utils_state_dict = get_saved_state(model, optimizer, lr_scheduler, epoch, configs)
            save_checkpoint(configs.checkpoints_dir, configs.saved_fn, model_state_dict, utils_state_dict, epoch)

    if tb_writer is not None:
        tb_writer.close()
    if configs.distributed:
        cleanup()
def main():
    global args, best_acc1, device

    # Init seed
    np.random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)

    if args.dataset == 'miniImageNet':
        train_loader, val_loader = get_dataloader(args, 'train', 'val')
        in_channel = 3
        feature_dim = 64 * 3 * 3
    elif args.dataset == 'omniglot':
        train_loader, val_loader = get_dataloader(args, 'trainval', 'test')
        in_channel = 1
        feature_dim = 64
    else:
        raise ValueError(f"Dataset {args.dataset} is not supported")

    embedding = Embedding(in_channel).to(device)
    model = RelationNetwork(feature_dim).to(device)

    criterion = torch.nn.MSELoss()
    embed_optimizer = torch.optim.Adam(embedding.parameters(), args.lr)
    model_optimizer = torch.optim.Adam(model.parameters(), args.lr)
    cudnn.benchmark = True

    if args.resume:
        try:
            checkpoint = torch.load(sorted(glob(f'{args.log_dir}/checkpoint_*.pth'), key=len)[-1])
        except Exception:
            checkpoint = torch.load(args.log_dir + '/model_best.pth')
        model.load_state_dict(checkpoint['model_state_dict'])
        embedding.load_state_dict(checkpoint['embedding_state_dict'])
        model_optimizer.load_state_dict(checkpoint['model_optimizer_state_dict'])
        embed_optimizer.load_state_dict(checkpoint['embed_optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']
        print(f"load checkpoint {args.exp_name}")
    else:
        start_epoch = 1

    embed_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer=embed_optimizer,
                                                                lr_lambda=lambda epoch: 0.5)
    model_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer=model_optimizer,
                                                                lr_lambda=lambda epoch: 0.5)
    for _ in range(start_epoch):
        embed_scheduler.step()
        model_scheduler.step()

    print(f"model parameter : {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    for epoch in range(start_epoch, args.epochs + 1):
        train_loss = train(train_loader, model, embedding, model_optimizer, embed_optimizer, criterion, epoch)

        is_test = False if epoch % args.test_iter else True
        if is_test or epoch == args.epochs or epoch == 1:
            val_loss, acc1 = validate(val_loader, model, embedding, criterion, epoch)

            if acc1 >= best_acc1:
                is_best = True
                best_acc1 = acc1
            else:
                is_best = False

            save_checkpoint({
                'model_state_dict': model.state_dict(),
                'embedding_state_dict': embedding.state_dict(),
                'model_optimizer_state_dict': model_optimizer.state_dict(),
                'embed_optimizer_state_dict': embed_optimizer.state_dict(),
                'best_acc1': best_acc1,
                'epoch': epoch,
            }, is_best, args)

            if is_best:
                writer.add_scalar("BestAcc", acc1, epoch)
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}, {val_loss:.3f}, {acc1:.3f}, # {best_acc1:.3f}")
        else:
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}")

        embed_scheduler.step()
        model_scheduler.step()

    writer.close()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        print('{batch}/{size} | Loss:{loss:.4f} | top1:{tp1:.4f} | AUROC:{ac:.4f}'.format(
            batch=batch_idx + 1, size=len(val_loader), loss=losses.avg, tp1=top1.avg, ac=arc.avg))

    return (losses.avg, top1.avg, arc.avg)


for epoch in range(opt.start_epoch, opt.epochs):
    opt.lr = optimizer.state_dict()['param_groups'][0]['lr']
    adjust_learning_rate(optimizer, epoch, opt)
    print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, opt.epochs, opt.lr))

    train_loss, train_acc, train_auroc = train(opt, train_loader, model, criterion, optimizer, epoch, use_cuda)
    test_loss, test_acc, test_auroc = test(opt, val_loader, model, criterion, epoch, use_cuda)

    logger.append([opt.lr, train_loss, test_loss, train_acc, test_acc, train_auroc, test_auroc])
    scheduler_warmup.step()

    is_best = test_acc > best_acc
    best_acc = max(test_acc, best_acc)
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'acc': test_acc,
        'best_acc': best_acc,
        'optimizer': optimizer.state_dict(),
    }, is_best, checkpoint=opt.checkpoint)
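# The meters used above (losses, top1, arc, batch_time) follow the common AverageMeter
# pattern; a minimal version is sketched below in case the helper is not included in this
# excerpt. The original repository's implementation may differ in detail.
class AverageMeter(object):
    """Tracks the most recent value and the running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count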
def main():
    global args, best_iou, iterations
    args = parser.parse_args()

    if args.tensorboard:
        from tensorboard_logger import configure
        print("Using tensorboard")
        configure("%s" % (args.dir))

    offset_list = generate_offsets(args.num_offsets)

    # model configurations
    num_classes = args.num_classes
    num_offsets = args.num_offsets

    # model
    model = get_model(num_classes, num_offsets, args.arch, args.pretrain)
    model = model.cuda()

    # dataset
    trainset = COCODataset(args.train_img, args.train_ann, num_classes, offset_list,
                           scale=args.scale, size=(args.train_image_size, args.train_image_size),
                           limits=args.limits, crop=args.crop)
    trainloader = torch.utils.data.DataLoader(trainset, num_workers=4,
                                              batch_size=args.batch_size, shuffle=True)
    valset = COCODataset(args.val_img, args.val_ann, num_classes, offset_list,
                         scale=args.scale, limits=args.limits)
    valloader = torch.utils.data.DataLoader(valset, num_workers=4, batch_size=4)
    num_train = len(trainset)
    num_val = len(valset)
    print('Training samples: {0} \n'
          'Validation samples: {1}'.format(num_train, num_val))

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,
                                nesterov=args.nesterov, weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['best_iou']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            offset_list = checkpoint['offset']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            raise ValueError("=> no checkpoint found at '{}'".format(args.resume))
    print("offsets are: {}".format(offset_list))

    # define loss functions
    if args.loss == 'bce':
        print('Using Binary Cross Entropy Loss')
        criterion_cls = torch.nn.BCEWithLogitsLoss().cuda()
    elif args.loss == 'mbce':
        print('Using Weighted Multiclass BCE Loss')
        criterion_cls = MultiBCEWithLogitsLoss().cuda()
    elif args.loss == 'dice':
        print('Using Soft Dice Loss')
        criterion_cls = SoftDiceLoss().cuda()
    else:
        print('Using Cross Entropy Loss')
        criterion_cls = CrossEntropyLossOneHot().cuda()
    criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda()

    # define learning rate scheduler
    if not args.milestones:
        milestones = [args.epochs]
    else:
        milestones = args.milestones
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1,
                                         last_epoch=args.start_epoch - 1)

    # start iteration count
    iterations = args.start_epoch * int(len(trainset) / args.batch_size)

    # define score metrics
    score_metrics_train = runningScore(num_classes, trainset.catNms)
    score_metrics = runningScore(num_classes, valset.catNms)
    offset_metrics_train = offsetIoU(offset_list)
    offset_metrics_val = offsetIoU(offset_list)

    # train
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step()
        iterations = train(trainloader, model, criterion_cls, criterion_ofs, optimizer,
                           num_classes, args.batch_size, epoch, iterations,
                           print_freq=args.print_freq, log_freq=args.log_freq,
                           tensorboard=args.tensorboard, score_metrics=score_metrics_train,
                           offset_metrics=offset_metrics_train, alpha=args.alpha)
        val_iou = validate(valloader, model, criterion_cls, criterion_ofs, num_classes,
                           args.batch_size, epoch, iterations,
                           print_freq=args.print_freq, log_freq=args.log_freq,
                           tensorboard=args.tensorboard, score_metrics=score_metrics,
                           offset_metrics=offset_metrics_val, alpha=args.alpha)

        # visualize some example outputs after each epoch
        if args.visualize:
            outdir = '{}/imgs/{}'.format(args.dir, epoch + 1)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            sample(num_classes, num_offsets, model, valloader, outdir)

        is_best = val_iou > best_iou
        best_iou = max(val_iou, best_iou)
        save_checkpoint(args.dir, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_iou': best_iou,
            'optimizer': optimizer.state_dict(),
        }, is_best)

    print('Best validation mean iou: ', best_iou)
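# The `runningScore` metric used above is not shown. As an illustration only (the class
# name, constructor arguments, and methods below are assumptions, not the original API),
# a segmentation IoU tracker of this kind typically accumulates a confusion matrix and
# reports the mean IoU over classes:
import numpy as np

class RunningIoU(object):
    """Confusion-matrix based mean-IoU tracker (illustrative stand-in for `runningScore`)."""

    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.confusion = np.zeros((num_classes, num_classes), dtype=np.int64)

    def update(self, label_true, label_pred):
        # accumulate per-pixel (true, pred) pairs into the confusion matrix
        mask = (label_true >= 0) & (label_true < self.num_classes)
        hist = np.bincount(
            self.num_classes * label_true[mask].astype(int) + label_pred[mask],
            minlength=self.num_classes ** 2,
        ).reshape(self.num_classes, self.num_classes)
        self.confusion += hist

    def mean_iou(self):
        h = self.confusion.astype(np.float64)
        iou = np.diag(h) / (h.sum(axis=1) + h.sum(axis=0) - np.diag(h) + 1e-10)
        return np.nanmean(iou)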
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    print("=============> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
    print(model)

    # freeze all layers but the last fc
    # for name, param in model.named_parameters():
    #     if name not in ['fc.weight', 'fc.bias']:
    #         param.requires_grad = False

    # init the fc layer
    model.fc = nn.Linear(512, args.num_class, bias=True)
    model.fc.weight.data.normal_(mean=0.0, std=0.01)
    model.fc.bias.data.zero_()

    # load from pre-trained, before DistributedDataParallel constructor
    if args.pretrained:
        if os.path.isfile(args.pretrained):
            print("=> loading checkpoint '{}'".format(args.pretrained))
            checkpoint = torch.load(args.pretrained, map_location="cpu")

            # rename moco pre-trained keys
            state_dict = checkpoint['state_dict']
            for k in list(state_dict.keys()):
                # retain only encoder_q up to before the embedding layer
                if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
                    # remove prefix
                    state_dict[k[len("module.encoder_q."):]] = state_dict[k]
                # delete renamed or unused k
                del state_dict[k]

            args.start_epoch = 0
            msg = model.load_state_dict(state_dict, strict=False)
            assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}
            print("=> loaded pre-trained model '{}'".format(args.pretrained))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrained))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model)  # .cuda() for debug on cpu

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimize only the linear classifier
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    # assert len(parameters) == 2  # fc.weight, fc.bias
    optimizer = torch.optim.SGD(parameters, args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    normalize_video = transforms_video.NormalizeVideo(mean=[0.485, 0.456, 0.406],
                                                      std=[0.229, 0.224, 0.225])
    video_augmentation_train = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.RandomResizedCropVideo(args.crop_size),
        transforms_video.RandomHorizontalFlipVideo(),
        normalize_video,
    ])
    video_augmentation_val = transforms.Compose([
        transforms_video.ToTensorVideo(),
        transforms_video.CenterCropVideo(args.crop_size),
        normalize_video,
    ])
    data_dir = os.path.join(args.data, 'data')
    anno_dir = os.path.join(args.data, 'anno')
    audio_augmentation = moco.loader.DummyAudioTransform()
    train_augmentation = {'video': video_augmentation_train, 'audio': audio_augmentation}
    val_augmentation = {'video': video_augmentation_val, 'audio': audio_augmentation}

    train_dataset = UCF101(data_dir, anno_dir, args.frame_per_clip, args.step_between_clips,
                           fold=1, train=True, transform=train_augmentation, num_workers=16)
    train_sampler = RandomClipSampler(train_dataset.video_clips, 10)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers, pin_memory=True,
                                               sampler=train_sampler,
                                               multiprocessing_context="fork")

    val_dataset = UCF101(data_dir, anno_dir, args.frame_per_clip, args.step_between_clips,
                         fold=1, train=False, transform=val_augmentation, num_workers=16)
    # Do not use DistributedSampler since it will destroy the testing iteration process
    val_sampler = UniformClipSampler(val_dataset.video_clips, args.clip_per_video)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.clip_per_video,
                                             shuffle=False, num_workers=args.workers,
                                             pin_memory=True, sampler=val_sampler,
                                             multiprocessing_context="fork")

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    if args.multiprocessing_distributed and args.gpu == 0:
        log_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.log_dir, args.batch_size, args.lr,
                                                       args.crop_size, args.frame_per_clip)
        writer = SummaryWriter(log_dir)
    else:
        writer = None

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        val_loss, acc1, acc5 = validate(val_loader, model, criterion, args)
        if writer is not None:
            writer.add_scalar('lincls_val/loss', val_loss, epoch)
            writer.add_scalar('lincls_val/acc1', acc1, epoch)
            writer.add_scalar('lincls_val/acc5', acc5, epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            ckp_dir = "{}_bs={}_lr={}_cs={}_fpc={}".format(args.ckp_dir, args.batch_size, args.lr,
                                                           args.crop_size, args.frame_per_clip)
            save_checkpoint(epoch, {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, ckp_dir, max_save=1, is_best=is_best)
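# `save_checkpoint(epoch, state, ckp_dir, max_save, is_best)` is called above but not shown.
# A plausible rotating-checkpoint helper matching that call signature is sketched below;
# the file naming and pruning behaviour are assumptions, not the original implementation.
import glob
import os
import shutil
import torch

def save_checkpoint(epoch, state, ckp_dir, max_save=1, is_best=False):
    os.makedirs(ckp_dir, exist_ok=True)
    path = os.path.join(ckp_dir, 'checkpoint_{:04d}.pth.tar'.format(epoch))
    torch.save(state, path)
    if is_best:
        # keep a copy of the best checkpoint under a fixed name
        shutil.copyfile(path, os.path.join(ckp_dir, 'model_best.pth.tar'))
    # keep only the `max_save` most recent regular checkpoints
    ckpts = sorted(glob.glob(os.path.join(ckp_dir, 'checkpoint_*.pth.tar')))
    for old in ckpts[:-max_save]:
        os.remove(old)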
    print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, opt.epochs, opt.lr))

    train_loss, train_auroc = train(opt, train_loader, teacher_model, student_model,
                                    criterion, optimizer, epoch, use_cuda)
    test_loss, test_auroc = test(opt, val_target_loader, student_model, criterion, epoch, use_cuda)
    source_loss, source_auroc = test(opt, val_source_loader, student_model, criterion, epoch, use_cuda)

    logger.append([opt.lr, train_loss, test_loss, source_loss,
                   train_auroc, test_auroc, source_auroc])

    is_best = test_auroc + source_auroc > best_acc
    best_acc = max(test_auroc + source_auroc, best_acc)
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': student_model.state_dict(),
        'best_acc': best_acc,
        'optimizer': optimizer.state_dict(),
    }, is_best, checkpoint=checkpoint)

    scheduler_cosine.step()
    scheduler_step.step()

    if (epoch + 1) % 200 == 0:
        teacher_model.load_state_dict(student_model.state_dict())
def main():
    global args, best_acc1, device

    # Init seed
    np.random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed(args.manual_seed)

    if args.dataset.lower() == 'miniimagenet':
        train_loader, val_loader = get_dataloader(args, 'matching_train', 'test')
        in_channel = 3
        lstm_input_size = 1600
    elif args.dataset.lower() == 'omniglot':
        train_loader, val_loader = get_dataloader(args, 'trainval', 'test')
        in_channel = 1
        lstm_input_size = 64
    else:
        raise KeyError(f"Dataset {args.dataset} is not supported")

    model = MatchingNetworks(args.classes_per_it_tr, args.num_support_tr, args.num_query_tr,
                             args.num_query_val, in_channel, args.lstm_layers, lstm_input_size,
                             args.unrolling_steps, fce=True, distance_fn='cosine').to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    cudnn.benchmark = True

    if args.resume:
        try:
            checkpoint = torch.load(sorted(glob(f'{args.log_dir}/checkpoint_*.pth'), key=len)[-1])
        except Exception:
            checkpoint = torch.load(args.log_dir + '/model_best.pth')
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']
        print(f"load checkpoint {args.exp_name}")
    else:
        start_epoch = 1

    print(f"model parameter : {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    for epoch in range(start_epoch, args.epochs + 1):
        train_loss = train(train_loader, model, optimizer, criterion, epoch)

        is_test = False if epoch % args.test_iter else True
        if is_test or epoch == args.epochs or epoch == 1:
            val_loss, acc1 = validate(val_loader, model, criterion, epoch)

            if acc1 >= best_acc1:
                is_best = True
                best_acc1 = acc1
            else:
                is_best = False

            save_checkpoint({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_acc1': best_acc1,
                'epoch': epoch,
            }, is_best, args)

            if is_best:
                writer.add_scalar("Acc/BestAcc", acc1, epoch)
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}, {val_loss:.3f}, {acc1:.3f}, # {best_acc1:.3f}")
        else:
            print(f"[{epoch}/{args.epochs}] {train_loss:.3f}")

    writer.close()
def main(argv, configPath=None): # arguments args = getArgs_(argv, configPath) saveDir = savePath(args) logger = infoLogger(logdir=saveDir, name=args.model) logger.info(argv) logger.debug(cfgInfo(args)) logger.info("CheckPoints path: {}".format(saveDir)) logger.debug("Model Name: {}".format(args.model)) train_dataset = BDD100K_Area_Seg(base_dir=args.dataPath, split='train', target_size=args.size) valid_dataset = BDD100K_Area_Seg(base_dir=args.dataPath, split='val', target_size=args.size) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_worker, pin_memory=True) valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_worker, pin_memory=True) args.num_gpus, args.device = deviceSetting(logger=logger, device=args.device) # model model = Deeplabv3plus_Mobilenet(args.output_channels, output_stride=args.output_stride) optimizer, scheduler = create_optimizer_(model, args) loss_fn = MultiClassCriterion(loss_type=args.loss_type, ignore_index=args.ignore_index) model, trainData = modelDeploy(args, model, optimizer, scheduler, logger) tensorLogger = SummaryWriter(log_dir=os.path.join(saveDir, 'runs'), filename_suffix=args.model) logger.info("Tensorboard event log saved in {}".format( tensorLogger.log_dir)) logger.info('Start training...') # global_step = 0 start_epoch = trainData['epoch'] num_classes = args.output_channels extra_info_ckpt = '{}_{}_{}'.format(args.model, args.size[0], args.size[1]) for i_epoch in range(start_epoch, args.max_epoch): if i_epoch >= 29: optimizer.param_groups[0]["lr"] = np.float64(0.00001) trainData['epoch'] = i_epoch lossList, miouList = train_seg(model, train_loader, i_epoch, optimizer, loss_fn, num_classes, logger, tensorLogger, args=args) scheduler.step() trainData['loss'].extend(lossList) trainData['miou'].extend(miouList) valLoss, valMiou = val_seg(model, valid_loader, i_epoch, loss_fn, num_classes, logger, tensorLogger, args=args) trainData['val'].append([valLoss, valMiou]) best = valMiou > trainData['bestMiou'] if valMiou > trainData['bestMiou']: trainData['bestMiou'] = valMiou weights_dict = model.module.state_dict( ) if args.device == 'cuda' else model.state_dict() save_checkpoint( { 'trainData': trainData, 'model': weights_dict, 'optimizer': optimizer.state_dict(), }, is_best=best, dir=saveDir, extra_info=extra_info_ckpt, miou_val=valMiou, logger=logger) tensorLogger.close()
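# Hedged sketch of a minimal MultiClassCriterion as instantiated above, assuming it simply selects a
# per-pixel loss by name; only a 'CrossEntropy' branch is sketched and other loss_type values are omitted.
import torch.nn as nn


class MultiClassCriterion(nn.Module):
    def __init__(self, loss_type='CrossEntropy', ignore_index=255, **kwargs):
        super().__init__()
        if loss_type == 'CrossEntropy':
            # logits: (N, C, H, W); target: (N, H, W) with class indices
            self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_index, **kwargs)
        else:
            raise NotImplementedError(f'loss_type {loss_type} is not sketched here')

    def forward(self, logits, target):
        return self.criterion(logits, target)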
def main(): global args, best_acc1, device # Init seed np.random.seed(args.manual_seed) torch.manual_seed(args.manual_seed) torch.cuda.manual_seed(args.manual_seed) if args.dataset == 'omniglot': train_loader, val_loader = get_dataloader(args, 'trainval', 'test') input_dim = 1 else: train_loader, val_loader = get_dataloader(args, 'train', 'val') input_dim = 3 if args.model == 'protonet': model = ProtoNet(input_dim).to(device) print("ProtoNet loaded") else: model = ResNet(input_dim).to(device) print("ResNet loaded") criterion = PrototypicalLoss().to(device) optimizer = torch.optim.Adam(model.parameters(), args.lr) cudnn.benchmark = True if args.resume: try: checkpoint = torch.load( sorted(glob(f'{args.log_dir}/checkpoint_*.pth'), key=len)[-1]) except Exception: checkpoint = torch.load(args.log_dir + '/model_best.pth') model.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] print(f"load checkpoint {args.exp_name}") else: start_epoch = 1 scheduler = torch.optim.lr_scheduler.StepLR( optimizer=optimizer, gamma=args.lr_scheduler_gamma, step_size=args.lr_scheduler_step) print( f"model parameter : {sum(p.numel() for p in model.parameters() if p.requires_grad)}" ) for epoch in range(start_epoch, args.epochs + 1): train_loss = train(train_loader, model, optimizer, criterion, epoch) is_test = False if epoch % args.test_iter else True if is_test or epoch == args.epochs or epoch == 1: val_loss, acc1 = validate(val_loader, model, criterion, epoch) if acc1 >= best_acc1: is_best = True best_acc1 = acc1 else: is_best = False save_checkpoint( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'best_acc1': best_acc1, 'optimizer_state_dict': optimizer.state_dict(), }, is_best, args) if is_best: writer.add_scalar("BestAcc", acc1, epoch) print( f"[{epoch}/{args.epochs}] {train_loss:.3f}, {val_loss:.3f}, {acc1:.3f}, # {best_acc1:.3f}" ) else: print(f"[{epoch}/{args.epochs}] {train_loss:.3f}") scheduler.step() writer.close()
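# Hedged sketch of the idea behind the PrototypicalLoss used above: class prototypes are the mean
# support embeddings, and query samples are classified by a softmax over negative squared Euclidean
# distances to those prototypes. The (embeddings, targets, n_support) interface is an assumption.
import torch
import torch.nn.functional as F


def prototypical_loss(embeddings, targets, n_support):
    classes = torch.unique(targets)
    support_idx = [(targets == c).nonzero(as_tuple=True)[0][:n_support] for c in classes]
    query_idx = torch.cat([(targets == c).nonzero(as_tuple=True)[0][n_support:] for c in classes])

    # Prototype per class: mean of its support embeddings.
    prototypes = torch.stack([embeddings[idx].mean(dim=0) for idx in support_idx])
    queries = embeddings[query_idx]

    # Negative squared distances act as class logits.
    dists = torch.cdist(queries, prototypes) ** 2
    log_p = F.log_softmax(-dists, dim=1)

    # Remap query targets to 0..n_classes-1 in the order of `classes`.
    query_labels = torch.cat([
        torch.full((int((targets == c).sum()) - n_support,), i, dtype=torch.long)
        for i, c in enumerate(classes)
    ]).to(log_p.device)

    loss = F.nll_loss(log_p, query_labels)
    acc = (log_p.argmax(dim=1) == query_labels).float().mean()
    return loss, acc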
def main(): global args, best_iou, iterations args = parser.parse_args() if args.tensorboard: from tensorboard_logger import configure print("Using tensorboard") configure("%s" % (args.dir)) # model configurations num_classes = args.num_classes num_offsets = args.num_offsets if args.mode == 'offset': # offset only num_classes = 0 if args.mode == 'class': # class only num_offsets = 0 # model model = get_model(num_classes, num_offsets, args.arch, args.pretrain) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_iou = checkpoint['best_iou'] model.load_state_dict(checkpoint['model_state']) if 'offset' in checkpoint: # class mode doesn't have offset offset_list = checkpoint['offset'] print("offsets are: {}".format(offset_list)) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: raise ValueError("=> no checkpoint found at '{}'".format( args.resume)) # model distribution if args.gpu != -1: # DataParallel wrapper (synchronzied batchnorm edition) if len(args.gpu) > 1: model = DataParallelWithCallback(model, device_ids=args.gpu) model.cuda() # dataset if args.mode == 'all': offset_list = generate_offsets(80 / args.scale, args.num_offsets) trainset = AllDataset(args.train_img, args.train_ann, num_classes, offset_list, scale=args.scale, crop=args.crop, crop_size=(args.crop_size, args.crop_size), limits=args.limits) valset = AllDataset(args.val_img, args.val_ann, num_classes, offset_list, scale=args.scale, limits=args.limits) class_nms = trainset.catNms elif args.mode == 'class': offset_list = None trainset = ClassDataset(args.train_img, args.train_ann, scale=args.scale, crop=args.crop, crop_size=(args.crop_size, args.crop_size), limits=args.limits) valset = ClassDataset(args.val_img, args.val_ann, scale=args.scale, limits=args.limits) class_nms = trainset.catNms elif args.mode == 'offset': offset_list = generate_offsets(80 / args.scale, args.num_offsets) print("offsets are: {}".format(offset_list)) trainset = OffsetDataset(args.train_img, args.train_ann, offset_list, scale=args.scale, crop=args.crop, crop_size=args.crop_size, limits=args.limits) valset = OffsetDataset(args.val_img, args.val_ann, offset_list, scale=args.scale, limits=args.limits) class_nms = None trainloader = torch.utils.data.DataLoader(trainset, num_workers=4, batch_size=args.batch_size, shuffle=True) valloader = torch.utils.data.DataLoader(valset, num_workers=4, batch_size=4) num_train = len(trainset) num_val = len(valset) print('Training samples: {0} \n' 'Validation samples: {1}'.format(num_train, num_val)) # define optimizer optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, nesterov=args.nesterov, weight_decay=args.weight_decay) if args.resume: optimizer.load_state_dict(checkpoint['optimizer']) # # define loss functions criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda() if args.mode == 'all': criterion_cls = torch.nn.BCEWithLogitsLoss().cuda() criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda() elif args.mode == 'class': criterion_cls = torch.nn.BCEWithLogitsLoss().cuda() criterion_ofs = None elif args.mode == 'offset': criterion_cls = None if args.loss == 'bce': print('Using Binary Cross Entropy Loss') criterion_ofs = torch.nn.BCEWithLogitsLoss().cuda() elif args.loss == 'mbce': print('Using Weighted Multiclass BCE Loss') criterion_ofs = MultiBCEWithLogitsLoss().cuda() elif args.loss == 
'dice': print('Using Soft Dice Loss (0 mode)') criterion_ofs = SoftDiceLoss(mode='0').cuda() else: print('Using Cross Entropy Loss') criterion_ofs = CrossEntropyLossOneHot().cuda() # define learning rate scheduler if not args.milestones: milestones = [args.epochs] else: milestones = args.milestones scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.2, last_epoch=args.start_epoch - 1) # start iteration count iterations = args.start_epoch * int(len(trainset) / args.batch_size) # train for epoch in range(args.start_epoch, args.epochs): scheduler.step() iterations = train(trainloader, model, optimizer, args.batch_size, epoch, iterations, criterion_cls=criterion_cls, class_nms=class_nms, criterion_ofs=criterion_ofs, offset_list=offset_list, print_freq=args.print_freq, log_freq=args.log_freq, tensorboard=args.tensorboard, score=args.score, alpha=args.alpha) val_iou = validate(valloader, model, args.batch_size, epoch, iterations, criterion_cls=criterion_cls, class_nms=class_nms, criterion_ofs=criterion_ofs, offset_list=offset_list, print_freq=args.print_freq, log_freq=args.log_freq, tensorboard=args.tensorboard, score=args.score, alpha=args.alpha) # visualize some example outputs after each epoch if args.visual_freq > 0 and epoch % args.visual_freq == 0: outdir = '{}/imgs/{}'.format(args.dir, epoch) if not os.path.exists(outdir): os.makedirs(outdir) sample(model, valloader, outdir, num_classes, num_offsets) # save checkpoint is_best = val_iou > best_iou best_iou = max(val_iou, best_iou) if args.gpu != -1 and len(args.gpu) > 1: state_dict = { 'epoch': epoch + 1, 'model_state': model.module.state_dict(), # remove 'module' in checkpoint 'best_iou': best_iou, 'optimizer': optimizer.state_dict() } else: state_dict = { 'epoch': epoch + 1, 'model_state': model.state_dict(), 'best_iou': best_iou, 'optimizer': optimizer.state_dict() } if args.mode != 'class': state_dict['offset'] = offset_list save_checkpoint(args.dir, state_dict, is_best) print('Best validation mean iou: ', best_iou)
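# Hedged sketch of a soft Dice loss in the spirit of the SoftDiceLoss selected above; the original's
# 'mode' argument is not reproduced, and the sigmoid-on-logits / one-hot-target convention is an assumption.
import torch
import torch.nn as nn


class SoftDiceLoss(nn.Module):
    def __init__(self, smooth=1.0):
        super().__init__()
        self.smooth = smooth

    def forward(self, logits, targets):
        # logits, targets: (N, C, H, W); targets are one-hot / binary maps.
        probs = torch.sigmoid(logits)
        dims = (0, 2, 3)
        intersection = (probs * targets).sum(dims)
        cardinality = probs.sum(dims) + targets.sum(dims)
        dice = (2.0 * intersection + self.smooth) / (cardinality + self.smooth)
        return 1.0 - dice.mean()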
def main_worker(gpu_idx, configs): configs.gpu_idx = gpu_idx if configs.gpu_idx is not None: print("Use GPU: {} for training".format(configs.gpu_idx)) configs.device = torch.device('cuda:{}'.format(configs.gpu_idx)) if configs.distributed: if configs.dist_url == "env://" and configs.rank == -1: configs.rank = int(os.environ["RANK"]) if configs.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx dist.init_process_group(backend=configs.dist_backend, init_method=configs.dist_url, world_size=configs.world_size, rank=configs.rank) configs.is_master_node = (not configs.distributed) or ( configs.distributed and (configs.rank % configs.ngpus_per_node == 0)) if configs.is_master_node: logger = Logger(configs.logs_dir, configs.saved_fn) logger.info('>>> Created a new logger') logger.info('>>> configs: {}'.format(configs)) tb_writer = SummaryWriter( log_dir=os.path.join(configs.logs_dir, 'tensorboard')) else: logger = None tb_writer = None # model model = create_model(configs) # Data Parallel model = make_data_parallel(model, configs) # Freeze model model = freeze_model(model, configs.freeze_modules_list) if configs.is_master_node: num_parameters = get_num_parameters(model) logger.info('number of trained parameters of the model: {}'.format( num_parameters)) optimizer = create_optimizer(configs, model) lr_scheduler = create_lr_scheduler(optimizer, configs) best_val_loss = np.inf earlystop_count = 0 is_best = False # optionally load weight from a checkpoint if configs.pretrained_path is not None: model = load_pretrained_model(model, configs.pretrained_path, gpu_idx, configs.overwrite_global_2_local) if logger is not None: logger.info('loaded pretrained model at {}'.format( configs.pretrained_path)) # optionally resume from a checkpoint if configs.resume_path is not None: checkpoint = resume_model(configs.resume_path, configs.arch, configs.gpu_idx) if hasattr(model, 'module'): model.module.load_state_dict(checkpoint['state_dict']) else: model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) best_val_loss = checkpoint['best_val_loss'] earlystop_count = checkpoint['earlystop_count'] configs.start_epoch = checkpoint['epoch'] + 1 if logger is not None: logger.info(">>> Loading dataset & getting dataloader...") # Create dataloader train_loader, val_loader, train_sampler = create_train_val_dataloader( configs) test_loader = create_test_dataloader(configs) if logger is not None: logger.info('number of batches in train set: {}'.format( len(train_loader))) if val_loader is not None: logger.info('number of batches in val set: {}'.format( len(val_loader))) logger.info('number of batches in test set: {}'.format( len(test_loader))) if configs.evaluate: assert val_loader is not None, "The validation should not be None" val_loss = evaluate_one_epoch(val_loader, model, configs.start_epoch - 1, configs, logger) print('Evaluate, val_loss: {}'.format(val_loss)) return for epoch in range(configs.start_epoch, configs.num_epochs + 1): # Get the current learning rate for param_group in optimizer.param_groups: lr = param_group['lr'] if logger is not None: logger.info('{}'.format('*-' * 40)) logger.info('{} {}/{} {}'.format('=' * 35, epoch, configs.num_epochs, '=' * 35)) logger.info('{}'.format('*-' * 40)) logger.info('>>> Epoch: [{}/{}] learning rate: {:.2e}'.format( epoch, 
configs.num_epochs, lr)) if configs.distributed: train_sampler.set_epoch(epoch) # train for one epoch train_loss = train_one_epoch(train_loader, model, optimizer, epoch, configs, logger) loss_dict = {'train': train_loss} if not configs.no_val: val_loss = evaluate_one_epoch(val_loader, model, epoch, configs, logger) is_best = val_loss <= best_val_loss best_val_loss = min(val_loss, best_val_loss) loss_dict['val'] = val_loss if not configs.no_test: test_loss = evaluate_one_epoch(test_loader, model, epoch, configs, logger) loss_dict['test'] = test_loss # Write tensorboard if tb_writer is not None: tb_writer.add_scalars('Loss', loss_dict, epoch) # Save checkpoint if configs.is_master_node and (is_best or ( (epoch % configs.checkpoint_freq) == 0)): saved_state = get_saved_state(model, optimizer, lr_scheduler, epoch, configs, best_val_loss, earlystop_count) save_checkpoint(configs.checkpoints_dir, configs.saved_fn, saved_state, is_best, epoch) # Check early stop training if configs.earlystop_patience is not None: earlystop_count = 0 if is_best else (earlystop_count + 1) print_string = ' |||\t earlystop_count: {}'.format(earlystop_count) if configs.earlystop_patience <= earlystop_count: print_string += '\n\t--- Early stopping!!!' break else: print_string += '\n\t--- Continue training..., earlystop_count: {}'.format( earlystop_count) if logger is not None: logger.info(print_string) # Adjust learning rate if configs.lr_type == 'plateau': assert (not configs.no_val ), "Only use plateau when having validation set" lr_scheduler.step(val_loss) else: lr_scheduler.step() if tb_writer is not None: tb_writer.close() if configs.distributed: cleanup()
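# Hedged sketch of a get_saved_state consistent with the resume logic above: the keys mirror those the
# checkpoint loader reads back ('state_dict', 'optimizer', 'lr_scheduler', 'epoch', 'best_val_loss',
# 'earlystop_count'); unwrapping DataParallel via .module is an assumption about make_data_parallel.
def get_saved_state(model, optimizer, lr_scheduler, epoch, configs,
                    best_val_loss, earlystop_count):
    model_state = model.module.state_dict() if hasattr(model, 'module') \
        else model.state_dict()
    return {
        'epoch': epoch,
        'state_dict': model_state,
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'best_val_loss': best_val_loss,
        'earlystop_count': earlystop_count,
    }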
def main(): args = get_args() # Log log_format = '[%(asctime)s] %(message)s' logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%d %I:%M:%S') t = time.time() local_time = time.localtime(t) if not os.path.exists('./log'): os.mkdir('./log') fh = logging.FileHandler( os.path.join('log/train-{}{:02}{}'.format(local_time.tm_year % 2000, local_time.tm_mon, t))) fh.setFormatter(logging.Formatter(log_format)) logging.getLogger().addHandler(fh) input_size = efficientnet_lite_params[args.model_name][2] use_gpu = False if torch.cuda.is_available(): use_gpu = True assert os.path.exists(args.train_dir) train_dataset = datasets.ImageFolder( args.train_dir, transforms.Compose([ transforms.RandomResizedCrop(input_size), transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4), transforms.RandomHorizontalFlip(0.5), transforms.ToTensor(), transforms.Normalize(MEAN_RGB, STDDEV_RGB) ])) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=use_gpu) train_dataprovider = DataIterator(train_loader) assert os.path.exists(args.val_dir) val_loader = torch.utils.data.DataLoader(datasets.ImageFolder( args.val_dir, transforms.Compose([ transforms.Resize(input_size + CROP_PADDING, interpolation=PIL.Image.BICUBIC), transforms.CenterCrop(input_size), transforms.ToTensor(), transforms.Normalize(MEAN_RGB, STDDEV_RGB) ])), batch_size=200, shuffle=False, num_workers=args.num_workers, pin_memory=use_gpu) val_dataprovider = DataIterator(val_loader) print('load data successfully') model = build_efficientnet_lite(args.model_name, args.num_classes) optimizer = torch.optim.SGD(get_parameters(model), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1) if use_gpu: model = nn.DataParallel(model) loss_function = criterion_smooth.cuda() device = torch.device("cuda") else: loss_function = criterion_smooth device = torch.device("cpu") scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lambda step: (1.0 - step / args.total_iters) if step <= args.total_iters else 0, last_epoch=-1) model = model.to(device) all_iters = 0 if args.auto_continue: lastest_model, iters = get_lastest_model() if lastest_model is not None: all_iters = iters checkpoint = torch.load(lastest_model, map_location=None if use_gpu else 'cpu') model.load_state_dict(checkpoint['state_dict'], strict=True) print('load from checkpoint') for i in range(iters): scheduler.step() args.optimizer = optimizer args.loss_function = loss_function args.scheduler = scheduler args.train_dataprovider = train_dataprovider args.val_dataprovider = val_dataprovider if args.eval: if args.eval_resume is not None: checkpoint = torch.load(args.eval_resume, map_location=None if use_gpu else 'cpu') load_checkpoint(model, checkpoint) validate(model, device, args, all_iters=all_iters) exit(0) while all_iters < args.total_iters: all_iters = train(model, device, args, val_interval=args.val_interval, bn_process=False, all_iters=all_iters) validate(model, device, args, all_iters=all_iters) all_iters = train(model, device, args, val_interval=int(1280000 / args.batch_size), bn_process=True, all_iters=all_iters) validate(model, device, args, all_iters=all_iters) save_checkpoint({ 'state_dict': model.state_dict(), }, args.total_iters, tag='bnps-')
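# Hedged sketch of a DataIterator as instantiated above: a thin wrapper that turns a finite DataLoader
# into an endless stream with a .next() method, restarting the underlying iterator when it is exhausted.
# The real class may differ in details (e.g. worker handling or prefetching).
class DataIterator:
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(dataloader)

    def next(self):
        try:
            return next(self.iterator)
        except StopIteration:
            # Epoch boundary: restart the loader and keep serving batches.
            self.iterator = iter(self.dataloader)
            return next(self.iterator)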