def main(): rank, world_size = dist_init() cfg.merge_from_file(args.cfg) if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) logger.info('dist init done!') train_dataloader = build_data_loader() model = get_model('BaseSiamModel').cuda().train() dist_model = DistModule(model) optimizer, lr_scheduler = build_optimizer_lr(dist_model.module, cfg.TRAIN.START_EPOCH) if cfg.TRAIN.BACKBONE_PRETRAIN: logger.info('load backbone from {}.'.format(cfg.TRAIN.BACKBONE_PATH)) model.backbone = load_pretrain(model.backbone, cfg.TRAIN.BACKBONE_PATH) logger.info('load backbone done!') if cfg.TRAIN.RESUME: logger.info('resume from {}'.format(cfg.TRAIN.RESUME_PATH)) model, optimizer, cfg.TRAIN.START_EPOCH = restore_from( model, optimizer, cfg.TRAIN.RESUME_PATH) logger.info('resume done!') elif cfg.TRAIN.PRETRAIN: logger.info('load pretrain from {}.'.format(cfg.TRAIN.PRETRAIN_PATH)) model = load_pretrain(model, cfg.TRAIN.PRETRAIN_PATH) logger.info('load pretrain done') dist_model = DistModule(model) train(train_dataloader, dist_model, optimizer, lr_scheduler)
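# Nearly every entry point in this file relies on the same pair of log_helper
# utilities, init_log and add_file_handler. Their exact signatures vary between
# projects (one script below calls init_log('global', args.save_dir, logging.INFO)),
# so the following is only a minimal sketch of what these helpers are assumed to
# do, not any project's actual implementation.
import logging
import os

_LOG_FMT = logging.Formatter('%(asctime)s %(name)s %(levelname)s: %(message)s')


def init_log(name, level=logging.INFO):
    """Create (or fetch) a named logger with a single console handler."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    if not logger.handlers:  # idempotent: do not stack handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(_LOG_FMT)
        logger.addHandler(handler)
    return logger


def add_file_handler(name, log_file, level=logging.INFO):
    """Additionally mirror the named logger's output into a file on disk."""
    os.makedirs(os.path.dirname(log_file) or '.', exist_ok=True)
    handler = logging.FileHandler(log_file)
    handler.setLevel(level)
    handler.setFormatter(_LOG_FMT)
    logging.getLogger(name).addHandler(handler)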
def main(): global args, logger, v_id args = parser.parse_args() cfg = load_config(args) init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info(args) # setup model if args.arch == 'Custom': from custom import Custom model = Custom(anchors=cfg['anchors']) else: parser.error('invalid architecture: {}'.format(args.arch)) if args.resume: assert isfile(args.resume), '{} is not a valid file'.format(args.resume) model = load_pretrain(model, args.resume) model.eval() device = torch.device('cuda' if (torch.cuda.is_available() and not args.cpu) else 'cpu') model = model.to(device) # setup dataset dataset = load_dataset(args.dataset) # VOS or VOT? if args.dataset in ['DAVIS','DAVIS2016', 'DAVIS2017', 'ytb_vos'] and args.mask: vos_enable = True # enable Mask output else: vos_enable = False total_lost = 0 # VOT iou_lists = [] # VOS speed_list = [] for v_id, video in enumerate(dataset.keys(), start=1): if args.video != '' and video != args.video: continue if vos_enable: iou_list, speed = track_vos(model, dataset[video], cfg['hp'] if 'hp' in cfg.keys() else None, args.mask, args.refine, args.dataset in ['DAVIS2017', 'ytb_vos'], device=device) iou_lists.append(iou_list) else: lost, speed = track_vot(model, dataset[video], cfg['hp'] if 'hp' in cfg.keys() else None, args.mask, args.refine, device=device) total_lost += lost speed_list.append(speed) # report final result if vos_enable: for thr, iou in zip(thrs, np.mean(np.concatenate(iou_lists), axis=0)): logger.info('Segmentation Threshold {:.2f} mIoU: {:.3f}'.format(thr, iou)) else: logger.info('Total Lost: {:d}'.format(total_lost)) logger.info('Mean Speed: {:.2f} FPS'.format(np.mean(speed_list)))
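# The final VOS report above averages per-object IoU curves over all videos.
# `thrs` is assumed to be a module-level array of segmentation thresholds (it is
# not defined inside main()). A self-contained illustration of that aggregation,
# using made-up IoU arrays:
import numpy as np

thrs = np.arange(0.3, 0.5, 0.05)  # assumed stand-in for the module-level `thrs`

# Each video contributes an (objects, thresholds) IoU array; the report
# concatenates them and averages over all objects for every threshold.
iou_lists = [np.random.rand(3, len(thrs)), np.random.rand(2, len(thrs))]
for thr, iou in zip(thrs, np.mean(np.concatenate(iou_lists), axis=0)):
    print('Segmentation Threshold {:.2f} mIoU: {:.3f}'.format(thr, iou))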
def main(): cfg.merge_from_file(args.cfg) if not os.path.exists(cfg.PRUNING.FINETUNE.LOG_DIR): os.makedirs(cfg.PRUNING.FINETUNE.LOG_DIR) init_log('global', logging.INFO) if cfg.PRUNING.FINETUNE.LOG_DIR: add_file_handler( 'global', os.path.join(cfg.PRUNING.FINETUNE.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) train_dataloader = build_data_loader() model = PruningSiamModel() # load the pretrained weights into the pruning model logger.info('load pretrain from {}.'.format( cfg.PRUNING.FINETUNE.PRETRAIN_PATH)) model = load_pretrain(model, cfg.PRUNING.FINETUNE.PRETRAIN_PATH) logger.info('load pretrain done') logger.info('begin pruning the model') model = prune_model(model).cuda().train() logger.info('pruning finished!') optimizer, lr_scheduler = build_optimizer_lr( model, cfg.PRUNING.FINETUNE.START_EPOCH) if cfg.PRUNING.FINETUNE.RESUME: logger.info('resume from {}'.format(cfg.PRUNING.FINETUNE.RESUME_PATH)) model, optimizer, cfg.PRUNING.FINETUNE.START_EPOCH = restore_from( model, optimizer, cfg.PRUNING.FINETUNE.RESUME_PATH) logger.info('resume done!') train(train_dataloader, model, optimizer, lr_scheduler)
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() # args obtained from command-line parsing init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') # instantiate a logger logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps( cfg, indent=4))) # dump the config as JSON, indented by 4 spaces if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, anchors=cfg['anchors']) else: exit() logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) model = model.cuda() # move the model to the GPU dist_model = torch.nn.DataParallel( model, list(range(torch.cuda.device_count()))).cuda() # multi-GPU training if args.resume and args.start_epoch != 0: # when resuming from a non-zero epoch, restore how much of the backbone has been unfixed model.features.unfix((args.start_epoch - 1) / args.epochs) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # build the optimizer and learning-rate scheduler # optionally resume from a checkpoint if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from( model, optimizer, args.resume) dist_model = torch.nn.DataParallel( model, list(range(torch.cuda.device_count()))).cuda() logger.info(lr_scheduler) logger.info('model prepare done') train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() init_log('global', logging.INFO) # initialize the 'global' logger; logging.INFO is the log level if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') # fetch the logger initialized above logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) # returns the merged configuration object logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # json.loads() turns a str into a dict; json.dumps() turns a dict into a str if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, anchors=cfg['anchors']) else: exit() logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) model = model.cuda() dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # optionally resume from a checkpoint if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from(model, optimizer, args.resume) dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() logger.info(lr_scheduler) logger.info('model prepare done') train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def create_app(config_name): app = Flask(__name__) # enable cross-origin requests CORS(app, supports_credentials=True) app.config.from_object(config[config_name]) config[config_name].init_app(app) ### initialize the database # db.init_app(app) # allow Chinese text in JSON responses (do not force ASCII) app.config['JSON_AS_ASCII'] = False ### initialize logging ### init_log() api.init_app(app) return app
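# Hypothetical usage of the create_app factory above; the 'default' key and the
# port are assumptions for illustration, not values taken from the project's
# config module.
app = create_app('default')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)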
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() args = args_process(args) init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # build dataset train_loader, val_loader = build_data_loader(cfg) args.img_size = int(cfg['train_datasets']['search_size']) args.nms_threshold = float(cfg['train_datasets']['RPN_NMS']) if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, opts=args, anchors=train_loader.dataset.anchors) else: exit() logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) else: raise Exception("Pretrained weights must be loaded!") model = model.cuda() dist_model = torch.nn.DataParallel(model, list(range( torch.cuda.device_count()))).cuda() logger.info('model prepare done') logger = logging.getLogger('global') val_avg = AverageMeter() validation(val_loader, dist_model, cfg, val_avg)
def main(): cfg.merge_from_file(args.cfg) if not os.path.exists(cfg.META.LOG_DIR): os.makedirs(cfg.META.LOG_DIR) init_log("global", logging.INFO) if cfg.META.LOG_DIR: add_file_handler("global", os.path.join(cfg.META.LOG_DIR, "logs.txt"), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) model = MetaSiamModel().cuda() model = load_pretrain(model, cfg.META.PRETRAIN_PATH) # initialize meta-training model.meta_train_init() # parameters to optimize optimizer = build_optimizer(model) dataloader = build_dataloader() meta_train(dataloader, optimizer, model)
def main(): cfg.merge_from_file(args.cfg) if not os.path.exists(cfg.GRAD.LOG_DIR): os.makedirs(cfg.GRAD.LOG_DIR) init_log("global", logging.INFO) if cfg.GRAD.LOG_DIR: add_file_handler("global", os.path.join(cfg.GRAD.LOG_DIR, "logs.txt"), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) model = get_model('GradSiamModel').cuda() model = load_pretrain(model, cfg.GRAD.PRETRAIN_PATH) # parameters to optimize optimizer = build_optimizer(model) dataloader = build_dataloader() if cfg.GRAD.RESUME: logger.info('resume from {}'.format(cfg.GRAD.RESUME_PATH)) model, optimizer, cfg.GRAD.START_EPOCH = restore_from( model, optimizer, cfg.GRAD.RESUME_PATH) logger.info('resume done!') model.freeze_model() train(dataloader, optimizer, model)
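# build_optimizer above is expected to collect only the parameters that still
# require gradients ("parameters to optimize"); a minimal sketch under that
# assumption, with illustrative hyperparameters rather than the projects' own.
import torch


def build_optimizer(model, lr=1e-3, weight_decay=1e-4):
    trainable = [p for p in model.parameters() if p.requires_grad]
    return torch.optim.SGD(trainable, lr=lr, momentum=0.9,
                           weight_decay=weight_decay)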
def main(): seed_torch(123456) cfg.merge_from_file(args.cfg) init_log('global', logging.INFO) base_model = get_model(cfg.MODEL_ARC) base_model = load_pretrain(base_model, args.snapshot).cuda().eval() # # if want test model pruned # base_model = prune_model(base_model).cuda().eval() # refine the model # if want to test real pruning # base_model = get_model(cfg.MODEL_ARC) # base_model = load_pretrain(base_model, cfg.PRUNING.FINETUNE.PRETRAIN_PATH) # load the mask # base_model = prune_model(base_model) # refine the model # base_model=load_pretrain(base_model,args.snapshot).cuda().eval() # load the finetune weight tracker = get_tracker(args.tracker, base_model) data_dir = os.path.join(cfg.TRACK.DATA_DIR, args.dataset) dataset = get_dataset(args.dataset, data_dir) if args.dataset in ['VOT2016', 'VOT2018']: vot_evaluate(dataset, tracker) elif args.dataset == 'GOT-10k': ope_evaluate(dataset, tracker)
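# seed_torch(123456) above fixes every source of randomness before evaluation.
# The project's helper is not shown here, so this is only a typical
# implementation it is assumed to resemble.
import os
import random

import numpy as np
import torch


def seed_torch(seed=123456):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False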
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg)
def main(): global xent_criterion, triplet_criterion, ment_criterion logger.info("init done") if os.path.exists(cfg.TRAIN.LOG_DIR): shutil.rmtree(cfg.TRAIN.LOG_DIR) os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) dataset, train_loader, _, _ = build_data_loader() model = BagReID_IBN(dataset.num_train_pids, dataset.num_train_mates) xent_criterion = CrossEntropyLabelSmooth(dataset.num_train_pids) triplet_criterion = TripletLoss(margin=cfg.TRAIN.TRI_MARGIN) ment_criterion = CrossEntropyMate(cfg.TRAIN.MATE_LOSS_WEIGHT) if cfg.TRAIN.OPTIM == "sgd": optimizer = torch.optim.SGD(model.parameters(), lr=cfg.SOLVER.LEARNING_RATE, momentum=cfg.SOLVER.MOMENTUM, weight_decay=cfg.SOLVER.WEIGHT_DECAY) else: optimizer = torch.optim.Adam(model.parameters(), lr=cfg.SOLVER.LEARNING_RATE, weight_decay=cfg.SOLVER.WEIGHT_DECAY) optimizers = [optimizer] schedulers = build_lr_schedulers(optimizers) if cfg.CUDA: model.cuda() if torch.cuda.device_count() > 1: model = DataParallel(model) if cfg.TRAIN.LOG_DIR: summary_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: summary_writer = None logger.info("model prepare done") start_epoch = cfg.TRAIN.START_EPOCH # start training for epoch in range(start_epoch, cfg.TRAIN.NUM_EPOCHS): train(epoch, train_loader, model, criterion, optimizers, summary_writer) for scheduler in schedulers: scheduler.step() # skip if not save model if cfg.TRAIN.EVAL_STEP > 0 and (epoch + 1) % cfg.TRAIN.EVAL_STEP == 0 \ or (epoch + 1) == cfg.TRAIN.NUM_EPOCHS: if cfg.CUDA and torch.cuda.device_count() > 1: state_dict = model.module.state_dict() else: state_dict = model.state_dict() save_checkpoint({ 'state_dict': state_dict, 'epoch': epoch + 1 }, is_best=False, save_dir=cfg.TRAIN.SNAPSHOT_DIR, filename='checkpoint_ep' + str(epoch + 1) + '.pth.tar')
def main(): init_log('global', logging.INFO) logger = logging.getLogger('global') global args, best_recall args = parser.parse_args() cfg = load_config(args.config) if args.dist: logger.info('dist:{}'.format(args.dist)) dist_init(args.port, backend=args.backend) # build dataset train_loader, val_loader = build_data_loader(args.dataset, cfg) # if args.arch == 'resnext_101_64x4d_deform_maskrcnn': # model = resnext_101_64x4d_deform_maskrcnn(cfg = cfg['shared']) # elif args.arch == 'FishMask': # model = FishMask(cfg = cfg['shared']) # else: # if args.arch.find('fpn'): # arch = args.arch.replace('fpn', '') # model = resnet_fpn.__dict__[arch](pretrained=False, cfg = cfg['shared']) # else: model = resnet.__dict__[args.arch](pretrained=False, cfg=cfg['shared']) logger.info('build model done') logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) trainable_params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.SGD(trainable_params, args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_recall, arch = restore_from( model, optimizer, args.resume) model = model.cuda() if args.dist: broadcast_params(model) logger.info('build dataloader done') if args.evaluate: rc = validate(val_loader, model, cfg) logger.info('recall=%f' % rc) return # warmup to enlarge lr if args.start_epoch == 0 and args.warmup_epochs > 0: world_size = 1 try: world_size = dist.get_world_size() except Exception as e: print(e) rate = world_size * args.batch_size warmup_iter = args.warmup_epochs * len(train_loader) assert (warmup_iter > 1) gamma = rate**(1.0 / (warmup_iter - 1)) lr_scheduler = IterExponentialLR(optimizer, gamma) for epoch in range(args.warmup_epochs): logger.info('warmup epoch %d' % (epoch)) train(train_loader, model, lr_scheduler, epoch + 1, cfg, warmup=True) # overwrite initial_lr with magnified lr through warmup for group in optimizer.param_groups: group['initial_lr'] = group['lr'] logger.info('warmup for %d epochs done, start large batch training' % args.warmup_epochs) lr_scheduler = MultiStepLR(optimizer, milestones=args.step_epochs, gamma=0.1, last_epoch=args.start_epoch - 1) for epoch in range(args.start_epoch, args.epochs): logger.info('step_epochs:{}'.format(args.step_epochs)) lr_scheduler.step() lr = lr_scheduler.get_lr()[0] # train for one epoch train(train_loader, model, lr_scheduler, epoch + 1, cfg) if (epoch + 1) % 5 == 0 or epoch + 1 == args.epochs: # evaluate on validation set recall = validate(val_loader, model, cfg) # remember best prec@1 and save checkpoint is_best = recall > best_recall best_recall = max(recall, best_recall) logger.info('recall %f(%f)' % (recall, best_recall)) if (not args.dist) or (dist.get_rank() == 0): if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) save_path = os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch + 1)) torch.save( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.cpu().state_dict(), 'best_recall': best_recall, 'optimizer': optimizer.state_dict(), }, save_path)
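# The warmup block above picks gamma so that multiplying the learning rate by
# gamma once per iteration magnifies it by exactly rate = world_size * batch_size
# over the warmup period. A quick numeric check (world size, batch size and
# warmup length are illustrative values, not the script's arguments):
world_size, batch_size = 8, 32
rate = world_size * batch_size            # 256
warmup_iter = 1000
gamma = rate ** (1.0 / (warmup_iter - 1))
print(gamma ** (warmup_iter - 1))         # ~256.0, i.e. lr ends at base_lr * rate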
def main(): global args, device, max_acc, writer max_acc = -1 args = parser.parse_args() if args.arch == 'SharpMask': trainSm = True args.hfreq = 1 args.gSz = args.iSz else: trainSm = False # Setup experiments results path pathsv = 'sharpmask/train' if trainSm else 'deepmask/train' args.rundir = join(args.rundir, pathsv) try: if not isdir(args.rundir): makedirs(args.rundir) except OSError as err: print(err) # Setup logger init_log('global', logging.INFO) add_file_handler('global', join(args.rundir, 'train.log'), logging.INFO) logger = logging.getLogger('global') logger.info('running in directory %s' % args.rundir) logger.info(args) writer = SummaryWriter(log_dir=join(args.rundir, 'tb')) # Get argument defaults (hastag #thisisahack) parser.add_argument('--IGNORE', action='store_true') defaults = vars(parser.parse_args(['--IGNORE'])) # Print all arguments, color the non-defaults for argument, value in sorted(vars(args).items()): reset = colorama.Style.RESET_ALL color = reset if value == defaults[argument] else colorama.Fore.MAGENTA logger.info('{}{}: {}{}'.format(color, argument, value, reset)) # Setup seeds torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) # Setup device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Setup Model model = (models.__dict__[args.arch](args)).to(device) model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count())) logger.info(model) # Setup data loader train_dataset = get_loader(args.dataset)(args, split='train') val_dataset = get_loader(args.dataset)(args, split='val') train_loader = data.DataLoader(train_dataset, batch_size=args.batch, num_workers=args.workers, pin_memory=True, sampler=None) val_loader = data.DataLoader(val_dataset, batch_size=args.batch, num_workers=args.workers, pin_memory=True, sampler=None) # Setup Metrics criterion = nn.SoftMarginLoss().to(device) # Setup optimizer, lr_scheduler and loss function optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer, milestones=[50, 120], gamma=0.3) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): logger.info("loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] max_acc = checkpoint['max_acc'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger.info("loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: logger.warning("no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True for epoch in range(args.start_epoch, args.maxepoch): scheduler.step(epoch=epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set if epoch % 2 == 1: acc = validate(val_loader, model, criterion, epoch) is_best = acc > max_acc max_acc = max(acc, max_acc) # remember best mean loss and save checkpoint save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'max_acc': max_acc, 'optimizer': optimizer.state_dict(), }, is_best, args.rundir)
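# The DeepMask/SharpMask loop above relies on a save_checkpoint helper that keeps
# the latest state plus a "best" copy. A minimal sketch of that convention; the
# file names are assumptions, not necessarily the repository's own.
import shutil
from os.path import join

import torch


def save_checkpoint(state, is_best, rundir, filename='checkpoint.pth.tar'):
    path = join(rundir, filename)
    torch.save(state, path)                 # always keep the latest state
    if is_best:                              # copy it whenever the metric improves
        shutil.copyfile(path, join(rundir, 'model_best.pth.tar'))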
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() args = args_process(args) init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) args.img_size = int(cfg['train_datasets']['search_size']) args.nms_threshold = float(cfg['train_datasets']['RPN_NMS']) if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, opts=args, anchors=train_loader.dataset.anchors) else: exit() logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) model = model.cuda() dist_model = torch.nn.DataParallel(model, list(range( torch.cuda.device_count()))).cuda() if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # optionally resume from a checkpoint if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from( model, optimizer, args.resume) dist_model = torch.nn.DataParallel( model, list(range(torch.cuda.device_count()))).cuda() logger.info(lr_scheduler) logger.info('model prepare done') global cur_lr if not os.path.exists(args.save_dir): # makedir/save model os.makedirs(args.save_dir) num_per_epoch = len(train_loader.dataset) // args.batch num_per_epoch_val = len(val_loader.dataset) // args.batch for epoch in range(args.start_epoch, args.epochs): lr_scheduler.step(epoch) cur_lr = lr_scheduler.get_cur_lr() logger = logging.getLogger('global') train_avg = AverageMeter() val_avg = AverageMeter() if dist_model.module.features.unfix(epoch / args.epochs): logger.info('unfix part model.') optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg, args, epoch) train(train_loader, dist_model, optimizer, lr_scheduler, epoch, cfg, train_avg, num_per_epoch) if dist_model.module.features.unfix(epoch / args.epochs): logger.info('unfix part model.') optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg, args, epoch) if (epoch + 1) % args.save_freq == 0: save_checkpoint( { 'epoch': epoch, 'arch': args.arch, 'state_dict': dist_model.module.state_dict(), 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), 'anchor_cfg': cfg['anchors'] }, False, os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)), os.path.join(args.save_dir, 'best.pth')) validation(val_loader, dist_model, epoch, cfg, val_avg, num_per_epoch_val)
def main(): # parse command-line arguments global args, logger, v_id args = parser.parse_args() # load the configuration: mainly the network architecture and hyperparameters cfg = load_config(args) # initialize logging and also write log messages to a file on disk init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) # write the relevant configuration to the log file logger = logging.getLogger('global') logger.info(args) # setup model # load the network architecture if args.arch == 'Custom': from custom import Custom model = Custom(anchors=cfg['anchors']) else: parser.error('invalid architecture: {}'.format(args.arch)) # load the network weights if args.resume: assert isfile(args.resume), '{} is not a valid file'.format( args.resume) model = load_pretrain(model, args.resume) # switch to evaluation mode (disables dropout, freezes batch-norm statistics) model.eval() # select the device device = torch.device('cuda' if ( torch.cuda.is_available() and not args.cpu) else 'cpu') model = model.to(device) # setup dataset: load the dataset dataset = load_dataset(args.dataset) # these three datasets support masks -- VOS or VOT? if args.dataset in ['DAVIS2016', 'DAVIS2017', 'ytb_vos'] and args.mask: vos_enable = True # enable Mask output else: vos_enable = False total_lost = 0 # VOT iou_lists = [] # VOS speed_list = [] # iterate over the videos for v_id, video in enumerate(dataset.keys(), start=1): if args.video != '' and video != args.video: continue # if true, call track_vos if vos_enable: # multi-object tracking is enabled when the test data is ['DAVIS2017', 'ytb_vos'] iou_list, speed = track_vos( model, dataset[video], cfg['hp'] if 'hp' in cfg.keys() else None, args.mask, args.refine, args.dataset in ['DAVIS2017', 'ytb_vos'], device=device) iou_lists.append(iou_list) # otherwise, call track_vot else: lost, speed = track_vot(model, dataset[video], cfg['hp'] if 'hp' in cfg.keys() else None, args.mask, args.refine, device=device) total_lost += lost speed_list.append(speed) # report final result if vos_enable: for thr, iou in zip(thrs, np.mean(np.concatenate(iou_lists), axis=0)): logger.info('Segmentation Threshold {:.2f} mIoU: {:.3f}'.format( thr, iou)) else: logger.info('Total Lost: {:d}'.format(total_lost)) logger.info('Mean Speed: {:.2f} FPS'.format(np.mean(speed_list)))
def train(): init_log('global', logging.INFO) logger = logging.getLogger("global") if args.img_dim == 300: cfg = (FEDet_VOC_300, FEDet_COCO_300)[args.dataset == 'COCO'] else: cfg = (FEDet_VOC_512, FEDet_COCO_512)[args.dataset == 'COCO'] if args.use_dataAug: train_transform = SSDAugmentation(cfg['min_dim'], MEANS) else: train_transform = Augmentation(cfg['min_dim'], MEANS) if args.dataset == 'COCO': if args.dataset_root == VOC_ROOT: if not os.path.exists(COCO_ROOT): parser.error('Must specify dataset_root if specifying dataset') logger.warning( "WARNING: Using default COCO dataset_root because " + "--dataset_root was not specified.") args.dataset_root = COCO_ROOT dataset = COCODetection(root=args.dataset_root, image_sets=[("2017", "train")], transform=train_transform, target_transform=COCOAnnotationTransform(), aux=args.use_aux) elif args.dataset == 'VOC': if args.dataset_root == COCO_ROOT: parser.error('Must specify dataset if specifying dataset_root') args.dataset_root = VOC_ROOT dataset = VOCDetection(root=args.dataset_root, image_sets=[('2007', 'trainval'), ('2012', 'trainval')], transform=train_transform, aux=args.use_aux) if not os.path.exists(args.save_folder): os.makedirs(args.save_folder) if args.visdom: import visdom viz = visdom.Visdom() if args.arch == 'FEDet': build_net = build_fedet(cfg, 'train', cfg['min_dim'], cfg['num_classes']) else: logger.error('architecture error!!!') return net = build_net logger.info(net) logger.info('---------config-----------') logger.info(cfg) if args.cuda: net = torch.nn.DataParallel(build_net) cudnn.benchmark = True if args.resume: logger.info('Resuming training, loading {}...'.format(args.resume)) build_net.load_weights(args.resume) else: vgg_weights = torch.load(args.pretrained_model + args.basenet) logger.info('Loading base network...') build_net.vgg.load_state_dict(vgg_weights) if not args.resume: logger.info('Initializing weights...') def weights_init(m): for key in m.state_dict(): if key.split('.')[-1] == 'weight': if 'conv' in key: init.kaiming_normal_(m.state_dict()[key], mode='fan_out') if 'bn' in key: m.state_dict()[key][...] = 1 elif key.split('.')[-1] == 'bias': m.state_dict()[key][...] = 0 # initialize newly added layers' weights with xavier method build_net.extras.apply(weights_init) build_net.loc.apply(weights_init) build_net.conf.apply(weights_init) if args.cuda: net.cuda() cudnn.benchmark = True optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) criterion1 = MultiBoxLoss(cfg, 0.5, True, 0, True, 3, 0.5, False, args.cuda) criterion2 = nn.BCELoss(size_average=True).cuda() net.train() # loss counters loc_loss = 0 conf_loss = 0 ssm_loss = 0 ## SSM loss counter epoch = 0 logger.info('Loading the dataset...') epoch_size = len(dataset) // args.batch_size logger.info('Training FEDet on: %s' % dataset.name) logger.info('Training images size: %d' % len(dataset)) logger.info('Using the specified args:') logger.info(args) step_index = 0 if args.visdom: vis_title = 'SSD.PyTorch on ' + dataset.name vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] iter_plot = create_vis_plot(viz, 'Iteration', 'Loss', vis_title, vis_legend) epoch_plot = create_vis_plot(viz, 'Epoch', 'Loss', vis_title, vis_legend) data_loader = data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, shuffle=True, collate_fn=detection_collate_fedet if args.use_aux else detection_collate, pin_memory=True) start_training_time = time.time() # create batch iterator batch_iterator = iter(data_loader) for iteration in range(args.start_iter, cfg['max_iter']): if iteration != 0 and (iteration % epoch_size == 0): epoch += 1 if args.visdom and iteration != 0 and (iteration % epoch_size == 0): update_vis_plot(viz, epoch, loc_loss, conf_loss, epoch_plot, None, 'append', epoch_size) # reset epoch loss counters loc_loss = 0 conf_loss = 0 ssm_loss = 0 if iteration in cfg['lr_steps']: step_index += 1 adjust_learning_rate(optimizer, args.gamma, step_index) # load train data try: if args.use_aux: images, targets, aux_targets = next(batch_iterator) else: images, targets = next(batch_iterator) except StopIteration: batch_iterator = iter(data_loader) if args.use_aux: images, targets, aux_targets = next(batch_iterator) else: images, targets = next(batch_iterator) if images.size(0) < args.batch_size: continue if args.cuda: images = Variable(images.cuda()) targets = [Variable(ann.cuda()) for ann in targets] if args.use_aux: aux_targets = Variable(aux_targets.cuda()) else: images = Variable(images) targets = [Variable(ann) for ann in targets] if args.use_aux: aux_targets = Variable(aux_targets) # forward t0 = time.time() assert images.size(2) == args.img_dim and images.size(3) == args.img_dim out = net(images) # backprop optimizer.zero_grad() if args.use_aux: loss_loc, loss_cls = criterion1(out[2:], targets) loss_ssm1 = criterion2(out[0], aux_targets) loss_ssm2 = criterion2(out[1], aux_targets) loss = loss_loc + loss_cls + loss_ssm1.double() + loss_ssm2.double() loss.backward() optimizer.step() t1 = time.time() loc_loss = loss_loc.item() conf_loss = loss_cls.item() ssm_loss = loss_ssm1.item() + loss_ssm2.item() else: loss_loc, loss_cls = criterion1(out, targets) loss = loss_loc + loss_cls loss.backward() optimizer.step() t1 = time.time() loc_loss = loss_loc.item() conf_loss = loss_cls.item() ssm_loss = 0 if iteration % 10 == 0: logger.info( 'iter ' + repr(iteration) + '/' + str(cfg['max_iter']) + ' || epoch: ' + str(epoch + 1) + ' || LR: ' + repr(optimizer.param_groups[0]['lr']) + ' || total loss: %.4f || loc Loss: %.4f || conf Loss: %.4f || SSM loss: %.4f || ' % (loss.item(), loc_loss, conf_loss, ssm_loss) + 'timer: %.4f sec.' % (t1 - t0)) if args.visdom: update_vis_plot(viz, iteration, loss_loc.item(), loss_cls.item(), iter_plot, epoch_plot, 'append') if iteration != 0 and iteration % 10000 == 0: logger.info('Saving state, iter: %d' % iteration) ckpt_path = os.path.join( args.save_folder, args.arch + str(args.img_dim) + '_' + str(args.dataset) + '_' + str(iteration) + '.pth') torch.save(build_net.state_dict(), ckpt_path) torch.save(build_net.state_dict(), os.path.join(args.save_folder, 'models.pth')) total_training_time = time.time() - start_training_time total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time : {} ".format(total_time_str))
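# The SSD-style loop above calls adjust_learning_rate(optimizer, args.gamma,
# step_index) at each lr_steps milestone. The helper itself is not shown; it is
# assumed to implement the usual step decay, sketched here (base_lr stands in
# for args.lr).
def adjust_learning_rate(optimizer, gamma, step_index, base_lr=1e-3):
    lr = base_lr * (gamma ** step_index)   # e.g. gamma=0.1 divides the lr by 10
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr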
import torch.nn as nn import torch.backends.cudnn as cudnn from torch.optim.lr_scheduler import MultiStepLR from torch.utils.data import WeightedRandomSampler from models.build_model import ModelBuilder from datasets.fld_dataset import FLDDS, TransformBuilder from losses.wing_loss import WingLoss, SmoothWingLoss, WiderWingLoss, NormalizedWiderWingLoss, L2Loss, EuclideanLoss, NMELoss, LaplacianLoss from utils.log_helper import init_log from utils.vis_utils import save_result_imgs, save_result_nmes, save_result_lmks from utils.vis_utils import get_logger, add_scalar, get_model_graph, CsvHelper from utils.misc import save_checkpoint, print_speed, load_model, get_checkpoints from utils.metrics import MiscMeter, eval_NME from utils.imutils import refine_300W_landmarks init_log('FLD') logger = logging.getLogger('FLD') class FLD(object): def __init__(self, task_config): self.config = EasyDict(task_config) cudnn.benchmark = True self._build() def train(self): config = self.config.train_param num_kpts = config.num_kpts lr_scheduler = self.lr_scheduler train_loader = self.train_loader model = self.model
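# Of the losses imported above, WingLoss is the least standard. The sketch below
# follows the published wing-loss definition (Feng et al., CVPR 2018) and is only
# an assumption about what losses/wing_loss.py implements; the real class may use
# different defaults or reductions.
import math

import torch
import torch.nn as nn


class WingLoss(nn.Module):
    """Wing loss: log-shaped near zero, L1-shaped for large errors."""

    def __init__(self, w=10.0, epsilon=2.0):
        super().__init__()
        self.w = w
        self.epsilon = epsilon
        # Constant that makes the two pieces join continuously at |x| = w.
        self.C = w - w * math.log(1.0 + w / epsilon)

    def forward(self, pred, target):
        diff = (pred - target).abs()
        loss = torch.where(diff < self.w,
                           self.w * torch.log(1.0 + diff / self.epsilon),
                           diff - self.C)
        return loss.mean()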
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) print("Init logger") logger = logging.getLogger('global') print(44) #logger.info("\n" + collect_env_info()) print(99) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) print(2) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) print(3) path = "/usr4/alg504/cliao25/siammask/experiments/siammask_base/snapshot/checkpoint_e{}.pth" for epoch in range(1,21): if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, anchors=cfg['anchors']) else: exit() print(4) if args.pretrained: model = load_pretrain(model, args.pretrained) model = model.cuda() #model.features.unfix((epoch - 1) / 20) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, epoch) filepath = path.format(epoch) assert os.path.isfile(filepath) model, _, _, _, _ = restore_from(model, optimizer, filepath) #model = load_pretrain(model, filepath) model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() model.train() device = torch.device('cuda') model = model.to(device) valid(val_loader, model, cfg) print("Done")
# path for the tensorboard output board_path = cfg.meta["board_path"] experiment_path = cfg.meta["experiment_path"] experiment_name = cfg.meta["experiment_name"] arch = cfg.meta["arch"] # training parameters batch_size = cfg.train['batch_size'] epoches = cfg.train['epoches'] lr = cfg.train['lr'] # number of future frames num_frame = cfg.model['input_num'] # print freq print_freq = cfg.train['print_freq'] # initialize the logger global_logger = init_log('global', level=logging.INFO) add_file_handler("global", os.path.join(os.getcwd(), 'logs', '{}.log'.format(experiment_name)), level=logging.DEBUG) # print the cfg contents cfg.log_dict() # initialize the averager avg = AverageMeter() # cuda use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") torch.backends.cudnn.benchmark = True
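# AverageMeter objects appear throughout these scripts (avg above, and the
# train/val meters in several main() functions). A minimal sketch of the usual
# pattern; the real utils.average_meter_helper version may track several named
# fields at once.
class AverageMeter(object):
    """Track the most recent value and a running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)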
from omegaconf import OmegaConf from utils.loadConfig import load_cfg from utils.average_meter_helper import AverageMeter from utils.log_helper import init_log, add_file_handler, print_speed # get method from experiment.triplet_utils.get_loss import get_loss from experiment.triplet_utils.get_backbone import get_backbone from experiment.triplet_utils.get_optimizer import get_optimizer from experiment.triplet_utils.get_dataloader import get_train_dataloader # load model (an easier way to get the model) from experiment.triplet_utils.load_model import load_model_test # init logger logger = init_log("global") def validation(epoch, log_interval, test_dataloader, model, loss, writer, device): """Validate on the test dataset. Validation currently covers only the loss and the pos|neg distances. In the future we will add more metrics such as MAP5|10|50|100 (maybe in another file). Args: log_interval: How often the logger writes a log entry. test_dataloader: Must not be None; a triplet dataloader providing the validation data.
def main(): init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) params = {'penalty_k': args.penalty_k, 'window_influence': args.window_influence, 'lr': args.lr, 'instance_size': args.search_region} num_search = len(params['penalty_k']) * len(params['window_influence']) * \ len(params['lr']) * len(params['instance_size']) print(params) print(num_search) cfg = load_config(args) if args.arch == 'Custom': from custom import Custom model = Custom(anchors=cfg['anchors']) else: model = models.__dict__[args.arch](anchors=cfg['anchors']) if args.resume: assert isfile(args.resume), '{} is not a valid file'.format(args.resume) model = load_pretrain(model, args.resume) model.eval() model = model.to(device) default_hp = cfg.get('hp', {}) p = dict() p['network'] = model p['network_name'] = args.arch+'_'+args.resume.split('/')[-1].split('.')[0] p['dataset'] = args.dataset global ims, gt, image_files dataset_info = load_dataset(args.dataset) videos = list(dataset_info.keys()) np.random.shuffle(videos) for video in videos: print(video) if isfile('finish.flag'): return p['video'] = video ims = None image_files = dataset_info[video]['image_files'] gt = dataset_info[video]['gt'] np.random.shuffle(params['penalty_k']) np.random.shuffle(params['window_influence']) np.random.shuffle(params['lr']) for penalty_k in params['penalty_k']: for window_influence in params['window_influence']: for lr in params['lr']: for instance_size in params['instance_size']: p['hp'] = default_hp.copy() p['hp'].update({'penalty_k':penalty_k, 'window_influence':window_influence, 'lr':lr, 'instance_size': instance_size, }) tune(p)
from PIL import Image from PIL import ImageFile from torchvision import transforms from tqdm import tqdm from torch.utils.data import DataLoader from network import net from network import styler2 from sampler import InfiniteSamplerWrapper from torchvision.utils import save_image import time import logging from utils.log_helper import init_log from torch.autograd import Variable import mmcv init_log('global', logging.INFO) logger = logging.getLogger('global') cudnn.benchmark = True Image.MAX_IMAGE_PIXELS = None # Disable DecompressionBombError ImageFile.LOAD_TRUNCATED_IMAGES = True # Disable OSError: image file is truncated def adjust_learning_rate(optimizer, iteration_count): """Imitating the original implementation""" lr = args.lr / (1.0 + args.lr_decay * iteration_count) for param_group in optimizer.param_groups: param_group['lr'] = lr
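# adjust_learning_rate above implements inverse-time decay,
# lr = base_lr / (1 + lr_decay * iteration). A short worked check with
# illustrative values (the script's actual args.lr and args.lr_decay are not shown):
base_lr, lr_decay = 1e-4, 5e-5
for it in (0, 10000, 100000):
    print(it, base_lr / (1.0 + lr_decay * it))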
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() args = args_process(args) init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) args.img_size = int(cfg['train_datasets']['search_size']) args.nms_threshold = float(cfg['train_datasets']['RPN_NMS']) if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, opts=args, anchors=train_loader.dataset.anchors) else: exit() logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) model = model.cuda() dist_model = torch.nn.DataParallel(model, list(range( torch.cuda.device_count()))).cuda() if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # optionally resume from a checkpoint if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from( model, optimizer, args.resume) dist_model = torch.nn.DataParallel( model, list(range(torch.cuda.device_count()))).cuda() logger.info(lr_scheduler) logger.info('model prepare done') train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) logger.info("\n" + collect_env_info()) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, anchors=cfg['anchors']) else: model = models.__dict__[args.arch](anchors=cfg['anchors']) logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) model = model.cuda() dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) logger.info(lr_scheduler) # optionally resume from a checkpoint if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from(model, optimizer, args.resume) dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() epoch = args.start_epoch if dist_model.module.features.unfix(epoch/args.epochs): logger.info('unfix part model.') optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg, args, epoch) lr_scheduler.step(epoch) cur_lr = lr_scheduler.get_cur_lr() logger.info('epoch:{} resume lr {}'.format(epoch, cur_lr)) logger.info('model prepare done') train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def main(): # init logger init_log('global', args.save_dir, logging.INFO) logger = logging.getLogger('global') # print arguments for arg in vars(args): logger.info("{}: {}".format(arg, getattr(args, arg))) # get device device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # build dataloader and model train_loader, test_loader = build_nyu_dataloader(args.dataset_dir) opts = {"L": 5, "k": 12, "bn": True} model = D3(opts) # check GPU numbers and deploy parallel # parallel = False # if torch.cuda.device_count() > 1: # parallel = True # logger.info("Let's use {:d} GPUs!".format(torch.cuda.device_count())) # # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs # model = nn.DataParallel(model) model.to(device) logger.info("*" * 40) logger.info(model) logger.info("*" * 40) # optimizer settings optimizer = optim.Adam(model.parameters(), lr=args.lr) # optionally resume from a checkpoint # if args.resume: # if os.path.isfile(args.resume): # model, _, args.start_epoch = restore_from(model, optimizer, args.resume) # keep the best model; abs_rel is an error metric, so lower is better best_model_wts = copy.deepcopy(model.state_dict()) best_abs_rel = float('inf') logger.info("Start training...") # epoches = args.batches // train_loader.__len__() for epoch in range(args.epoches): for g in optimizer.param_groups: g['lr'] = args.lr * (1 - args.lr_decay)**(epoch // args.lr_decay_step) writer.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch) t0 = time.time() train_one_epoch(train_loader, model, optimizer, device, epoch) t1 = time.time() if epoch % args.test_rate == 0: test_abs_rel = test_one_epoch(test_loader, model, device, epoch) if test_abs_rel < best_abs_rel: best_abs_rel = test_abs_rel best_model_wts = copy.deepcopy(model.state_dict()) torch.cuda.empty_cache() if epoch % args.test_rate == 0: filename = os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch + 1)) save_checkpoint( { 'epoch': epoch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict() }, is_best=False, filename=filename) logger.info("Saved model : {}".format(filename)) print_speed(epoch, t1 - t0, args.epoches) save_checkpoint( { 'batch_num': epoch, 'state_dict': best_model_wts, 'optimizer': optimizer.state_dict() }, is_best=True, filename=os.path.join(args.save_dir, 'model_best.pth')) writer.close()
def main(): logger = logging.getLogger('global') global criterion_xent, criterion_triplet, criterion_center if os.path.exists(cfg.TRAIN.LOG_DIR): shutil.rmtree(cfg.TRAIN.LOG_DIR) os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) # log add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) summary_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) # visualise dataset, train_loader, _, _ = build_data_loader() model = BagReID_RESNET(dataset.num_train_bags) criterion_xent = CrossEntropyLabelSmooth(dataset.num_train_bags, use_gpu=cfg.CUDA) criterion_triplet = TripletLoss(margin=cfg.TRAIN.MARGIN) criterion_center = CenterLoss(dataset.num_train_bags, cfg.MODEL.GLOBAL_FEATS + cfg.MODEL.PART_FEATS, use_gpu=cfg.CUDA) if cfg.TRAIN.OPTIM == "sgd": optimizer = torch.optim.SGD(model.parameters(), lr=cfg.SOLVER.LEARNING_RATE, momentum=cfg.SOLVER.MOMENTUM, weight_decay=cfg.SOLVER.WEIGHT_DECAY) else: optimizer = torch.optim.Adam(model.parameters(), lr=cfg.SOLVER.LEARNING_RATE, weight_decay=cfg.SOLVER.WEIGHT_DECAY) center_optimizer = torch.optim.SGD(criterion_center.parameters(), lr=cfg.SOLVER.LEARNING_RATE_CENTER) optimizers = [optimizer, center_optimizer] schedulers = build_lr_schedulers(optimizers) if cfg.CUDA: model.cuda() if torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model, device_ids=cfg.DEVICES) logger.info("model prepare done") # start training for epoch in range(cfg.TRAIN.NUM_EPOCHS): train(epoch, train_loader, model, criterion, optimizers, summary_writer) for scheduler in schedulers: scheduler.step() # skip if not save model if cfg.TRAIN.EVAL_STEP > 0 and (epoch + 1) % cfg.TRAIN.EVAL_STEP == 0 \ or (epoch + 1) == cfg.TRAIN.NUM_EPOCHS: if cfg.CUDA and torch.cuda.device_count() > 1: state_dict = model.module.state_dict() else: state_dict = model.state_dict() save_checkpoint({ 'state_dict': state_dict, 'epoch': epoch + 1 }, is_best=False, save_dir=cfg.TRAIN.SNAPSHOT_DIR, filename='checkpoint_ep' + str(epoch + 1) + '.pth')
def main(): """ Training of the base network. :return: """ global args, best_acc, tb_writer, logger args = parser.parse_args() # initialize logging init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) # get the logger logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) # load the configuration cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build the dataset train_loader, val_loader = build_data_loader(cfg) # load the training network if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, anchors=cfg['anchors']) else: exit() logger.info(model) # load the pretrained weights if args.pretrained: model = load_pretrain(model, args.pretrained) # GPU version # model = model.cuda() # dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() # wrap the network model dist_model = torch.nn.DataParallel(model) # fraction of the backbone to unfix when resuming if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) # build the optimizer and learning-rate schedule optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # optionally resume from a checkpoint (load the model) if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from( model, optimizer, args.resume) # GPU # dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() dist_model = torch.nn.DataParallel(model) logger.info(lr_scheduler) logger.info('model prepare done') # train the model train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def main(): global args, best_acc, tb_writer, logger args = parser.parse_args() init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # build dataset train_loader, val_loader = build_data_loader(cfg) if args.arch == 'Custom': model = Custom(anchors=cfg['anchors']) elif args.arch == 'Custom_Sky': model = Custom_Sky(anchors=cfg['anchors']) else: exit() logger.info(model) if args.pretrained: model = load_pretrain(model, args.pretrained) # print(summary(model=model, input_size=(3, 511, 511), batch_size=1)) model = model.cuda() dist_model = torch.nn.DataParallel(model, list(range( torch.cuda.device_count()))).cuda() if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # optionally resume from a checkpoint if args.resume: print(args.resume) assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from( model, optimizer, args.resume) dist_model = torch.nn.DataParallel( model, list(range(torch.cuda.device_count()))).cuda() logger.info(lr_scheduler) logger.info('model prepare done') train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def main(args): cfg_from_file(args.config) cfg.save_name = args.save_name cfg.save_path = args.save_path cfg.resume_file = args.resume_file cfg.config = args.config cfg.batch_size = args.batch_size cfg.num_workers = args.num_workers save_path = join(args.save_path, args.save_name) if not exists(save_path): makedirs(save_path) resume_file = args.resume_file init_log('global', logging.INFO) add_file_handler('global', os.path.join(save_path, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) start_epoch = 0 model = ModelBuilder().cuda() if cfg.backbone.pretrained: load_pretrain(model.backbone, join('pretrained_net', cfg.backbone.pretrained)) train_dataset = Datasets() val_dataset = Datasets(is_train=False) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=False, drop_last=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=False, drop_last=True) if resume_file: if isfile(resume_file): logger.info("=> loading checkpoint '{}'".format(resume_file)) model, start_epoch = restore_from(model, resume_file) start_epoch = start_epoch + 1 for i in range(start_epoch): train_loader.dataset.shuffle() logger.info("=> loaded checkpoint '{}' (epoch {})".format( resume_file, start_epoch - 1)) else: logger.info("=> no checkpoint found at '{}'".format(resume_file)) ngpus = torch.cuda.device_count() is_dataparallel = False if ngpus > 1: model = torch.nn.DataParallel(model, list(range(ngpus))).cuda() is_dataparallel = True if is_dataparallel: optimizer, lr_scheduler = build_opt_lr(model.module, start_epoch) else: optimizer, lr_scheduler = build_opt_lr(model, start_epoch) logger.info(lr_scheduler) logger.info("model prepare done") if args.log: writer = SummaryWriter(comment=args.save_name) for epoch in range(start_epoch, cfg.train.epoch): train_loader.dataset.shuffle() if (epoch == np.array(cfg.backbone.unfix_steps) ).sum() > 0 or epoch == cfg.train.pretrain_epoch: if is_dataparallel: optimizer, lr_scheduler = build_opt_lr(model.module, epoch) else: optimizer, lr_scheduler = build_opt_lr(model, epoch) lr_scheduler.step(epoch) record_dict_train = train(train_loader, model, optimizer, epoch) record_dict_val = validate(val_loader, model, epoch) message = 'Train Epoch: [{0}]\t'.format(epoch) for k, v in record_dict_train.items(): message += '{name:s} {loss:.4f}\t'.format(name=k, loss=v) logger.info(message) message = 'Val Epoch: [{0}]\t'.format(epoch) for k, v in record_dict_val.items(): message += '{name:s} {loss:.4f}\t'.format(name=k, loss=v) logger.info(message) if args.log: for k, v in record_dict_train.items(): writer.add_scalar('train/' + k, v, epoch) for k, v in record_dict_val.items(): writer.add_scalar('val/' + k, v, epoch) if is_dataparallel: save_checkpoint( { 'epoch': epoch, 'state_dict': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'cfg': cfg }, epoch, save_path) else: save_checkpoint( { 'epoch': epoch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'cfg': cfg }, epoch, save_path)
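# Several scripts above (this one and the re-ID trainers) branch on whether the
# model is wrapped in DataParallel before saving its state_dict. A small helper
# capturing that pattern; the name is an illustration, not a function these
# repositories define.
def unwrap_state_dict(model):
    module = model.module if hasattr(model, 'module') else model
    return module.state_dict()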