def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()  # args obtained from command-line parsing

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')  # instantiate a logger
    logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))  # dump config as JSON, indented 4 spaces

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    if args.arch == 'Custom':
        from custom import Custom
        model = Custom(pretrain=True, anchors=cfg['anchors'])
    else:
        exit()
    logger.info(model)

    if args.pretrained:
        model = load_pretrain(model, args.pretrained)

    model = model.cuda()  # move the model to the GPU
    dist_model = torch.nn.DataParallel(
        model, list(range(torch.cuda.device_count()))).cuda()  # multi-GPU training

    if args.resume and args.start_epoch != 0:
        # gradually unfreeze backbone layers in proportion to training progress
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    # build the optimizer and learning-rate schedule
    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(
            model, optimizer, args.resume)
        dist_model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()

    logger.info(lr_scheduler)
    logger.info('model prepare done')

    train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
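# --------------------------------------------------------------------------
# The variant above calls model.features.unfix(ratio) when resuming, which
# progressively unfreezes backbone layers as training advances. A minimal
# sketch of that idea follows; the layer names, schedule, and thresholds are
# illustrative assumptions, not SiamMask's actual API.
import torch

class FeatureExtractorSketch(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Conv2d(3, 64, 7)
        self.layer2 = torch.nn.Conv2d(64, 128, 3)
        # fraction of total training after which each layer becomes trainable (assumed)
        self.unfix_schedule = [(0.0, self.layer1), (0.5, self.layer2)]

    def unfix(self, ratio):
        """Unfreeze layers whose threshold has been reached.

        `ratio` = current_epoch / total_epochs. Returns True if the set of
        trainable parameters changed, so the caller can rebuild the optimizer.
        """
        changed = False
        for threshold, layer in self.unfix_schedule:
            should_train = ratio >= threshold
            for p in layer.parameters():
                if p.requires_grad != should_train:
                    p.requires_grad = should_train
                    changed = True
        return changed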
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()

    init_log('global', logging.INFO)  # returns a logger; logging.INFO is the log level
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')  # fetch the logger initialized above
    logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)  # returns the merged/overridden config object
    # json.loads() parses a str into a dict; json.dumps() serializes a dict into a str.
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    if args.arch == 'Custom':
        from custom import Custom
        model = Custom(pretrain=True, anchors=cfg['anchors'])
    else:
        exit()
    logger.info(model)

    if args.pretrained:
        model = load_pretrain(model, args.pretrained)

    model = model.cuda()
    dist_model = torch.nn.DataParallel(
        model, list(range(torch.cuda.device_count()))).cuda()

    if args.resume and args.start_epoch != 0:
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(
            model, optimizer, args.resume)
        dist_model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()

    logger.info(lr_scheduler)
    logger.info('model prepare done')

    train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
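# --------------------------------------------------------------------------
# `Dummy()` above stands in for SummaryWriter when no log_dir is given, so the
# training loop can call tb_writer methods unconditionally. The real helper
# lives in the repo's log utilities; this is an assumed minimal equivalent
# that swallows any method call.
class Dummy(object):
    def __getattr__(self, name):
        # Return a function that accepts anything and does nothing, so calls
        # like tb_writer.add_scalar(...) become safe no-ops.
        def no_op(*args, **kwargs):
            pass
        return no_op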
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')
    logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')
    logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    if args.arch == 'Custom':
        model = Custom(anchors=cfg['anchors'])
    elif args.arch == 'Custom_Sky':
        model = Custom_Sky(anchors=cfg['anchors'])
    else:
        exit()
    logger.info(model)

    if args.pretrained:
        model = load_pretrain(model, args.pretrained)
    # print(summary(model=model, input_size=(3, 511, 511), batch_size=1))

    model = model.cuda()
    dist_model = torch.nn.DataParallel(
        model, list(range(torch.cuda.device_count()))).cuda()

    if args.resume and args.start_epoch != 0:
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)

    # optionally resume from a checkpoint
    if args.resume:
        print(args.resume)
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(
            model, optimizer, args.resume)
        dist_model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()

    logger.info(lr_scheduler)
    logger.info('model prepare done')

    train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
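# --------------------------------------------------------------------------
# Several variants call load_pretrain(model, path) before wrapping the model
# in DataParallel. A common implementation of that pattern is sketched below,
# under the assumption that it loads a checkpoint, strips any 'module.'
# prefix left by DataParallel, and copies only matching tensors; the repo's
# own helper may differ.
import torch

def load_pretrain_sketch(model, pretrained_path):
    checkpoint = torch.load(pretrained_path, map_location='cpu')
    state_dict = checkpoint.get('state_dict', checkpoint)
    # strip the 'module.' prefix added when a model was saved under DataParallel
    state_dict = {(k[len('module.'):] if k.startswith('module.') else k): v
                  for k, v in state_dict.items()}
    model_dict = model.state_dict()
    # keep only tensors whose name and shape match the target model
    filtered = {k: v for k, v in state_dict.items()
                if k in model_dict and v.shape == model_dict[k].shape}
    model_dict.update(filtered)
    model.load_state_dict(model_dict)
    return model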
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))
    logger.info("\n" + collect_env_info())

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    if args.arch == 'Custom':
        from custom import Custom
        model = Custom(pretrain=True, anchors=cfg['anchors'])
    else:
        model = models.__dict__[args.arch](anchors=cfg['anchors'])
    logger.info(model)

    if args.pretrained:
        model = load_pretrain(model, args.pretrained)

    model = model.cuda()
    dist_model = torch.nn.DataParallel(
        model, list(range(torch.cuda.device_count()))).cuda()

    if args.resume and args.start_epoch != 0:
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)
    logger.info(lr_scheduler)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(
            model, optimizer, args.resume)
        dist_model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()
        epoch = args.start_epoch
        if dist_model.module.features.unfix(epoch / args.epochs):
            logger.info('unfix part model.')
            optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg, args, epoch)
        lr_scheduler.step(epoch)
        cur_lr = lr_scheduler.get_cur_lr()
        logger.info('epoch:{} resume lr {}'.format(epoch, cur_lr))

    logger.info('model prepare done')

    train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
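# --------------------------------------------------------------------------
# The resume path above calls lr_scheduler.step(epoch) and get_cur_lr();
# these are methods of the repo's custom scheduler, not torch.optim's
# built-ins. A hedged sketch of one plausible implementation follows — a
# log-spaced per-epoch decay; class name, default rates, and policy are
# illustrative assumptions.
import numpy as np

class LogDecayLRSchedulerSketch(object):
    def __init__(self, optimizer, start_lr=0.005, end_lr=0.0005, epochs=20):
        self.optimizer = optimizer
        # one learning rate per epoch, log-spaced from start_lr to end_lr (assumed policy)
        self.lr_spaces = np.logspace(np.log10(start_lr), np.log10(end_lr), epochs)
        self.cur_lr = start_lr

    def step(self, epoch):
        # apply this epoch's learning rate to every parameter group
        self.cur_lr = float(self.lr_spaces[epoch])
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.cur_lr

    def get_cur_lr(self):
        return self.cur_lr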
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()
    args = args_process(args)

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')
    logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    args.img_size = int(cfg['train_datasets']['search_size'])
    args.nms_threshold = float(cfg['train_datasets']['RPN_NMS'])

    if args.arch == 'Custom':
        from custom import Custom
        model = Custom(pretrain=True, opts=args, anchors=train_loader.dataset.anchors)
    else:
        exit()
    logger.info(model)

    if args.pretrained:
        model = load_pretrain(model, args.pretrained)

    model = model.cuda()
    dist_model = torch.nn.DataParallel(
        model, list(range(torch.cuda.device_count()))).cuda()

    if args.resume and args.start_epoch != 0:
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(
            model, optimizer, args.resume)
        dist_model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()

    logger.info(lr_scheduler)
    logger.info('model prepare done')

    global cur_lr
    if not os.path.exists(args.save_dir):  # makedir/save model
        os.makedirs(args.save_dir)

    num_per_epoch = len(train_loader.dataset) // args.batch
    num_per_epoch_val = len(val_loader.dataset) // args.batch

    for epoch in range(args.start_epoch, args.epochs):
        lr_scheduler.step(epoch)
        cur_lr = lr_scheduler.get_cur_lr()
        logger = logging.getLogger('global')
        train_avg = AverageMeter()
        val_avg = AverageMeter()

        if dist_model.module.features.unfix(epoch / args.epochs):
            logger.info('unfix part model.')
            optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg, args, epoch)

        train(train_loader, dist_model, optimizer, lr_scheduler, epoch, cfg,
              train_avg, num_per_epoch)

        if dist_model.module.features.unfix(epoch / args.epochs):
            logger.info('unfix part model.')
            optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg, args, epoch)

        if (epoch + 1) % args.save_freq == 0:
            save_checkpoint({
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': dist_model.module.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'anchor_cfg': cfg['anchors']
            }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))

        validation(val_loader, dist_model, epoch, cfg, val_avg, num_per_epoch_val)
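# --------------------------------------------------------------------------
# The loop above calls save_checkpoint(state, is_best, filename, best_file).
# The repo's helper is not shown here; the sketch below follows the usual
# PyTorch recipe (torch.save plus a copy for the best model) and is an
# assumption about its behavior, not the author's exact code.
import shutil
import torch

def save_checkpoint_sketch(state, is_best, filename, best_file):
    # persist the full training state so restore_from() can resume later
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-performing checkpoint
        shutil.copyfile(filename, best_file)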
def main(): """ 基础网络的训练 :return: """ global args, best_acc, tb_writer, logger args = parser.parse_args() # 初始化日志信息 init_log('global', logging.INFO) if args.log != "": add_file_handler('global', args.log, logging.INFO) # 获取log信息 logger = logging.getLogger('global') logger.info("\n" + collect_env_info()) logger.info(args) # 获取配置信息 cfg = load_config(args) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) if args.log_dir: tb_writer = SummaryWriter(args.log_dir) else: tb_writer = Dummy() # 构建数据集 train_loader, val_loader = build_data_loader(cfg) # 加载训练网络 if args.arch == 'Custom': from custom import Custom model = Custom(pretrain=True, anchors=cfg['anchors']) else: exit() logger.info(model) # 加载预训练网络 if args.pretrained: model = load_pretrain(model, args.pretrained) # GPU版本 # model = model.cuda() # dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() # 网络模型 dist_model = torch.nn.DataParallel(model) # 模型参数的更新比例 if args.resume and args.start_epoch != 0: model.features.unfix((args.start_epoch - 1) / args.epochs) # 获取优化器和学习率的更新策略 optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch) # optionally resume from a checkpoint 加载模型 if args.resume: assert os.path.isfile(args.resume), '{} is not a valid file'.format( args.resume) model, optimizer, args.start_epoch, best_acc, arch = restore_from( model, optimizer, args.resume) # GPU # dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda() dist_model = torch.nn.DataParallel(model) logger.info(lr_scheduler) logger.info('model prepare done') # 模型训练 train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)
    print("Init logger")

    logger = logging.getLogger('global')
    # logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    path = "/usr4/alg504/cliao25/siammask/experiments/siammask_base/snapshot/checkpoint_e{}.pth"

    # validate each of the 20 saved snapshots in turn
    for epoch in range(1, 21):
        if args.arch == 'Custom':
            from custom import Custom
            model = Custom(pretrain=True, anchors=cfg['anchors'])
        else:
            exit()

        if args.pretrained:
            model = load_pretrain(model, args.pretrained)

        model = model.cuda()
        # model.features.unfix((epoch - 1) / 20)
        optimizer, lr_scheduler = build_opt_lr(model, cfg, args, epoch)

        filepath = path.format(epoch)
        assert os.path.isfile(filepath)
        model, _, _, _, _ = restore_from(model, optimizer, filepath)
        # model = load_pretrain(model, filepath)
        model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()

        model.train()
        device = torch.device('cuda')
        model = model.to(device)

        valid(val_loader, model, cfg)

    print("Done")
def main():
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()
    args = args_process(args)

    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    logger = logging.getLogger('global')
    logger.info("\n" + collect_env_info())
    logger.info(args)

    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # build dataset
    train_loader, val_loader = build_data_loader(cfg)

    args.img_size = int(cfg['train_datasets']['search_size'])
    args.nms_threshold = float(cfg['train_datasets']['RPN_NMS'])

    if args.arch == 'Custom':
        from custom import Custom
        model = Custom(pretrain=True, opts=args, anchors=train_loader.dataset.anchors)
    else:
        exit()
    logger.info(model)

    if args.pretrained:
        model = load_pretrain(model, args.pretrained)

    model = model.cuda()
    dist_model = torch.nn.DataParallel(
        model, list(range(torch.cuda.device_count()))).cuda()

    if args.resume and args.start_epoch != 0:
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(
            model, optimizer, args.resume)
        dist_model = torch.nn.DataParallel(
            model, list(range(torch.cuda.device_count()))).cuda()

    logger.info(lr_scheduler)
    logger.info('model prepare done')

    train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)