def main(): rank, world_size = dist_init() # rank = 0 logger.info("init done") # load cfg cfg.merge_from_file(args.cfg) if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # create model model = ModelBuilder().train() dist_model = nn.DataParallel(model).cuda() # load pretrained backbone weights if cfg.BACKBONE.PRETRAINED: cur_path = os.path.dirname(os.path.realpath(__file__)) backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED) load_pretrain(model.backbone, backbone_path) # create tensorboard writer if rank == 0 and cfg.TRAIN.LOG_DIR: tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: tb_writer = None # build dataset loader train_loader = build_data_loader() # build optimizer and lr_scheduler optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg.TRAIN.START_EPOCH) # resume training if cfg.TRAIN.RESUME: logger.info("resume from {}".format(cfg.TRAIN.RESUME)) assert os.path.isfile(cfg.TRAIN.RESUME), \ '{} is not a valid file.'.format(cfg.TRAIN.RESUME) model, optimizer, cfg.TRAIN.START_EPOCH = \ restore_from(model, optimizer, cfg.TRAIN.RESUME) # load pretrain elif cfg.TRAIN.PRETRAINED: load_pretrain(model, cfg.TRAIN.PRETRAINED) dist_model = nn.DataParallel(model) logger.info(lr_scheduler) logger.info("model prepare done") # start training train(train_loader, dist_model, optimizer, lr_scheduler, tb_writer)
def main(): rank, world_size = dist_init() logger.info("init done") # load cfg cfg.merge_from_file(args.cfg) if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # create model model = Template_Enhance().cuda().train() dist_model = DistModule(model) # load pretrained SiamRPN++ model if cfg.TSA.MODELBUILD_PATH: cur_path = os.path.dirname(os.path.realpath(__file__)) backbone_path = os.path.join(cur_path, '../', cfg.TSA.MODELBUILD_PATH) load_pretrain(model.model, backbone_path) # create tensorboard writer if rank == 0 and cfg.TRAIN.LOG_DIR: tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: tb_writer = None # build datasets loader train_loader, val_loader = build_data_loader() # build optimizer and lr_scheduler optimizer, scheduler = build_opt_lr(dist_model.module) logger.info(scheduler) logger.info("model prepare done") # start estimation # train(train_loader, dist_model, optimizer, scheduler, tb_writer) estimation = Estimator(train_loader, val_loader, dist_model, optimizer, scheduler, tb_writer) # estimation.evaluate(5000, 'loss') estimation.train()
def main(): rank, world_size = dist_init() logger.info("init done") # load cfg cfg.merge_from_file( args.cfg) #将core下面的config配置文件与experiments里面的配置文件融合,因实验不同,修改默认参数 if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): #在tools 路径下建立log日志文件家 os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) #添加到log日志里面 if cfg.TRAIN.LOG_DIR: add_file_handler( 'global', #在logs文件夹下建立log.txt文本记录控制台信息 os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # create model model = ModelBuilder().cuda().train() dist_model = DistModule(model) # load pretrained backbone weights if cfg.BACKBONE.PRETRAINED: cur_path = os.path.dirname(os.path.realpath(__file__)) backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED) load_pretrain(model.backbone, backbone_path) # create tensorboard writer if rank == 0 and cfg.TRAIN.LOG_DIR: tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: tb_writer = None # build dataset loader train_loader = build_data_loader() # build optimizer and lr_scheduler,使用SGD优化器(适合探索新的结构),以START_EPOCH为基准设置学习率,方便中断后再次训练 optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg.TRAIN.START_EPOCH) # resume training if cfg.TRAIN.RESUME: logger.info("resume from {}".format(cfg.TRAIN.RESUME)) assert os.path.isfile(cfg.TRAIN.RESUME), \ '{} is not a valid file.'.format(cfg.TRAIN.RESUME) model, optimizer, cfg.TRAIN.START_EPOCH = \ restore_from(model, optimizer, cfg.TRAIN.RESUME) # load pretrain elif cfg.TRAIN.PRETRAINED: load_pretrain(model, cfg.TRAIN.PRETRAINED) dist_model = DistModule(model) logger.info(lr_scheduler) logger.info("model prepare done") # start training train(train_loader, dist_model, optimizer, lr_scheduler, tb_writer)
def main(): rank, world_size = dist_init() logger.info("init done") # load cfg cfg.merge_from_file(args.cfg) if cfg.LATENTS.HP: cfg.TRAIN.BASE_LR = args.base_lr print('base_lr = {}'.format(cfg.TRAIN.BASE_LR)) cfg.LATENTS.AUGMENT_LAMBDA = args.aug_lambda print('aug_lambda = {}'.format(cfg.LATENTS.AUGMENT_LAMBDA )) cfg.TRAIN.THR_HIGH = args.thr_high print('threshold high: {}'.format(cfg.TRAIN.THR_HIGH)) if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # create model model = ModelBuilder().cuda().train() dist_model = DistModule(model) # load pretrained backbone weights if cfg.BACKBONE.PRETRAINED: cur_path = os.path.dirname(os.path.realpath(__file__)) backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED) load_pretrain_rpn_last(model, backbone_path) # fix the SiamRPN base model fixed_params, fixed_modules = get_layers_in_base_model(dist_model.module, cfg.RPN.TYPE) set_fixed_params(fixed_params, fixed_modules, True) # print_params_name(dist_model.module) # create tensorboard writer if rank == 0 and cfg.TRAIN.LOG_DIR: tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: tb_writer = None # build optimizer and lr_scheduler optimizer, lr_scheduler = build_opt_lr_latent_last(dist_model.module, cfg.TRAIN.START_EPOCH) # build dataset loader train_loader = build_data_loader() # resume training if cfg.TRAIN.RESUME: logger.info("resume from {}".format(cfg.TRAIN.RESUME)) assert os.path.isfile(cfg.TRAIN.RESUME), \ '{} is not a valid file.'.format(cfg.TRAIN.RESUME) model, optimizer, cfg.TRAIN.START_EPOCH = \ restore_from(model, optimizer, cfg.TRAIN.RESUME) dist_model = DistModule(model) # build optimizer and lr_scheduler optimizer, lr_scheduler = build_opt_lr_latent_last(dist_model.module, cfg.TRAIN.START_EPOCH) logger.info(lr_scheduler) logger.info("model prepare done") # start training train(train_loader, dist_model, optimizer, lr_scheduler, tb_writer)