def main():
    """Entry point: parse config, build model/optimizer/scheduler/data, then train.

    Side effects: creates the checkpoint directory and a ``train.log`` file
    inside it; moves the model and loss function onto the selected device.
    """
    # ===> load configuration from the config file
    cfg = parse_args()
    os.makedirs(cfg.train_options['checkpoint_save_dir'], exist_ok=True)
    logger = get_logger('torchocr',
                        log_file=os.path.join(cfg.train_options['checkpoint_save_dir'], 'train.log'))
    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device selection: fall back to CPU when CUDA is unavailable
    to_use_device = torch.device(
        train_options['device'] if torch.cuda.is_available() and ('cuda' in train_options['device']) else 'cpu')
    set_random_seed(cfg['SEED'], 'cuda' in train_options['device'], deterministic=True)

    # ===> build network
    net = build_model(cfg['model'])
    # ===> weight init and deployment to the target device
    net.apply(weight_init)
    # if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
    net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    # NOTE(review): the returned params are never passed to the optimizer below
    # (it trains net.parameters()); the call is kept because it may freeze
    # stages as a side effect — confirm against get_fine_tune_params().
    params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])

    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg['optimizer'])
    scheduler = build_scheduler(optimizer, cfg['lr_scheduler'])

    # ===> whether to resume from checkpoint
    resume_from = train_options['resume_from']
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(
            net, resume_from, to_use_device, optimizer,
            third_name=train_options['third_party_name'])
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        # fixed: was an f-string with no placeholders
        logger.info('net resume from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg['loss'])
    loss_func = loss_func.to(to_use_device)

    # ===> load the alphabet file and share it with both dataset configs
    with open(cfg.dataset.alphabet, 'r', encoding='utf-8') as file:
        cfg.dataset.alphabet = ''.join([s.strip('\n') for s in file.readlines()])

    # ===> data loaders
    cfg.dataset.train.dataset.alphabet = cfg.dataset.alphabet
    train_loader = build_dataloader(cfg.dataset.train)
    cfg.dataset.eval.dataset.alphabet = cfg.dataset.alphabet
    eval_loader = build_dataloader(cfg.dataset.eval)

    # ===> train
    train(net, optimizer, scheduler, loss_func, train_loader, eval_loader,
          to_use_device, cfg, global_state, logger)
def main():
    """Entry point for distillation training: build a Teacher/Student model,
    optionally resume, then hand everything to ``train``.
    """
    # ===> load configuration from the config file
    cfg = parse_args()
    os.makedirs(cfg.train_options['checkpoint_save_dir'], exist_ok=True)
    # ===> logging of training info
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options['checkpoint_save_dir'], 'train.log'))
    logger.info(cfg)
    # ===> device selection: fall back to CPU when CUDA is unavailable
    train_options = cfg.train_options
    to_use_device = torch.device(
        train_options['device'] if torch.cuda.is_available() and ('cuda' in train_options['device']) else 'cpu')
    set_random_seed(cfg['SEED'], 'cuda' in train_options['device'], deterministic=True)

    # ===> build network
    net = build_model(cfg['model'])
    # ===> deploy the model to the target device
    net = nn.DataParallel(net)
    net = net.to(to_use_device)

    # ===> build metric
    metric = build_metric(cfg['metric'])

    # ===> get fine tune layers
    # params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])

    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg['optimizer'])
    net.train()
    # keep the distillation Teacher in eval mode while the Student trains
    # (``net.module`` because the model was wrapped in DataParallel above)
    net.module.model_dict['Teacher'].eval()

    # ===> whether to resume from checkpoint
    resume_from = train_options['resume_from']
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(net, resume_from, to_use_device, optimizer)
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info(f'net resume from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg['loss'])
    loss_func = loss_func.to(to_use_device)

    # ===> data loaders
    train_loader = build_dataloader(cfg.dataset.train)
    eval_loader = build_dataloader(cfg.dataset.eval)

    # ===> post-processing (decoding of raw model output)
    post_process = build_post_process(cfg['post_process'])

    # ===> train
    train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, global_state, logger,
          post_process, metric)
def main():
    """Entry point: load config, build model/optimizer/data and start training.

    Resume semantics here track ``current_epoch`` (an int) rather than the
    ``global_state`` dict used by other variants of this script.
    """
    # ===> load configuration from the config file
    cfg = parse_args()
    os.makedirs(cfg.train_options['checkpoint_save_dir'], exist_ok=True)
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options['checkpoint_save_dir'], 'train.log'))
    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device selection: fall back to CPU when CUDA is unavailable
    to_use_device = torch.device(
        train_options['device'] if torch.cuda.is_available() and ('cuda' in train_options['device']) else 'cpu')
    set_random_seed(cfg['SEED'], 'cuda' in train_options['device'], deterministic=True)

    # ===> build network
    net = build_model(cfg['model'])
    # ===> weight init and deployment to the target device
    # net.apply(weight_init)  # comment this out when using pretrained weights
    # if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
    net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    # NOTE(review): params_to_train is never used below — the optimizer is
    # built from net.parameters(); confirm whether get_fine_tune_params()
    # freezes layers as a side effect before removing this call.
    params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])

    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg['optimizer'])

    # ===> whether to resume from checkpoint
    resume_from = train_options['resume_from']
    if resume_from:
        net, current_epoch, _resumed_optimizer = load_checkpoint(net, resume_from, to_use_device, optimizer,
                                                                 third_name=train_options['third_party_name'])
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        current_epoch = 0
        logger.info(f'net resume from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg['loss'])
    loss_func = loss_func.to(to_use_device)

    # ===> data loaders
    train_loader = build_dataloader(cfg.dataset.train)
    eval_loader = build_dataloader(cfg.dataset.eval)

    # post-processing (decoding of raw model output)
    post_process = build_post_process(cfg['post_process'])

    # ===> train
    train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, current_epoch, logger,
          post_process)
def main():
    """Evaluate a trained checkpoint on the test dataset and print the metrics."""
    cli_args = parse_args()
    cfg = Config.fromfile(cli_args.config)
    # shared/global options section of the config
    options = cfg.options

    # build the model, pick a device, and load the checkpoint weights onto it
    model = build_model(cfg.model)
    device, gpu_ids = select_device(options.gpu_ids)
    load_checkpoint(model, cli_args.model_path, map_location=device)
    model = model.to(device)
    model.device = device

    # evaluation data pipeline
    dataset = build_dataset(cfg.test_data.dataset)
    loader = build_dataloader(dataset, loader_cfg=cfg.test_data.loader)

    # decoding and scoring components
    postprocess = build_postprocess(cfg.postprocess)
    metric = build_metrics(cfg.metric)

    # `eval` here is the project's evaluation routine (it shadows the builtin)
    eval_result = eval(model, loader, postprocess, metric)
    print(eval_result)
def main():
    """Evaluate a checkpoint either natively (PyTorch) or as a TensorRT engine.

    Bug fix: an unrecognized ``--mode`` previously fell through both branches
    and crashed with ``NameError`` at the final print; it now raises
    ``ValueError`` with a clear message.
    """
    args = parse_args()
    cfg = Config.fromfile(args.config)
    # evaluation never needs the backbone's pretrained weights
    cfg.model.pretrained = None

    # build postprocess first: the recognition head sizes its output from it
    postprocess = build_postprocess(cfg.postprocess)
    if hasattr(postprocess, 'character'):
        cfg.model.head.n_class = len(getattr(postprocess, 'character'))

    eval_dataset = build_dataset(cfg.test_data.dataset)
    eval_loader = build_dataloader(eval_dataset, loader_cfg=cfg.test_data.loader)

    # build metric
    metric = build_metrics(cfg.metric)

    mode = args.mode
    if mode == 'torch':
        # native PyTorch evaluation
        model = build_model(cfg.model)
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        load_checkpoint(model, args.model_path, map_location=device)
        if args.simple:
            # save a stripped ("simple") copy of the checkpoint next to it
            filepath, filename = os.path.split(args.model_path)
            simple_model_path = os.path.join(filepath, 'sim_{}'.format(filename))
            save_checkpoint(model, simple_model_path)
        model = model.to(device)
        model.device = device
        result_metric = eval(model, eval_loader, postprocess, metric)
    elif mode == 'engine':
        # TensorRT engine evaluation
        model = TRTModel(args.engine_path)
        result_metric = engine_eval(model, eval_loader, postprocess, metric)
    else:
        raise ValueError(
            "unsupported mode: {!r}, expected 'torch' or 'engine'".format(mode))
    print(result_metric)
def main():
    """Full-featured training entry point.

    Supports AMP, EMA, DistributedDataParallel / DataParallel, optional
    evaluation, and resume / load-from checkpoints via ``TrainRunner``.
    """
    args = parse_args()
    cfg_path = args.config
    cfg = Config.fromfile(cfg_path)
    # general/global options section of the config
    global_config = cfg.options
    # local_rank == 0 is the logging process
    global_config['local_rank'] = args.local_rank
    # amp (automatic mixed precision) train
    if args.amp:
        global_config['is_amp'] = True
    else:
        global_config['is_amp'] = False
    # ema (exponential moving average) train
    if args.ema:
        global_config['is_ema'] = True
    else:
        global_config['is_ema'] = False
    # cudnn benchmark speeds up training when input sizes are constant
    if global_config.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    if global_config.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        global_config.work_dir = osp.join(
            './work_dirs',
            osp.splitext(osp.basename(args.config))[0])
    # create work_dir
    file_util.mkdir_or_exist(global_config.work_dir)
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(global_config.work_dir, '{}.log'.format(timestamp))
    logger = get_logger(name='ocr', log_file=log_file)
    # log env info (only on the logging rank)
    if args.local_rank == 0:
        env_info_dict = collect_env()
        env_info = '\n'.join([('{}: {}'.format(k, v)) for k, v in env_info_dict.items()])
        dash_line = '-' * 60 + '\n'
        logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line)
    # log some basic info
    logger.info('Config:\n{}'.format(cfg.text))
    # log the seed settings
    logger.info('Set random seed to {}, deterministic: {}'.format(
        global_config.seed, args.deterministic))
    # set random seed
    set_random_seed(global_config.seed, deterministic=args.deterministic)
    # select device: distributed init when multiple GPUs were requested
    if torch.cuda.device_count() > 1 and args.distributed:
        device = init_dist(launcher='pytorch', backend='nccl', rank=args.local_rank)
        global_config['distributed'] = True
    else:
        device, gpu_ids = select_device(global_config.gpu_ids)
        global_config.gpu_ids = gpu_ids
        global_config['distributed'] = False
    # build train dataset / loader
    train_dataset = build_dataset(cfg.train_data.dataset)
    train_loader = build_dataloader(train_dataset,
                                    loader_cfg=cfg.train_data.loader,
                                    distributed=global_config['distributed'])
    # if eval is enabled, build eval dataloader, postprocess and metric.
    # Built before the model because the rec head's output size is computed
    # from the postprocess character set below.
    if global_config.is_eval:
        eval_dataset = build_dataset(cfg.test_data.dataset)
        eval_loader = build_dataloader(
            eval_dataset,
            loader_cfg=cfg.test_data.loader,
            distributed=global_config['distributed'])
        # build postprocess
        postprocess = build_postprocess(cfg.postprocess)
        # build metric
        metric = build_metrics(cfg.metric)
    else:
        eval_loader = None
        postprocess = None
        metric = None
    # for rec: derive the head's class count from the postprocess alphabet
    # (hasattr(None, ...) is False, so this is skipped when eval is off)
    if hasattr(postprocess, 'character'):
        char_num = len(getattr(postprocess, 'character'))
        cfg.model.head.n_class = char_num
    # build model
    model = build_model(cfg.model)
    model = model.to(device)  # set model to device
    # wrap the model for multi-GPU execution (DDP or DataParallel)
    if device.type != 'cpu' and torch.cuda.device_count(
    ) > 1 and global_config['distributed'] == True:
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)
        device = torch.device('cuda', args.local_rank)
        is_cuda = True
    elif device.type != 'cpu' and global_config[
            'distributed'] == False and len(gpu_ids) >= 1:
        model = nn.DataParallel(model, device_ids=global_config.gpu_ids)
        model.gpu_ids = gpu_ids
        is_cuda = True
    else:
        is_cuda = False
    global_config['is_cuda'] = is_cuda
    model.device = device
    # build optimizer
    optimizer = build_optimizer(cfg.optimizer, model)
    # build lr_scheduler
    lr_scheduler = build_lr_scheduler(cfg.lr_scheduler, optimizer)
    # build loss
    criterion = build_loss(cfg.loss).to(device)
    runner = TrainRunner(global_config, model, optimizer, lr_scheduler, postprocess,
                         criterion, train_loader, eval_loader, metric, logger)
    # resume full training state, or just load model weights
    if global_config.resume_from is not None and args.resume:
        runner.resume(global_config.resume_from, map_location=device)
    if global_config.load_from is not None:
        runner.load_checkpoint(global_config.load_from, map_location=device)
    runner.run()
def main():
    """Entry point (JSON-config variant): parse the config path from argv,
    load it into a SimpleNamespace tree, then build everything and train.
    """
    # ===> load configuration from the config file
    parser = argparse.ArgumentParser(description='train')
    parser.add_argument('--config', type=str, default='config/det.json', help='train config file path')
    cfg = parser.parse_args()
    with open(cfg.config) as fin:
        # object_hook turns every JSON object into attribute-style access
        cfg = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    # cfg = parse_args()
    os.makedirs(cfg.train_options.checkpoint_save_dir, exist_ok=True)
    logger = get_logger('torchocr', log_file=os.path.join(
        cfg.train_options.checkpoint_save_dir, 'train.log'))
    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device selection: fall back to CPU when CUDA is unavailable
    to_use_device = torch.device(
        train_options.device if torch.cuda.is_available() and (
            'cuda' in train_options.device) else 'cpu')
    # set_random_seed(cfg.SEED, 'cuda' in train_options.device, deterministic=True)

    # ===> build network
    net = build_model(cfg.model)
    # ===> weight init and deployment to the target device
    # net.apply(weight_init)  # comment this out when using pretrained weights
    # if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
    net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    # NOTE(review): params_to_train is never used below — the optimizer is
    # built from net.parameters(); confirm whether get_fine_tune_params()
    # freezes layers as a side effect before removing this call.
    params_to_train = get_fine_tune_params(net, train_options.fine_tune_stage)

    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg.optimizer)

    # ===> whether to resume from checkpoint
    resume_from = train_options.resume_from
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(
            net, resume_from, to_use_device, optimizer,
            third_name=train_options.third_party_name)
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info(f'net resume from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg.loss)
    loss_func = loss_func.to(to_use_device)

    # ===> data loaders
    train_loader = build_dataloader(cfg.dataset.train)
    eval_loader = build_dataloader(cfg.dataset.eval)

    # post-processing (decoding of raw model output)
    post_process = build_post_process(cfg.post_process)

    # ===> train
    train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, global_state,
          logger, post_process)
# Fragment: resume weights from `resume_from` and launch training.
# NOTE(review): this looks like the interior of a training entry point —
# `net`, `cfg`, `optimizer`, `to_use_device`, `train_options` and `logger`
# must already be in scope; confirm against the enclosing function.
# NOTE(review): model_dict and pretrained_dict are loaded but never used below
# (leftover from a debugging dump of state-dict keys, since removed);
# load_checkpoint re-reads the checkpoint file itself.
model_dict = net.state_dict()
ckpt = torch.load(resume_from, map_location='cpu')
pretrained_dict = ckpt['state_dict']
# load weights only — no optimizer state is restored (_optimizers=None)
net, _, global_state = load_checkpoint(net, resume_from, to_use_device, _optimizers=None,
                                       third_name=train_options['third_party_name'])

# ===> loss function
loss_func = build_loss(cfg['loss'])
loss_func = loss_func.to(to_use_device)

# ===> data loaders
train_loader = build_dataloader(cfg.dataset.train)
eval_loader = build_dataloader(cfg.dataset.eval)

# post-processing (decoding of raw model output)
post_process = build_post_process(cfg['post_process'])

# ===> train
train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, global_state,
      logger, post_process)
def main():
    """Entry point (JSON-config variant) with device handling disabled.

    NOTE(review): all device-selection/DataParallel code is commented out, so
    the model stays wherever ``build_model`` left it, and the literal ``0`` is
    passed where other variants pass ``to_use_device`` — confirm that
    ``load_checkpoint`` and ``train`` accept that placeholder.
    """
    # ===> load configuration from the config file
    # cfg = parse_args()
    parser = argparse.ArgumentParser(description='train')
    parser.add_argument('-c', '--config', type=str, default='config/recrbt3.json', help='train config file path')
    args = parser.parse_args()
    with open(args.config) as fin:
        # object_hook turns every JSON object into attribute-style access
        cfg = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    os.makedirs(cfg.train_options.checkpoint_save_dir, exist_ok=True)
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options.checkpoint_save_dir, 'train.log'))
    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device selection (disabled)
    # to_use_device = torch.device(
    #     train_options.device if torch.cuda.is_available() and ('cuda' in train_options.device) else 'cpu')
    # set_random_seed(cfg.SEED, 'cuda' in train_options.device, deterministic=True)

    # ===> build network
    net = build_model(cfg.model)
    # ===> weight init and deployment to the target device (disabled)
    # net.apply(weight_init)
    # # if torch.cuda.device_count() > 1:
    # net = nn.DataParallel(net)
    # net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    # params_to_train = get_fine_tune_params(net, train_options.fine_tune_stage)

    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg.optimizer)
    scheduler = build_scheduler(optimizer, cfg.lr_scheduler)

    # ===> whether to resume from checkpoint
    resume_from = train_options.resume_from
    if resume_from:
        # 0 stands in for the device argument (device handling disabled above)
        net, _resumed_optimizer, global_state = load_checkpoint(net, resume_from, 0, optimizer,
                                                                third_name=train_options.third_party_name)
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info(f'net resume from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg.loss)
    # loss_func = loss_func.to(to_use_device)

    # ===> load the alphabet: the dog-class list is kept as a list of labels,
    # any other alphabet file is collapsed into a single string of characters
    if "dogclass.txt" in cfg.dataset.alphabet:
        with open(cfg.dataset.alphabet, 'r', encoding='utf-8') as file:
            cfg.dataset.alphabet = [s.strip('\n') for s in file.readlines()]
    else:
        with open(cfg.dataset.alphabet, 'r', encoding='utf-8') as file:
            cfg.dataset.alphabet = ''.join([s.strip('\n') for s in file.readlines()])

    # ===> data loaders (both share the alphabet loaded above)
    cfg.dataset.train.dataset.alphabet = cfg.dataset.alphabet
    train_loader = build_dataloader(cfg.dataset.train)
    cfg.dataset.eval.dataset.alphabet = cfg.dataset.alphabet
    eval_loader = build_dataloader(cfg.dataset.eval)

    # ===> train (0 again stands in for the device argument)
    train(net, optimizer, scheduler, loss_func, train_loader, eval_loader, 0, cfg, global_state, logger)