Example 1
def main():
    # ===> parse config-file arguments
    cfg = parse_args()
    os.makedirs(cfg.train_options['checkpoint_save_dir'], exist_ok=True)
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options['checkpoint_save_dir'], 'train.log'))

    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device and random seed
    to_use_device = torch.device(
        train_options['device'] if torch.cuda.is_available() and ('cuda' in train_options['device']) else 'cpu')
    set_random_seed(cfg['SEED'], 'cuda' in train_options['device'], deterministic=True)

    # ===> build network
    net = build_model(cfg['model'])

    # ===> initialize the model and move it to the target device
    net.apply(weight_init)
    # if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
    net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])
    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg['optimizer'])
    scheduler = build_scheduler(optimizer, cfg['lr_scheduler'])

    # ===> whether to resume from checkpoint
    resume_from = train_options['resume_from']
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(net, resume_from, to_use_device, optimizer,
                                                                 third_name=train_options['third_party_name'])
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info('training from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg['loss'])
    loss_func = loss_func.to(to_use_device)

    with open(cfg.dataset.alphabet, 'r', encoding='utf-8') as file:
        cfg.dataset.alphabet = ''.join([s.strip('\n') for s in file.readlines()])

    # ===> data loader
    cfg.dataset.train.dataset.alphabet = cfg.dataset.alphabet
    train_loader = build_dataloader(cfg.dataset.train)
    cfg.dataset.eval.dataset.alphabet = cfg.dataset.alphabet
    eval_loader = build_dataloader(cfg.dataset.eval)

    # ===> train
    train(net, optimizer, scheduler, loss_func, train_loader, eval_loader, to_use_device, cfg, global_state, logger)
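
A note on the seeding call above: set_random_seed(cfg['SEED'], ...) is a helper not shown here. As a rough, hypothetical sketch of what such a helper usually does (the actual implementation may differ), it seeds Python, NumPy, and PyTorch, and can optionally force deterministic cuDNN kernels:

import random
import numpy as np
import torch

def set_random_seed_sketch(seed, use_cuda, deterministic=False):
    # Hypothetical sketch of a set_random_seed helper like the one called above.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed_all(seed)
    if deterministic:
        # Deterministic cuDNN kernels trade speed for reproducibility.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False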
Example 2
def main():
    # ===> parse config-file arguments
    cfg = parse_args()
    os.makedirs(cfg.train_options['checkpoint_save_dir'], exist_ok=True)
    # ===> log training info
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options['checkpoint_save_dir'], 'train.log'))
    logger.info(cfg)

    # ===> device and random seed
    train_options = cfg.train_options
    to_use_device = torch.device(
        train_options['device'] if torch.cuda.is_available() and ('cuda' in train_options['device']) else 'cpu')
    set_random_seed(cfg['SEED'], 'cuda' in train_options['device'], deterministic=True)

    # ===> build network
    net = build_model(cfg['model'])

    # ===> move the model to the target device
    net = nn.DataParallel(net)
    net = net.to(to_use_device)


    # ===> build metric
    metric = build_metric(cfg['metric'])

    # ===> get fine tune layers
    # params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])
    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg['optimizer'])
    net.train()
    net.module.model_dict['Teacher'].eval()
    # ===> whether to resume from checkpoint
    resume_from = train_options['resume_from']
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(net, resume_from, to_use_device, optimizer)
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info('training from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg['loss'])
    loss_func = loss_func.to(to_use_device)

    # ===> data loader
    train_loader = build_dataloader(cfg.dataset.train)
    eval_loader = build_dataloader(cfg.dataset.eval)

    # ===> post_process
    post_process = build_post_process(cfg['post_process'])
    # ===> train
    train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, global_state, logger, post_process, metric)
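
Example 2 is a distillation setup: the 'Teacher' sub-model is switched to eval mode while the rest of the network trains. Keep in mind that .eval() only changes dropout and batch-norm behaviour; it does not stop gradients. If the teacher's weights are also meant to stay fixed (the repository may already handle this elsewhere), a minimal sketch of fully freezing that branch looks like this (attribute names taken from the example):

def freeze_teacher(net):
    # Hypothetical helper: fully freeze the teacher branch of a DataParallel-wrapped
    # distillation model, i.e. net.module.model_dict['Teacher'] as in Example 2.
    teacher = net.module.model_dict['Teacher']
    teacher.eval()                      # disable dropout / BN statistics updates
    for p in teacher.parameters():
        p.requires_grad = False         # keep teacher weights fixed during training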
Example 3
def main():
    # ===> parse config-file arguments
    cfg = parse_args()
    os.makedirs(cfg.train_options['checkpoint_save_dir'], exist_ok=True)
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options['checkpoint_save_dir'], 'train.log'))

    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device and random seed
    to_use_device = torch.device(
        train_options['device'] if torch.cuda.is_available() and ('cuda' in train_options['device']) else 'cpu')
    set_random_seed(cfg['SEED'], 'cuda' in train_options['device'], deterministic=True)

    # ===> build network
    net = build_model(cfg['model'])

    # ===> initialize the model and move it to the target device
    # net.apply(weight_init)  # keep this commented out when using pretrained weights
    # if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
    net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])
    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg['optimizer'])

    # ===> whether to resume from checkpoint
    resume_from = train_options['resume_from']
    if resume_from:
        net, current_epoch, _resumed_optimizer = load_checkpoint(net, resume_from, to_use_device, optimizer,
                                                                 third_name=train_options['third_party_name'])
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        current_epoch = 0
        logger.info('training from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg['loss'])
    loss_func = loss_func.to(to_use_device)

    # ===> data loader
    train_loader = build_dataloader(cfg.dataset.train)
    eval_loader = build_dataloader(cfg.dataset.eval)

    # post_process
    post_process = build_post_process(cfg['post_process'])
    # ===> train
    train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, current_epoch, logger, post_process)
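
In Examples 1, 3, and 7 the fine-tune selection params_to_train is computed but the optimizer is still built from net.parameters(), so every layer gets updated. If only the selected stage is supposed to train, the filtered parameters would normally be handed to the optimizer instead. A minimal sketch, assuming get_fine_tune_params returns an iterable of parameters (its exact return type is not shown here):

params_to_train = get_fine_tune_params(net, train_options['fine_tune_stage'])
# Hypothetical: pass only the selected parameters to the optimizer.
optimizer = build_optimizer(params_to_train, cfg['optimizer'])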
Example 4
def main():
    args = parse_args()
    cfg_path = args.config
    cfg = Config.fromfile(cfg_path)

    # global options
    global_config = cfg.options

    # build model
    model = build_model(cfg.model)
    device, gpu_ids = select_device(global_config.gpu_ids)
    load_checkpoint(model, args.model_path, map_location=device)
    model = model.to(device)

    model.device = device

    eval_dataset = build_dataset(cfg.test_data.dataset)
    eval_loader = build_dataloader(eval_dataset, loader_cfg=cfg.test_data.loader)
    # build postprocess
    postprocess = build_postprocess(cfg.postprocess)
    # build metric
    metric = build_metrics(cfg.metric)

    result_metric = eval(model, eval_loader, postprocess, metric)
    print(result_metric)
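
Examples 4 and 5 delegate the actual evaluation to an eval(model, eval_loader, postprocess, metric) function that is not shown here. As a rough, self-contained sketch of what such a loop typically does (the function name, the decode_fn argument, and the batch keys 'img'/'label' are illustrative assumptions, not the repository's API):

import torch

@torch.no_grad()
def eval_sketch(model, eval_loader, decode_fn):
    # Hypothetical evaluation loop: run the model over the loader, decode
    # predictions, and report exact-match accuracy.
    model.eval()
    correct, total = 0, 0
    for batch in eval_loader:
        images = batch['img'].to(next(model.parameters()).device)
        preds = decode_fn(model(images))
        labels = batch['label']
        correct += sum(p == l for p, l in zip(preds, labels))
        total += len(labels)
    model.train()
    return {'acc': correct / max(total, 1)}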
Example 5
def main():
    args = parse_args()
    cfg_path = args.config
    cfg = Config.fromfile(cfg_path)

    # set pretrained model to None
    cfg.model.pretrained = None

    # build postprocess
    postprocess = build_postprocess(cfg.postprocess)
    # for recognition: derive the head's class count from the character set
    if hasattr(postprocess, 'character'):
        char_num = len(getattr(postprocess, 'character'))
        cfg.model.head.n_class = char_num

    eval_dataset = build_dataset(cfg.test_data.dataset)
    eval_loader = build_dataloader(eval_dataset,
                                   loader_cfg=cfg.test_data.loader)
    # build metric
    metric = build_metrics(cfg.metric)

    mode = args.mode
    if mode == 'torch':
        # build model
        model = build_model(cfg.model)
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        load_checkpoint(model, args.model_path, map_location=device)
        if args.simple:
            (filepath, filename) = os.path.split(args.model_path)
            simple_model_path = os.path.join(filepath,
                                             'sim_{}'.format(filename))
            save_checkpoint(model, simple_model_path)

        model = model.to(device)
        model.device = device
        result_metric = eval(model, eval_loader, postprocess, metric)

    elif mode == 'engine':

        engine_path = args.engine_path
        model = TRTModel(engine_path)
        result_metric = engine_eval(model, eval_loader, postprocess, metric)

    print(result_metric)
Example 6
def main():
    args = parse_args()
    cfg_path = args.config
    cfg = Config.fromfile(cfg_path)

    global_config = cfg.options  # global options
    # only local_rank 0 writes logs
    global_config['local_rank'] = args.local_rank
    # amp train
    global_config['is_amp'] = bool(args.amp)

    # ema train
    global_config['is_ema'] = bool(args.ema)

    # set cudnn_benchmark; speeds up training when input sizes are consistent
    if global_config.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    if global_config.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        global_config.work_dir = osp.join(
            './work_dirs',
            osp.splitext(osp.basename(args.config))[0])

    # create work_dir
    file_util.mkdir_or_exist(global_config.work_dir)
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(global_config.work_dir, '{}.log'.format(timestamp))
    logger = get_logger(name='ocr', log_file=log_file)

    # log env info
    if args.local_rank == 0:
        env_info_dict = collect_env()
        env_info = '\n'.join([('{}: {}'.format(k, v))
                              for k, v in env_info_dict.items()])
        dash_line = '-' * 60 + '\n'
        logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                    dash_line)
        # log some basic info
        logger.info('Config:\n{}'.format(cfg.text))
        # set random seeds
        logger.info('Set random seed to {}, deterministic: {}'.format(
            global_config.seed, args.deterministic))

    # set random seed
    set_random_seed(global_config.seed, deterministic=args.deterministic)

    # select device
    # dist init
    if torch.cuda.device_count() > 1 and args.distributed:
        device = init_dist(launcher='pytorch',
                           backend='nccl',
                           rank=args.local_rank)
        global_config['distributed'] = True
    else:
        device, gpu_ids = select_device(global_config.gpu_ids)
        global_config.gpu_ids = gpu_ids
        global_config['distributed'] = False

    # build train dataset
    train_dataset = build_dataset(cfg.train_data.dataset)
    train_loader = build_dataloader(train_dataset,
                                    loader_cfg=cfg.train_data.loader,
                                    distributed=global_config['distributed'])

    # if evaluating, build the eval dataloader, postprocess, and metric
    # (built before the model because the rec head's output size is derived via the postprocess)
    if global_config.is_eval:
        eval_dataset = build_dataset(cfg.test_data.dataset)
        eval_loader = build_dataloader(
            eval_dataset,
            loader_cfg=cfg.test_data.loader,
            distributed=global_config['distributed'])
        # build postprocess
        postprocess = build_postprocess(cfg.postprocess)
        # build metric
        metric = build_metrics(cfg.metric)
    else:
        eval_loader = None
        postprocess = None
        metric = None

    # for recognition: derive the head's class count from the character set
    if hasattr(postprocess, 'character'):
        char_num = len(getattr(postprocess, 'character'))
        cfg.model.head.n_class = char_num

    # build model
    model = build_model(cfg.model)
    model = model.to(device)

    # wrap the model for multi-GPU / distributed training
    if device.type != 'cpu' and torch.cuda.device_count() > 1 and global_config['distributed']:
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank)
        device = torch.device('cuda', args.local_rank)
        is_cuda = True
    elif device.type != 'cpu' and not global_config['distributed'] and len(gpu_ids) >= 1:
        model = nn.DataParallel(model, device_ids=global_config.gpu_ids)
        model.gpu_ids = gpu_ids
        is_cuda = True
    else:
        is_cuda = False

    global_config['is_cuda'] = is_cuda

    model.device = device

    # build optimizer
    optimizer = build_optimizer(cfg.optimizer, model)
    # build lr_scheduler
    lr_scheduler = build_lr_scheduler(cfg.lr_scheduler, optimizer)
    # build loss
    criterion = build_loss(cfg.loss).to(device)

    runner = TrainRunner(global_config, model, optimizer, lr_scheduler,
                         postprocess, criterion, train_loader, eval_loader,
                         metric, logger)

    # Resume
    if global_config.resume_from is not None and args.resume:
        runner.resume(global_config.resume_from, map_location=device)

    if global_config.load_from is not None:
        runner.load_checkpoint(global_config.load_from, map_location=device)

    runner.run()
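
Example 6 chooses between DistributedDataParallel and DataParallel based on select_device(global_config.gpu_ids), which returns a (device, gpu_ids) pair. A minimal sketch of what a helper with that signature typically does, assuming gpu_ids is a comma-separated string such as '0,1' (this is an illustration, not the repository's implementation):

import torch

def select_device_sketch(gpu_ids):
    # Hypothetical: parse a comma-separated GPU id string; fall back to CPU
    # when CUDA is unavailable or no ids were given.
    ids = [int(i) for i in gpu_ids.split(',') if i.strip() != '']
    if torch.cuda.is_available() and ids:
        return torch.device('cuda', ids[0]), ids
    return torch.device('cpu'), []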
Example 7
def main():
    # ===> parse config-file arguments
    parser = argparse.ArgumentParser(description='train')
    parser.add_argument('--config',
                        type=str,
                        default='config/det.json',
                        help='train config file path')
    cfg = parser.parse_args()
    with open(cfg.config) as fin:
        cfg = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))

    # cfg = parse_args()
    os.makedirs(cfg.train_options.checkpoint_save_dir, exist_ok=True)
    logger = get_logger('torchocr',
                        log_file=os.path.join(
                            cfg.train_options.checkpoint_save_dir,
                            'train.log'))

    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device and random seed
    to_use_device = torch.device(
        train_options.device if torch.cuda.is_available() and (
            'cuda' in train_options.device) else 'cpu')
    # set_random_seed(cfg.SEED, 'cuda' in train_options.device, deterministic=True)

    # ===> build network
    net = build_model(cfg.model)

    # ===> initialize the model and move it to the target device
    # net.apply(weight_init)  # keep this commented out when using pretrained weights
    # if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
    net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    params_to_train = get_fine_tune_params(net, train_options.fine_tune_stage)
    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg.optimizer)

    # ===> whether to resume from checkpoint
    resume_from = train_options.resume_from
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(
            net,
            resume_from,
            to_use_device,
            optimizer,
            third_name=train_options.third_party_name)
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info('training from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg.loss)
    loss_func = loss_func.to(to_use_device)

    # ===> data loader
    train_loader = build_dataloader(cfg.dataset.train)
    eval_loader = build_dataloader(cfg.dataset.eval)

    # post_process
    post_process = build_post_process(cfg.post_process)
    # ===> train
    train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device,
          cfg, global_state, logger, post_process)
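
Examples 7 and 9 read the JSON config through json.load(fin, object_hook=lambda d: SimpleNamespace(**d)), which wraps every nested dict in a SimpleNamespace so the rest of the script can use attribute access (cfg.train_options.checkpoint_save_dir) instead of key lookups. A tiny self-contained illustration of the pattern, with made-up config content:

import json
from types import SimpleNamespace

raw = '{"train_options": {"checkpoint_save_dir": "./output", "resume_from": ""}}'
cfg = json.loads(raw, object_hook=lambda d: SimpleNamespace(**d))
print(cfg.train_options.checkpoint_save_dir)  # -> ./output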
Example 8
model_dict = net.state_dict()
ckpt = torch.load(resume_from, map_location='cpu')
pretrained_dict = ckpt['state_dict']

# txt_file = os.path.join('test_results/', 'pretrainedmodel_state.txt')
# txt_f = open(txt_file, 'w')

# txt_f.write('############## Model Dict ################' + '\n')
# for j in model_dict:
#     txt_f.write(str(j) +'\n' )
# # txt_f.write('############## Pretrained Dict ################' + '\n')
# # for k in pretrained_dict:
# #     txt_f.write(str(k) +'\n')

# txt_f.close()

net, _, global_state = load_checkpoint(net, resume_from, to_use_device, _optimizers=None, third_name=train_options['third_party_name'])

# ===> loss function
loss_func = build_loss(cfg['loss'])
loss_func = loss_func.to(to_use_device)

# ===> data loader
train_loader = build_dataloader(cfg.dataset.train)
eval_loader = build_dataloader(cfg.dataset.eval)

# post_process
post_process = build_post_process(cfg['post_process'])
# ===> train
train(net, optimizer, loss_func, train_loader, eval_loader, to_use_device, cfg, global_state, logger, post_process)
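
Example 8 inspects the model's and checkpoint's state_dict keys before calling load_checkpoint with a third_party_name, which is the usual symptom of a checkpoint whose keys only partially match the model. A common manual fallback in that situation is to copy over just the intersecting, shape-compatible entries; a sketch of that approach, assuming the checkpoint stores its weights under 'state_dict' as above:

import torch

model_dict = net.state_dict()
ckpt = torch.load(resume_from, map_location='cpu')
# Hypothetical partial load: keep only keys present in the model with matching shapes.
compatible = {k: v for k, v in ckpt['state_dict'].items()
              if k in model_dict and v.shape == model_dict[k].shape}
model_dict.update(compatible)
net.load_state_dict(model_dict)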
Example 9
def main():
    # ===> parse config-file arguments
    # cfg = parse_args()
    parser = argparse.ArgumentParser(description='train')
    parser.add_argument('-c', '--config', type=str, default='config/recrbt3.json', help='train config file path')
    args = parser.parse_args()
    with open(args.config) as fin:
        cfg = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))

    os.makedirs(cfg.train_options.checkpoint_save_dir, exist_ok=True)
    logger = get_logger('torchocr', log_file=os.path.join(cfg.train_options.checkpoint_save_dir, 'train.log'))

    # ===> log training info
    train_options = cfg.train_options
    logger.info(cfg)
    # ===> device and random seed
    # to_use_device = torch.device(
    #     train_options.device if torch.cuda.is_available() and ('cuda' in train_options.device) else 'cpu')
    # set_random_seed(cfg.SEED, 'cuda' in train_options.device, deterministic=True)

    # ===> build network
    net = build_model(cfg.model)

    # ===> initialize the model and move it to the target device
    # net.apply(weight_init)
    # # if torch.cuda.device_count() > 1:
    # net = nn.DataParallel(net)
    # net = net.to(to_use_device)
    net.train()

    # ===> get fine tune layers
    # params_to_train = get_fine_tune_params(net, train_options.fine_tune_stage)
    # ===> solver and lr scheduler
    optimizer = build_optimizer(net.parameters(), cfg.optimizer)
    scheduler = build_scheduler(optimizer, cfg.lr_scheduler)

    # ===> whether to resume from checkpoint
    resume_from = train_options.resume_from
    if resume_from:
        net, _resumed_optimizer, global_state = load_checkpoint(net, resume_from, 0, optimizer,
                                                                 third_name=train_options.third_party_name)
        if _resumed_optimizer:
            optimizer = _resumed_optimizer
        logger.info(f'net resume from {resume_from}')
    else:
        global_state = {}
        logger.info('training from scratch.')

    # ===> loss function
    loss_func = build_loss(cfg.loss)
    #loss_func = loss_func.to(to_use_device)

    if "dogclass.txt" in cfg.dataset.alphabet:
        with open(cfg.dataset.alphabet, 'r', encoding='utf-8') as file:
            cfg.dataset.alphabet = [s.strip('\n') for s in file.readlines()]
    else:
        with open(cfg.dataset.alphabet, 'r', encoding='utf-8') as file:
            cfg.dataset.alphabet = ''.join([s.strip('\n') for s in file.readlines()])

    # ===> data loader
    cfg.dataset.train.dataset.alphabet = cfg.dataset.alphabet
    train_loader = build_dataloader(cfg.dataset.train)
    cfg.dataset.eval.dataset.alphabet = cfg.dataset.alphabet
    eval_loader = build_dataloader(cfg.dataset.eval)

    # ===> train
    train(net, optimizer, scheduler, loss_func, train_loader, eval_loader, 0, cfg, global_state, logger)