Example #1
def train(cfg, network):
    # cfg.train.num_workers = 0
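    # the 'file_system' strategy avoids "too many open files" errors from DataLoader worker processes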
    if cfg.train.dataset[:4] != 'City':
        torch.multiprocessing.set_sharing_strategy('file_system')
        
    train_loader = make_data_loader(cfg, is_train=True, max_iter=cfg.ep_iter)
    val_loader = make_data_loader(cfg, is_train=False)
    # train_loader = make_data_loader(cfg, is_train=True, max_iter=100)
    
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

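    # restore network/optimizer/scheduler/recorder state from the checkpoint dir; returns the epoch to resume from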
    begin_epoch = load_model(network, optimizer, scheduler, recorder, cfg.model_dir, resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

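        # checkpoint every cfg.save_ep epochs and validate every cfg.eval_ep epochs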
        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
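A minimal driver sketch for how train() is typically invoked; make_network and the cfg object are assumed to come from the project's lib.config and lib.networks modules and are not shown in the snippets above:

from lib.config import cfg
from lib.networks import make_network

def main():
    # build the model from the config, then hand both to train()
    network = make_network(cfg)
    train(cfg, network)

if __name__ == '__main__':
    main()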
Example #2
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network, optimizer, scheduler, recorder, cfg.model_dir, resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    train_loader = make_data_loader(cfg, is_train=True)
    val_loader = make_data_loader(cfg, is_train=False)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
Example #3
def train(cfg, network):
    if cfg.train.dataset[:4] != 'City':
        torch.multiprocessing.set_sharing_strategy('file_system')
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    if 'Coco' not in cfg.train.dataset:
        evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network,
                             optimizer,
                             scheduler,
                             recorder,
                             cfg.model_dir,
                             resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    train_loader = make_data_loader(cfg, is_train=True)
    val_loader = make_data_loader(cfg, is_train=False)
    # train_loader = make_data_loader(cfg, is_train=True, max_iter=100)

    global_steps = None
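    # step counters shared with the trainer for Neptune logging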
    if cfg.neptune:
        global_steps = {
            'train_global_steps': 0,
            'valid_global_steps': 0,
        }

        neptune.init('hccccccccc/clean-pvnet')
        neptune.create_experiment(cfg.model_dir.split('/')[-1])
        neptune.append_tag('pose')

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder, global_steps)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch,
                       cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            if 'Coco' in cfg.train.dataset:
                trainer.val_coco(val_loader, global_steps)
            else:
                trainer.val(epoch, val_loader, evaluator, recorder)

    if cfg.neptune:
        neptune.stop()

    return network
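Example #3 is the only variant that logs to Neptune, using the legacy neptune-client module-level API (neptune.init, neptune.create_experiment, neptune.append_tag, neptune.stop). A sketch of how the trainer might consume the shared global_steps dict; log_train_step is a hypothetical helper and 'train/loss' an illustrative metric name:

import neptune  # legacy neptune-client, matching the calls above

def log_train_step(loss_value, global_steps):
    # Log one training-loss point, then advance the shared counter
    # that train() creates when cfg.neptune is set.
    step = global_steps['train_global_steps']
    neptune.log_metric('train/loss', x=step, y=loss_value)
    global_steps['train_global_steps'] = step + 1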
Example #4
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network,
                             optimizer,
                             scheduler,
                             recorder,
                             cfg.model_dir,
                             resume=cfg.resume)
    # begin_epoch = 0  # leave this line commented out if you want to resume training
    # set_lr_scheduler(cfg, scheduler)
    # print("before train loader")
    train_loader = make_data_loader(cfg, is_train=True)  # the data is actually read only at this point
    # print("under train loader")
    val_loader = make_data_loader(cfg, is_train=False)

    # # This block dumps train_loader's parameters and structure for inspection:
    # tmp_file = open('/home/tianhao.lu/code/Deep_snake/snake/Result/Contour/contour.log', 'w')
    # tmp_file.writelines("train_loader type:" + str(type(train_loader)) + "\n")
    # tmp_file.writelines("train_loader len:" + str(len(train_loader)) + "\n")
    # tmp_file.writelines("train_loader data:" + str(train_loader) + "\n")
    # for tmp_data in train_loader:
    #     tmp_file.writelines("one train_loader data type:" + str(type(tmp_data)) + "\n")
    #     for key in tmp_data:
    #         tmp_file.writelines("one train_loader data key:" + str(key) + "\n")
    #         tmp_file.writelines("one train_loader data len:" + str(len(tmp_data[key])) + "\n")
    #     # tmp_file.writelines("one train_loader data:" + str(tmp_data) + "\n")
    #     break
    # tmp_file.writelines(str("*************************************************************** \n"))
    # tmp_file.close()

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()  # optimizer.step() is what actually updates the model; scheduler.step() only adjusts the lr

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch,
                       cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
Example #5
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network,
                             optimizer,
                             scheduler,
                             recorder,
                             cfg.trained_model_dir,
                             resume=cfg.resume)
    set_lr_scheduler(cfg, scheduler)

    train_loader = make_data_loader(cfg,
                                    is_train=True,
                                    is_distributed=cfg.distributed,
                                    max_iter=cfg.ep_iter)
    val_loader = make_data_loader(cfg, is_train=False)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        if cfg.distributed:
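            # reseed the DistributedSampler so each epoch shuffles differently across ranks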
            train_loader.batch_sampler.sampler.set_epoch(epoch)

        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0 and cfg.local_rank == 0:
            save_model(network, optimizer, scheduler, recorder,
                       cfg.trained_model_dir, epoch)

        if (epoch + 1) % cfg.save_latest_ep == 0 and cfg.local_rank == 0:
            save_model(network,
                       optimizer,
                       scheduler,
                       recorder,
                       cfg.trained_model_dir,
                       epoch,
                       last=True)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
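Example #5 is the only multi-GPU variant: it passes cfg.distributed and cfg.ep_iter to the train loader, reseeds the sampler every epoch, and saves checkpoints only on rank 0. A minimal sketch, assuming the standard torch.distributed environment-variable setup, of the process-group initialization such a loop relies on (setup_distributed is a hypothetical helper, not shown above):

import torch

def setup_distributed(cfg):
    # Pin this process to its GPU and join the default process group
    # before building the network and calling train().
    if cfg.distributed:
        torch.cuda.set_device(cfg.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')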
Example #6
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    #evaluator = make_evaluator(cfg)

    if cfg.network_full_init:
        begin_epoch = load_network(network,
                                   cfg.model_dir,
                                   resume=cfg.resume,
                                   epoch=cfg.test.epoch)
        begin_epoch = 0  # keep the loaded weights for initialization only; restart the epoch count from zero
    else:
        begin_epoch = load_model(network,
                                 optimizer,
                                 scheduler,
                                 recorder,
                                 cfg.model_dir,
                                 resume=cfg.resume)

    # set_lr_scheduler(cfg, scheduler)
    if DEBUG:
        print('------------------Loading training set-------------------')
    train_loader = make_data_loader(cfg, is_train=True)
    if DEBUG:
        print('Loading training set done...')
        print('---------------------------------------------------------')
    #val_loader = make_data_loader(cfg, is_train=False)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch,
                       cfg.model_dir)

        #if (epoch + 1) % cfg.eval_ep == 0:
        #    trainer.val(epoch, val_loader, evaluator, recorder)

    return network