# Six variants of the train() entry point, from clean-pvnet-style repositories.
# The imports below are assumed from that project layout; the lib.* module
# paths may differ in your fork:
import torch.multiprocessing
import neptune  # legacy neptune-client (<1.0) API, used by variant 3
from lib.train import make_trainer, make_optimizer, make_lr_scheduler, \
    make_recorder, set_lr_scheduler
from lib.datasets import make_data_loader
from lib.evaluators import make_evaluator
from lib.utils.net_utils import load_model, save_model, load_network


# Variant 1: caps each epoch at cfg.ep_iter iterations.
def train(cfg, network):
    # cfg.train.num_workers = 0
    if cfg.train.dataset[:4] != 'City':
        # 'file_system' sharing avoids "too many open files" errors from
        # dataloader worker processes
        torch.multiprocessing.set_sharing_strategy('file_system')

    train_loader = make_data_loader(cfg, is_train=True, max_iter=cfg.ep_iter)
    val_loader = make_data_loader(cfg, is_train=False)
    # train_loader = make_data_loader(cfg, is_train=True, max_iter=100)

    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network, optimizer, scheduler, recorder,
                             cfg.model_dir, resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
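
# A minimal sketch of the save_model/load_model pair the loop above calls.
# clean-pvnet keeps the real helpers in lib.utils.net_utils; the
# '{epoch}.pth'-under-model_dir layout, the checkpoint keys, and the recorder
# having state_dict()/load_state_dict() are assumptions about that convention,
# not the verified implementation.
import os
import torch

def save_model_sketch(net, optim, scheduler, recorder, epoch, model_dir):
    os.makedirs(model_dir, exist_ok=True)
    torch.save({
        'net': net.state_dict(),
        'optim': optim.state_dict(),
        'scheduler': scheduler.state_dict(),
        'recorder': recorder.state_dict(),
        'epoch': epoch,
    }, os.path.join(model_dir, '{}.pth'.format(epoch)))

def load_model_sketch(net, optim, scheduler, recorder, model_dir, resume=True):
    if not resume or not os.path.isdir(model_dir):
        return 0
    pths = [int(p.split('.')[0]) for p in os.listdir(model_dir) if p.endswith('.pth')]
    if not pths:
        return 0
    ckpt = torch.load(os.path.join(model_dir, '{}.pth'.format(max(pths))),
                      map_location='cpu')
    net.load_state_dict(ckpt['net'])
    optim.load_state_dict(ckpt['optim'])
    scheduler.load_state_dict(ckpt['scheduler'])
    recorder.load_state_dict(ckpt['recorder'])
    return ckpt['epoch'] + 1  # resume from the epoch after the checkpoint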

# Variant 2: minimal baseline; loaders are built after the training components.
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network, optimizer, scheduler, recorder,
                             cfg.model_dir, resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    train_loader = make_data_loader(cfg, is_train=True)
    val_loader = make_data_loader(cfg, is_train=False)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
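
# A sketch of what make_optimizer/make_lr_scheduler presumably build, to give
# the per-epoch scheduler.step() call above a concrete reading. Adam plus
# MultiStepLR matches clean-pvnet's defaults, but the cfg field names
# (cfg.train.lr, cfg.train.milestones, cfg.train.gamma) are assumptions here.
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR

def make_optimizer_sketch(cfg, network):
    return Adam(network.parameters(), lr=cfg.train.lr)

def make_lr_scheduler_sketch(cfg, optimizer):
    # decays lr by gamma each time the epoch count passes a milestone
    return MultiStepLR(optimizer, milestones=cfg.train.milestones,
                       gamma=cfg.train.gamma)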

# Variant 3: adds optional Neptune experiment tracking (legacy neptune-client
# API) and a separate COCO evaluation path.
def train(cfg, network):
    if cfg.train.dataset[:4] != 'City':
        torch.multiprocessing.set_sharing_strategy('file_system')

    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    if 'Coco' not in cfg.train.dataset:
        evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network, optimizer, scheduler, recorder,
                             cfg.model_dir, resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    train_loader = make_data_loader(cfg, is_train=True)
    val_loader = make_data_loader(cfg, is_train=False)
    # train_loader = make_data_loader(cfg, is_train=True, max_iter=100)

    global_steps = None
    if cfg.neptune:
        global_steps = {
            'train_global_steps': 0,
            'valid_global_steps': 0,
        }
        neptune.init('hccccccccc/clean-pvnet')
        neptune.create_experiment(cfg.model_dir.split('/')[-1])
        neptune.append_tag('pose')

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder, global_steps)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            if 'Coco' in cfg.train.dataset:
                trainer.val_coco(val_loader, global_steps)
            else:
                trainer.val(epoch, val_loader, evaluator, recorder)

    if cfg.neptune:
        neptune.stop()
    return network
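
# A sketch of how trainer.train()/val_coco() presumably consume the
# global_steps dict with the legacy (pre-1.0) neptune-client API used above.
# log_train_step and the 'train/loss' metric name are illustrative additions,
# not from the original code.
def log_train_step(loss_value, global_steps):
    if global_steps is None:
        return
    neptune.log_metric('train/loss',
                       x=global_steps['train_global_steps'], y=loss_value)
    global_steps['train_global_steps'] += 1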

# Variant 4: annotated fork (comments translated from Chinese).
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network, optimizer, scheduler, recorder,
                             cfg.model_dir, resume=cfg.resume)
    # begin_epoch = 0  # comment this line out if you want to resume training
    # set_lr_scheduler(cfg, scheduler)

    # print("before train loader")
    train_loader = make_data_loader(cfg, is_train=True)  # the data is only read at this point
    # print("under train loader")
    val_loader = make_data_loader(cfg, is_train=False)

    # # Inspect the parameters and structure of train_loader:
    # tmp_file = open('/home/tianhao.lu/code/Deep_snake/snake/Result/Contour/contour.log', 'w')
    # tmp_file.writelines("train_loader type:" + str(type(train_loader)) + "\n")
    # tmp_file.writelines("train_loader len:" + str(len(train_loader)) + "\n")
    # tmp_file.writelines("train_loader data:" + str(train_loader) + "\n")
    # for tmp_data in train_loader:
    #     tmp_file.writelines("one train_loader data type:" + str(type(tmp_data)) + "\n")
    #     for key in tmp_data:
    #         tmp_file.writelines("one train_loader data key:" + str(key) + "\n")
    #         tmp_file.writelines("one train_loader data len:" + str(len(tmp_data[key])) + "\n")
    #     # tmp_file.writelines("one train_loader data:" + str(tmp_data) + "\n")
    #     break
    # tmp_file.writelines(str("*************************************************************** \n"))
    # tmp_file.close()

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()  # optimizer.step() is what updates the model; scheduler.step() only adjusts the lr

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
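
# A runnable distillation of the commented-out loader inspection above: look
# at one batch, report its keys and per-key lengths, and stop. Assumes each
# batch is a dict of tensors/sequences, as the commented code implies.
def peek_first_batch(loader):
    batch = next(iter(loader))
    print('batch type:', type(batch))
    for key in batch:
        print(key, '->', len(batch[key]))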

# Variant 5: multi-GPU fork; distributed-aware loader, rank-guarded
# checkpointing, and a separate "latest" checkpoint cadence.
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    evaluator = make_evaluator(cfg)

    begin_epoch = load_model(network, optimizer, scheduler, recorder,
                             cfg.trained_model_dir, resume=cfg.resume)
    set_lr_scheduler(cfg, scheduler)

    train_loader = make_data_loader(cfg, is_train=True,
                                    is_distributed=cfg.distributed,
                                    max_iter=cfg.ep_iter)
    val_loader = make_data_loader(cfg, is_train=False)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        if cfg.distributed:
            # reseed the sampler's shuffle so each epoch sees a new permutation
            train_loader.batch_sampler.sampler.set_epoch(epoch)

        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0 and cfg.local_rank == 0:
            save_model(network, optimizer, scheduler, recorder,
                       cfg.trained_model_dir, epoch)

        if (epoch + 1) % cfg.save_latest_ep == 0 and cfg.local_rank == 0:
            save_model(network, optimizer, scheduler, recorder,
                       cfg.trained_model_dir, epoch, last=True)

        if (epoch + 1) % cfg.eval_ep == 0:
            trainer.val(epoch, val_loader, evaluator, recorder)

    return network
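
# Why set_epoch matters in the distributed variant above: DistributedSampler
# seeds its shuffle with the epoch number, so without set_epoch every epoch
# replays the same permutation on every rank. A minimal self-contained
# illustration; the loader construction is an assumption about what
# make_data_loader does when is_distributed=True.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_distributed_loader(dataset, batch_size, rank, world_size):
    sampler = DistributedSampler(dataset, num_replicas=world_size,
                                 rank=rank, shuffle=True)
    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    return loader  # call loader.sampler.set_epoch(epoch) once per epoch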

DEBUG = False  # module-level verbosity flag (assumed; undefined in the original snippet)

# Variant 6: training-only fork; can warm-start from weights alone via
# load_network, after which begin_epoch is reset so the schedule restarts.
def train(cfg, network):
    trainer = make_trainer(cfg, network)
    optimizer = make_optimizer(cfg, network)
    scheduler = make_lr_scheduler(cfg, optimizer)
    recorder = make_recorder(cfg)
    # evaluator = make_evaluator(cfg)

    if cfg.network_full_init:
        # load weights only, then train from epoch 0
        begin_epoch = load_network(network, cfg.model_dir,
                                   resume=cfg.resume, epoch=cfg.test.epoch)
        begin_epoch = 0
    else:
        begin_epoch = load_model(network, optimizer, scheduler, recorder,
                                 cfg.model_dir, resume=cfg.resume)
    # set_lr_scheduler(cfg, scheduler)

    if DEBUG:
        print('------------------Loading training set-------------------')
    train_loader = make_data_loader(cfg, is_train=True)
    if DEBUG:
        print('Loading training set done...')
        print('---------------------------------------------------------')
    # val_loader = make_data_loader(cfg, is_train=False)

    for epoch in range(begin_epoch, cfg.train.epoch):
        recorder.epoch = epoch
        trainer.train(epoch, train_loader, optimizer, recorder)
        scheduler.step()

        if (epoch + 1) % cfg.save_ep == 0:
            save_model(network, optimizer, scheduler, recorder, epoch, cfg.model_dir)

        # if (epoch + 1) % cfg.eval_ep == 0:
        #     trainer.val(epoch, val_loader, evaluator, recorder)

    return network
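
# A sketch of the driver that would invoke any of the variants above,
# following the clean-pvnet-style layout assumed throughout; lib.config and
# lib.networks are assumptions about the surrounding repository.
def main():
    from lib.config import cfg
    from lib.networks import make_network
    network = make_network(cfg)
    train(cfg, network)

if __name__ == '__main__':
    main()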