def train(cfg):
    logger = setup_logger(name='Train', level=cfg.LOGGER.LEVEL)
    logger.info(cfg)

    model = build_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    criterion = build_loss(cfg)
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    train_loader = build_data(cfg, is_train=True)
    val_loader = build_data(cfg, is_train=False)
    logger.info(train_loader.dataset)
    logger.info(val_loader.dataset)

    arguments = dict()
    arguments["iteration"] = 0

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    checkpointer = Checkpointer(model, optimizer, scheduler, cfg.SAVE_DIR)

    do_train(cfg, model, train_loader, val_loader, optimizer, scheduler,
             criterion, checkpointer, device, checkpoint_period, arguments,
             logger)
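# The snippets in this file all rely on project-specific helpers such as
# build_optimizer / build_lr_scheduler. The sketch below shows one plausible
# shape for such a pair; the config keys (SOLVER.OPTIMIZER_NAME, BASE_LR,
# MOMENTUM, WEIGHT_DECAY, STEPS, GAMMA) are hypothetical and not taken from
# any of these repos.
import torch


def build_optimizer_sketch(cfg, model):
    # Only optimize parameters that still require gradients.
    params = [p for p in model.parameters() if p.requires_grad]
    name = cfg.SOLVER.OPTIMIZER_NAME.lower()
    if name == "sgd":
        return torch.optim.SGD(params, lr=cfg.SOLVER.BASE_LR,
                               momentum=cfg.SOLVER.MOMENTUM,
                               weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    if name == "adam":
        return torch.optim.Adam(params, lr=cfg.SOLVER.BASE_LR,
                                weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    raise ValueError(f"Unknown optimizer: {cfg.SOLVER.OPTIMIZER_NAME}")


def build_lr_scheduler_sketch(cfg, optimizer):
    # Step decay at the configured milestones.
    return torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                milestones=cfg.SOLVER.STEPS,
                                                gamma=cfg.SOLVER.GAMMA)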
def main():
    # Read the config file
    with open('config/default.yml') as fin:
        config = yaml.load(fin, Loader=yaml.SafeLoader)

    # Build the train and valid datasets
    train_config = config['dataset']['train']
    train_df = pd.read_csv(train_config['data_path'], sep='\t')
    train_df = train_df.sample(frac=1)  # shuffle; sample() returns a new frame
    train, valid = train_test_split(train_df, test_size=config['train_valid_split'])
    train_dataset = build_dataloader(train, train_config, device=device)
    valid_dataset = build_dataloader(valid, train_config, device=device)

    # Build the model
    model_config = config['model']
    model = BertClassifier(model_config)
    model.to(device)
    optimizer = build_optimizer(model, config['optimizer'])

    # Compute the number of training and warmup steps
    num_train_steps = int(
        len(train_dataset) / train_dataset.batch_size * config['num_epochs'])
    num_warmup_steps = int(num_train_steps * config['optimizer']['warmup_proportion'])
    scheduler = build_scheduler(optimizer, num_train_steps, num_warmup_steps)

    # Train
    trainer.do_train(model,
                     train_loader=train_dataset,
                     valid_loader=valid_dataset,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     cfg=config)
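# build_scheduler above is repo-specific; for a BERT classifier it is commonly
# a linear-warmup / linear-decay schedule. A minimal sketch using the
# transformers helper (an assumption about the intent, not this repo's code):
from transformers import get_linear_schedule_with_warmup


def build_scheduler_sketch(optimizer, num_train_steps, num_warmup_steps):
    # Linearly warm up for num_warmup_steps, then decay linearly to zero
    # over the remaining training steps.
    return get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=num_warmup_steps,
                                           num_training_steps=num_train_steps)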
def train(cfg, local_rank, distributed):
    num_classes = COCODataset(cfg.data.train[0], cfg.data.train[1]).num_classes
    model = EfficientDet(num_classes=num_classes, model_name=cfg.model.name)
    inp_size = model.config['inp_size']
    device = torch.device(cfg.device)
    model.to(device)

    optimizer = build_optimizer(model, **optimizer_kwargs(cfg))
    lr_scheduler = build_lr_scheduler(optimizer, **lr_scheduler_kwargs(cfg))

    # Mixed-precision training via NVIDIA apex
    use_mixed_precision = cfg.dtype == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
            find_unused_parameters=True)

    arguments = {}
    arguments["iteration"] = 0

    output_dir = cfg.output_dir
    save_to_disk = comm.get_rank() == 0
    checkpointer = Checkpointer(model, optimizer, lr_scheduler, output_dir, save_to_disk)
    extra_checkpoint_data = checkpointer.load(cfg.model.resume)
    arguments.update(extra_checkpoint_data)

    train_dataloader = build_dataloader(cfg,
                                        inp_size,
                                        is_train=True,
                                        distributed=distributed,
                                        start_iter=arguments["iteration"])

    test_period = cfg.test.test_period
    if test_period > 0:
        val_dataloader = build_dataloader(cfg, inp_size, is_train=False, distributed=distributed)
    else:
        val_dataloader = None

    checkpoint_period = cfg.solver.checkpoint_period
    log_period = cfg.solver.log_period

    do_train(cfg, model, train_dataloader, val_dataloader, optimizer,
             lr_scheduler, checkpointer, device, checkpoint_period,
             test_period, log_period, arguments)

    return model
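# The snippet above uses NVIDIA apex for mixed precision. A roughly equivalent
# step with native torch.cuda.amp is sketched below as an alternative, not as
# this repo's code; train_step_amp and its arguments are hypothetical names.
import torch


def train_step_amp(model, images, targets, optimizer, criterion, scaler, use_amp=True):
    """One optimizer step with native automatic mixed precision."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=use_amp):
        outputs = model(images)
        loss = criterion(outputs, targets)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)         # unscales gradients, then steps
    scaler.update()                # adjust the scale for the next iteration
    return loss.detach()


# usage:
# scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
# loss = train_step_amp(model, images, targets, optimizer, criterion, scaler)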
def train_net(cfg, logger, is_distributed=False, local_rank=0):
    model = build_model(cfg)

    # Log every parameter and count the learnable ones
    p_sum = 0
    for name, param in model.named_parameters():
        logger.info('%s, %s, %s' % (name, param.shape, param.requires_grad))
        if param.requires_grad:
            p_sum += param.numel()
    logger.info('model learnable parameters: %d\n' % p_sum)

    device = torch.device(cfg.MODEL.DEVICE)
    model = model.to(device)
    if is_distributed:
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank,
                                                    broadcast_buffers=False)

    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_scheduler(cfg, optimizer)

    arguments = {}
    arguments["epoch"] = 0

    checkpointer = CheckPointer(cfg=cfg,
                                logger=logger,
                                model=model,
                                optimizer=optimizer,
                                scheduler=lr_scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=get_rank() == 0)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    criterion = build_loss(cfg)
    train_loader = build_data_loader(cfg, True, is_distributed=is_distributed)

    model = do_train(cfg=cfg,
                     model=model,
                     device=device,
                     train_loader=train_loader,
                     optimizer=optimizer,
                     lr_scheduler=lr_scheduler,
                     criterion=criterion,
                     checkpointer=checkpointer,
                     arguments=arguments,
                     logger=logger)
    return model
def __init__(self, *args, **kwargs):
    super(VariationalOptimization, self).__init__(*args, **kwargs)

    # Initialise mean and sd
    if self.cfg.MODEL.POLICY.NETWORK:
        # Set a feedforward network for means
        self.mean = FeedForward(self.state_dim,
                                self.cfg.MODEL.POLICY.LAYERS,
                                self.action_dim)
    else:
        # Set tensors for means
        self.mean = Parameter(torch.from_numpy(
            self.initialise_mean(self.cfg.MODEL.POLICY.INITIAL_ACTION_MEAN,
                                 self.cfg.MODEL.POLICY.INITIAL_ACTION_SD)))
        self.register_parameter("mean", self.mean)

    # Set tensors for standard deviations
    self.sd = Parameter(torch.from_numpy(self.initialise_sd(self.cfg.MODEL.POLICY.INITIAL_SD)))
    self.initial_clamped_sd = self.clamp_sd(self.sd.detach())
    self.register_parameter("sd", self.sd)

    self.clamped_action = np.zeros((self.action_dim, self.horizon, self.batch_size), dtype=np.float64)

    # Initialise optimizer
    if self.method == "H":
        # Separate mean and sd optimizers (not sure if actually necessary)
        self.optimizer = {"mean": build_optimizer(self.cfg, self.get_named_parameters("mean")),
                          "sd": build_optimizer(self.cfg, self.get_named_parameters("sd"))}
        self.best_actions = np.empty(self.sd.shape)
        self.best_actions.fill(np.nan)
    else:
        self.optimizer = build_optimizer(self.cfg, self.named_parameters())

    # We need log probabilities for calculating the REINFORCE loss
    self.log_prob = torch.empty(self.batch_size, self.horizon, dtype=torch.float64)
def train(setting_dict):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("SSD.trainer")
    logger.setLevel(logging.INFO)
    logger.info("start training....")

    model = SSDdetector(setting_dict=setting_dict["model"])

    # To fine-tune a pretrained model, point "predtrained_model" in the
    # settings to your checkpoint and enable "fine_tune".
    if setting_dict["fine_tune"]:
        checkpoint = torch.load(setting_dict["predtrained_model"], map_location=torch.device("cpu"))
        state_dict = checkpoint.pop("model")
        backbone_dict = {}
        for key, value in state_dict.items():
            if "backbone" in key:
                backbone_dict[key.replace("backbone.", "")] = value
        model.backbone.load_state_dict(backbone_dict)
        model.load_state_dict(state_dict)
        # Freeze the backbone so only the heads are trained
        for para in model.backbone.parameters():
            para.requires_grad = False

    device = torch.device(setting_dict["device"])
    model.to(device)

    lr = setting_dict["solver"]["LR"]
    # When fine-tuning, pass model.boxhead instead of model so that only the
    # head parameters are given to the optimizer.
    optimizer = build_optimizer(setting_dict["solver"]["optimizer"], model, lr)
    scheduler = build_LRscheduler(setting_dict["solver"]["LRscheduler"])(optimizer, setting_dict["solver"]["LR_STEP"])

    train_loader = make_dataLoader(setting_dict["train"], True)
    test_loader = make_dataLoader(setting_dict["test"], False)
    checkpointer = CheckPoint(model, optimizer, scheduler, "", logger)

    print(setting_dict["train_epoch"])
    for i in range(1, setting_dict["train_epoch"] + 1):
        do_train_one_epoch(model, train_loader, optimizer, scheduler, device, setting_dict["out_dir"], i)
        # evaluate every epoch
        do_evaluate(model, test_loader, device, setting_dict["out_dir"], i)
        if i % 7 == 0:
            checkpointer.save(setting_dict["out_dir"] + "/v3_model_{:06d}".format(i))

    checkpointer.save("final")
    return model
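# When the backbone is frozen as above, a common pattern is to hand the
# optimizer only the parameters that still require gradients. A small sketch
# under that assumption (trainable_parameters is a hypothetical helper, not
# part of this repo):
import torch


def trainable_parameters(model):
    # Skip frozen parameters (requires_grad=False), e.g. the frozen backbone.
    return (p for p in model.parameters() if p.requires_grad)


# usage with a plain SGD optimizer:
# optimizer = torch.optim.SGD(trainable_parameters(model), lr=1e-3, momentum=0.9)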
def train(cfg):
    # prepare dataset
    train_loader, val_loader, test_loader, classes_list = make_data_loader(cfg, for_train=True)

    # build model and load parameters
    model = build_model(cfg)
    if cfg.SOLVER.SCHEDULER.RETRAIN_FROM_HEAD:
        if cfg.TRAIN.TRICK.PRETRAINED:
            model.load_param("Base", cfg.TRAIN.TRICK.PRETRAIN_PATH)
    else:
        if cfg.TRAIN.TRICK.PRETRAINED:
            model.load_param("Overall", cfg.TRAIN.TRICK.PRETRAIN_PATH)
    train_loader.dataset.batch_converter = model.backbone_batch_converter
    val_loader.dataset.batch_converter = model.backbone_batch_converter
    test_loader.dataset.batch_converter = model.backbone_batch_converter

    # build loss function
    loss_func, loss_class = build_loss(cfg)
    print('Train with losses:', cfg.LOSS.TYPE)

    # build optimizer (based on model)
    # note: the loss may also hold learnable parameters
    optimizer = build_optimizer(cfg, model, bias_free=cfg.MODEL.BIAS_FREE)
    print("Model Bias-Free: {}".format(cfg.MODEL.BIAS_FREE))
    print('Train with optimizer:', cfg.SOLVER.OPTIMIZER.NAME)

    # build scheduler (based on optimizer)
    scheduler, start_epoch = build_scheduler(cfg, optimizer)

    # build and launch engine for training
    do_train(
        cfg,
        model,
        train_loader,
        val_loader,
        classes_list,
        optimizer,
        scheduler,
        loss_func,
        start_epoch,
    )
def train(config, experiment_name=None):
    num_classes = config.MODEL.NUM_CLASSES

    # dataloaders for training and validation
    train_period = 'train'
    train_loader = build_dataloader(cfg=config, period=train_period, loader_type='train')
    val_loader = build_dataloader(cfg=config, period=train_period, loader_type='val')

    # prepare model
    model = build_model(cfg=config)

    print('The loss type is', config.MODEL.LOSS_TYPE)
    loss_func = build_loss(config, num_classes)
    optimizer = build_optimizer(config, model)

    # Optionally resume from a self-trained checkpoint
    if config.MODEL.PRETRAIN_CHOICE == 'self':
        # the epoch number is the last '_'-separated token of the file name
        start_epoch = int(config.MODEL.PRETRAIN_PATH.split('/')[-1].split('.')[0].split('_')[-1])
        print('Start epoch:', start_epoch)
        path_to_optimizer = config.MODEL.PRETRAIN_PATH.replace('model', 'optimizer')
        print('Path to the checkpoint of optimizer:', path_to_optimizer)
        model.load_state_dict(torch.load(config.MODEL.PRETRAIN_PATH))
        optimizer.load_state_dict(torch.load(path_to_optimizer))

    scheduler = WarmUpMultiStepLR(optimizer, config.SOLVER.STEPS,
                                  config.SOLVER.GAMMA,
                                  config.SOLVER.WARMUP_FACTOR,
                                  config.SOLVER.WARMUP_ITERS,
                                  config.SOLVER.WARMUP_METHOD)

    print('------------------ Start Training -------------------')
    do_train(config, model, train_loader, val_loader, optimizer, scheduler,
             loss_func, experiment_name)
    print('---------------- Training Completed ----------------')
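# WarmUpMultiStepLR above is project code. The same warmup-then-multistep
# behaviour can be approximated with torch's LambdaLR; the sketch below is an
# approximation of that class, not its actual implementation, and the example
# milestones/factors in the usage comment are made up.
import bisect
import torch


def warmup_multistep_lambda(steps, gamma, warmup_factor, warmup_iters):
    def fn(it):
        # Linear warmup from warmup_factor to 1 over warmup_iters iterations,
        # then multiply by gamma at every milestone in `steps`.
        scale = 1.0
        if it < warmup_iters:
            alpha = it / max(1, warmup_iters)
            scale = warmup_factor * (1 - alpha) + alpha
        return scale * gamma ** bisect.bisect_right(steps, it)
    return fn


# usage:
# scheduler = torch.optim.lr_scheduler.LambdaLR(
#     optimizer, lr_lambda=warmup_multistep_lambda([30, 55], 0.1, 1.0 / 3, 500))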
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    # rescale base lr with the linear scaling rule
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(1, args.batch_size_total // 256))

    # set random seed, so that all randomly generated subgraphs are the same
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')
    logger.info(f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
                f"num_nodes {args.num_nodes}, gpus per node {ngpus_per_node}, "
                f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  # local rank
    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max, args.iw_clip).cuda(args.gpu)
    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    # load dataset; train_sampler is a distributed sampler
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, local rank {args.gpu}, '
                f'global rank {args.rank}, world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args,
                                 soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
def train(cfg_path, device='cuda'):
    if cfg_path is not None:
        cfg.merge_from_file(cfg_path)
    cfg.freeze()

    if not os.path.isdir(cfg.LOG_DIR):
        os.makedirs(cfg.LOG_DIR)
    if not os.path.isdir(cfg.SAVE_DIR):
        os.makedirs(cfg.SAVE_DIR)

    model = UNet(cfg.NUM_CHANNELS, cfg.NUM_CLASSES)
    model.to(device)

    train_data_loader = build_data_loader(cfg, 'train')
    if cfg.VAL:
        val_data_loader = build_data_loader(cfg, 'val')
    else:
        val_data_loader = None

    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)
    criterion = get_loss_func(cfg)

    writer = SummaryWriter(cfg.LOG_DIR)
    iter_counter = 0
    loss_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    min_val_loss = 1e10

    print('Training Start')
    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        print('Epoch {}/{}'.format(epoch + 1, cfg.SOLVER.MAX_EPOCH))
        if lr_scheduler is not None:
            lr_scheduler.step(epoch)

        for data in train_data_loader:
            iter_counter += 1
            imgs, annots = data
            imgs = imgs.to(device)
            annots = annots.to(device)

            y = model(imgs)
            optimizer.zero_grad()
            loss = criterion(y, annots)
            loss.backward()
            optimizer.step()

            loss_meter.update(loss.item())
            if iter_counter % 10 == 0:
                writer.add_scalars('loss', {'train': loss_meter.avg}, iter_counter)
                loss_meter.reset()
                if lr_scheduler is not None:
                    writer.add_scalar('learning rate', optimizer.param_groups[0]['lr'], iter_counter)

        save_as_checkpoint(model, optimizer,
                           os.path.join(cfg.SAVE_DIR, 'checkpoint.pth'),
                           epoch, iter_counter)

        # Skip validation when cfg.VAL is False
        if val_data_loader is None:
            continue

        # Average the validation loss over the whole validation set
        val_loss_meter.reset()
        with torch.no_grad():
            for data in val_data_loader:
                imgs, annots = data
                imgs = imgs.to(device)
                annots = annots.to(device)

                y = model(imgs)
                loss = criterion(y, annots)
                val_loss_meter.update(loss.item())

        writer.add_scalars('loss', {'val': val_loss_meter.avg}, iter_counter)
        # save model if validation loss is the minimum so far
        if val_loss_meter.avg < min_val_loss:
            min_val_loss = val_loss_meter.avg
            torch.save(model.state_dict(), os.path.join(cfg.SAVE_DIR, 'min_val_loss.pth'))
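# AverageMeter is used above (and in the next snippet) but never defined in
# this file. The usual minimal implementation, as popularised by the PyTorch
# ImageNet example, matches the .val / .avg / reset() / update() usage here:
class AverageMeter:
    """Tracks the latest value, running sum, count and average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count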
def train(is_dist, start_epoch, local_rank):
    transforms = transform.build_transforms()
    coco_dataset = dataset.COCODataset(is_train=True, transforms=transforms)
    if is_dist:
        sampler = distributedGroupSampler(coco_dataset)
    else:
        sampler = groupSampler(coco_dataset)
    dataloader = build_dataloader(coco_dataset, sampler)

    batch_time_meter = utils.AverageMeter()
    cls_loss_meter = utils.AverageMeter()
    reg_loss_meter = utils.AverageMeter()
    losses_meter = utils.AverageMeter()

    model = retinanet(is_train=True)
    if start_epoch == 1:
        model.resnet.load_pretrained(pretrained_path[cfg.resnet_depth])
    else:
        utils.load_model(model, start_epoch - 1)
    model = model.cuda()

    if is_dist:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            broadcast_buffers=False)

    optimizer = solver.build_optimizer(model)
    scheduler = solver.scheduler(optimizer)

    model.train()
    logs = []

    for epoch in range(start_epoch, cfg.max_epochs + 1):
        if is_dist:
            dataloader.sampler.set_epoch(epoch - 1)
        scheduler.lr_decay(epoch)

        end_time = time.time()
        for iteration, datas in enumerate(dataloader, 1):
            scheduler.linear_warmup(epoch, iteration - 1)

            images = datas["images"]
            bboxes = datas["bboxes"]
            labels = datas["labels"]
            res_img_shape = datas["res_img_shape"]
            pad_img_shape = datas["pad_img_shape"]

            images = images.cuda()
            bboxes = [bbox.cuda() for bbox in bboxes]
            labels = [label.cuda() for label in labels]

            loss_dict = model(images,
                              gt_bboxes=bboxes,
                              gt_labels=labels,
                              res_img_shape=res_img_shape,
                              pad_img_shape=pad_img_shape)
            cls_loss = loss_dict["cls_loss"]
            reg_loss = loss_dict["reg_loss"]
            losses = cls_loss + reg_loss

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            batch_time_meter.update(time.time() - end_time)
            end_time = time.time()
            cls_loss_meter.update(cls_loss.item())
            reg_loss_meter.update(reg_loss.item())
            losses_meter.update(losses.item())

            if iteration % 50 == 0:
                if local_rank == 0:
                    res = "\t".join([
                        "Epoch: [%d/%d]" % (epoch, cfg.max_epochs),
                        "Iter: [%d/%d]" % (iteration, len(dataloader)),
                        "Time: %.3f (%.3f)" % (batch_time_meter.val, batch_time_meter.avg),
                        "cls_loss: %.4f (%.4f)" % (cls_loss_meter.val, cls_loss_meter.avg),
                        "reg_loss: %.4f (%.4f)" % (reg_loss_meter.val, reg_loss_meter.avg),
                        "Loss: %.4f (%.4f)" % (losses_meter.val, losses_meter.avg),
                        "lr: %.6f" % (optimizer.param_groups[0]["lr"]),
                    ])
                    print(res)
                    logs.append(res)
                batch_time_meter.reset()
                cls_loss_meter.reset()
                reg_loss_meter.reset()
                losses_meter.reset()

        if local_rank == 0:
            utils.save_model(model, epoch)
        if is_dist:
            utils.synchronize()

    if local_rank == 0:
        with open("logs.txt", "w") as f:
            for i in logs:
                f.write(i + "\n")
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Training GlamPoints detector')
    parser.add_argument('--path_ymlfile',
                        type=str,
                        default='configs/glampoints_training.yml',
                        help='Path to yaml file.')
    opt = parser.parse_args()

    with open(opt.path_ymlfile, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.SafeLoader)

    _device = settings.initialize_cuda_and_logging(cfg)

    train_loader, val_loader = make_data_loader(cfg)
    model = build_model(cfg)
    model.to(_device)

    optimizer = build_optimizer(cfg, model)
    loss_func = build_loss(cfg)
    logger, tb_logger = build_logger(cfg)

    do_train(cfg, model, train_loader, val_loader, optimizer, loss_func,
             logger, tb_logger, _device)