def __init__(self, args, path):
    super(Solver, self).__init__()
    self.args = args
    self.path = path
    self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.to_pil = transforms.ToPILImage()

    self.exp_name = args["exp_name"]
    if "@" in self.exp_name:
        network_realname = self.exp_name.split("@")[0]
    else:
        network_realname = self.exp_name
    self.exp_name = self.exp_name.replace("@", "_")

    self.tr_data_path = self.args["rgb_data"]["tr_data_path"]
    self.te_data_path = self.args["rgb_data"]["te_data_path"]
    self.te_data_list = self.args["rgb_data"]["te_data_list"]
    self.save_path = self.path["save"]
    self.save_pre = self.args["save_pre"]
    if self.args["tb_update"] > 0:
        self.tb = SummaryWriter(self.path["tb"])

    # Attributes that depend on the ones assigned above
    self.pth_path = self.path["final_state_net"]
    self.tr_loader = create_loader(
        data_path=self.tr_data_path,
        mode="train",
        get_length=False,
        size_list=self.args["size_list"],
    )
    self.te_loader, self.te_length = create_loader(
        data_path=self.te_data_path, mode="test", get_length=True
    )

    if hasattr(network_lib, network_realname):
        self.net = getattr(network_lib, network_realname)().to(self.dev)
    else:
        raise AttributeError(f"network_lib has no model named {network_realname}")
    pprint(self.args)

    if self.args["resume"]:
        self.resume_checkpoint(load_path=self.path["final_full_net"], mode="all")
    else:
        self.start_epoch = 0
    self.end_epoch = self.args["epoch_num"]
    self.only_test = self.start_epoch == self.end_epoch

    if not self.only_test:
        self.iter_num = self.end_epoch * len(self.tr_loader)
        self.opti = self.make_optim()
        self.sche = self.make_scheduler()

        # Loss functions
        self.loss_funcs = [BCELoss(reduction=self.args["reduction"]).to(self.dev)]
        if self.args["use_aux_loss"]:
            self.loss_funcs.append(CEL().to(self.dev))
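# A short hedged sketch of the "@" naming convention parsed above: the part
# before "@" picks the class inside network_lib, and the full string (with "@"
# replaced by "_") becomes the experiment name. "MINet" below is a hypothetical
# model name used only for illustration:
def split_exp_name(exp_name):
    network_realname = exp_name.split("@")[0] if "@" in exp_name else exp_name
    return network_realname, exp_name.replace("@", "_")

assert split_exp_name("MINet@lr0.05") == ("MINet", "MINet_lr0.05")
assert split_exp_name("MINet") == ("MINet", "MINet")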
def __init__(self, args, path):
    super(Solver, self).__init__()
    self.args = args
    self.path = path
    self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.to_pil = transforms.ToPILImage()

    self.model_name = args[args["NET"]]["exp_name"]
    self.tr_data_path = self.args["rgb_data"]["tr_data_path"]
    self.te_data_path = self.args["rgb_data"]["te_data_path"]
    self.te_data_list = self.args["rgb_data"]["te_data_list"]
    self.save_path = self.path["save"]
    self.save_pre = self.args["save_pre"]
    if self.args["tb_update"] > 0:
        self.tb = SummaryWriter(self.path["tb"])

    # Attributes that depend on the ones assigned above
    self.pth_path = self.path["final_state_net"]
    self.tr_loader = create_loader(
        data_path=self.tr_data_path, mode="train", get_length=False
    )
    self.te_loader, self.te_length = create_loader(
        data_path=self.te_data_path, mode="test", get_length=True
    )

    self.net = self.args[self.args["NET"]]["net"]().to(self.dev)
    pprint(self.args)

    if self.args["resume"]:
        self.resume_checkpoint(load_path=self.path["final_full_net"], mode="all")
    else:
        self.start_epoch = 0
    self.end_epoch = self.args["epoch_num"]
    self.only_test = self.start_epoch == self.end_epoch

    if not self.only_test:
        self.iter_num = self.end_epoch * len(self.tr_loader)
        self.opti = self.make_optim()
        self.sche = self.make_scheduler()

        # Loss functions
        self.loss_funcs = [BCELoss(reduction=self.args["reduction"]).to(self.dev)]
        if self.args["use_aux_loss"]:
            self.loss_funcs.append(CEL().to(self.dev))
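# The line `self.net = self.args[self.args["NET"]]["net"]().to(self.dev)` above
# implies a nested config; a minimal hedged sketch of that structure (all names
# here are hypothetical, not the project's actual config):
import torch

class ToyNet(torch.nn.Module):
    def forward(self, x):
        return x

args = {
    "NET": "toynet",
    "toynet": {"net": ToyNet, "exp_name": "toynet_baseline"},
}
net_cls = args[args["NET"]]["net"]  # the class itself, not an instance
net = net_cls()                     # instantiated with () before .to(device)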
def __init__(self, exp_name: str, arg_dict: dict, path_dict: dict):
    super(Solver, self).__init__()
    self.exp_name = exp_name
    self.arg_dict = arg_dict
    self.path_dict = path_dict
    self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.to_pil = transforms.ToPILImage()

    self.tr_data_path = self.arg_dict["rgb_data"]["tr_data_path"]
    self.te_data_list = self.arg_dict["rgb_data"]["te_data_list"]
    self.save_path = self.path_dict["save"]
    self.save_pre = self.arg_dict["save_pre"]
    if self.arg_dict["tb_update"] > 0:
        self.tb_recorder = TBRecorder(tb_path=self.path_dict["tb"])
    if self.arg_dict["xlsx_name"]:
        self.xlsx_recorder = XLSXRecoder(xlsx_path=self.path_dict["xlsx"])

    # Attributes that depend on the ones assigned above
    self.tr_loader = create_loader(
        data_path=self.tr_data_path,
        training=True,
        size_list=self.arg_dict["size_list"],
        prefix=self.arg_dict["prefix"],
        get_length=False,
    )
    self.end_epoch = self.arg_dict["epoch_num"]
    self.iter_num = self.end_epoch * len(self.tr_loader)

    if hasattr(network_lib, self.arg_dict["model"]):
        self.net = getattr(network_lib, self.arg_dict["model"])().to(self.dev)
    else:
        raise AttributeError(f"network_lib has no model named {self.arg_dict['model']}")
    pprint(self.arg_dict)

    if self.arg_dict["resume_mode"] == "test":
        # Resume the model only to test it; self.start_epoch is useless here.
        resume_checkpoint(
            model=self.net,
            load_path=self.path_dict["final_state_net"],
            mode="onlynet",
        )
        return

    self.loss_funcs = [
        torch.nn.BCEWithLogitsLoss(reduction=self.arg_dict["reduction"]).to(self.dev)
    ]
    if self.arg_dict["use_aux_loss"]:
        self.loss_funcs.append(CEL().to(self.dev))

    self.opti = make_optimizer(
        model=self.net,
        optimizer_type=self.arg_dict["optim"],
        optimizer_info=dict(
            lr=self.arg_dict["lr"],
            momentum=self.arg_dict["momentum"],
            weight_decay=self.arg_dict["weight_decay"],
            nesterov=self.arg_dict["nesterov"],
        ),
    )
    self.sche = make_scheduler(
        optimizer=self.opti,
        total_num=self.iter_num if self.arg_dict["sche_usebatch"] else self.end_epoch,
        scheduler_type=self.arg_dict["lr_type"],
        scheduler_info=dict(
            lr_decay=self.arg_dict["lr_decay"],
            warmup_epoch=self.arg_dict["warmup_epoch"],
        ),
    )

    # AMP
    if self.arg_dict["use_amp"]:
        construct_print("Now, we will use the amp to accelerate training!")
        from apex import amp

        self.amp = amp
        self.net, self.opti = self.amp.initialize(self.net, self.opti, opt_level="O1")
    else:
        self.amp = None

    if self.arg_dict["resume_mode"] == "train":
        # Resume the model to continue training.
        self.start_epoch = resume_checkpoint(
            model=self.net,
            optimizer=self.opti,
            scheduler=self.sche,
            amp=self.amp,
            exp_name=self.exp_name,
            load_path=self.path_dict["final_full_net"],
            mode="all",
        )
    else:
        # Train a new model from scratch.
        self.start_epoch = 0
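# make_scheduler is project code not shown here; as a hedged sketch, a "poly"
# schedule with linear warmup that matches the lr_decay/warmup_epoch parameters
# above could look like this (the project's exact formula may differ):
def poly_warmup_coef(curr_step, total_steps, lr_decay=0.9, warmup_steps=0):
    # Linear warmup for the first warmup_steps, then polynomial decay toward 0.
    if warmup_steps > 0 and curr_step < warmup_steps:
        return curr_step / warmup_steps
    return (1 - (curr_step - warmup_steps) / (total_steps - warmup_steps)) ** lr_decay

# Plugged into a stock PyTorch scheduler (hypothetical wiring):
# self.sche = torch.optim.lr_scheduler.LambdaLR(
#     self.opti, lambda i: poly_warmup_coef(i, self.iter_num))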
def main_worker(local_rank, ngpus_per_node, world_size):
    global total_iter_num, batch_size_single_gpu

    if local_rank == 0:
        construct_print(user_config)
        construct_print(f"Project Root: {user_config['proj_root']}")
        construct_print(f"Training on: {user_config['rgb_data']['tr_data_path']}")

    # https://github.com/tczhangzhi/pytorch-distributed/issues/4
    init_seed(seed=0)
    init_cudnn(benchmark=(user_config["size_list"] is None))
    if user_config["is_distributed"]:
        init_process(ip=args.ip, port=args.port, rank=local_rank, world_size=world_size)

    torch.cuda.set_device(local_rank)
    batch_size_single_gpu = user_config["batch_size"] // ngpus_per_node
    train_set = ImageFolder(
        root=user_config["rgb_data"]["tr_data_path"],
        in_size=user_config["input_size"],
        training=True,
    )
    train_sampler = (
        data_dist.DistributedSampler(train_set)
        if user_config["is_distributed"] else None
    )
    tr_loader = create_loader(
        data_set=train_set,
        size_list=user_config["size_list"],
        batch_size=batch_size_single_gpu,
        shuffle=(train_sampler is None),
        num_workers=user_config["num_workers"],
        sampler=train_sampler,
        drop_last=True,
        pin_memory=True,
    )
    total_iter_num = user_config["epoch_num"] * len(tr_loader)

    model = getattr(network_lib, user_config["model"])().cuda(local_rank)

    # Standalone testing branch
    if user_config["resume_mode"] == "test":
        if local_rank == 0:
            # Resume the model only to test it; start_epoch is useless here.
            resume_checkpoint(
                model=model,
                load_path=path_config["final_full_net"],
                mode="onlynet",
            )
            test(model)
            construct_print("GPU:0 end testing...")
        else:
            construct_print(f"GPU:{local_rank} no testing...")
        return

    optimizer = make_optimizer(
        model=model,
        optimizer_type=user_config["optim"],
        optimizer_info=dict(
            lr=user_config["lr"],
            momentum=user_config["momentum"],
            weight_decay=user_config["weight_decay"],
            nesterov=user_config["nesterov"],
        ),
    )
    scheduler = CustomScheduler(
        optimizer=optimizer,
        total_num=total_iter_num if user_config["sche_usebatch"] else user_config["epoch_num"],
        scheduler_type=user_config["lr_type"],
        scheduler_info=dict(
            lr_decay=user_config["lr_decay"],
            warmup_epoch=user_config["warmup_epoch"],
        ),
    )
    if local_rank == 0:
        construct_print(f"optimizer = {optimizer}")
        construct_print(f"scheduler = {scheduler}")

    if user_config["is_distributed"]:
        model = convert_syncbn_model(model)
    if user_config["use_amp"]:
        assert cudnn.enabled, "Amp requires cudnn backend to be enabled."
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if user_config["is_distributed"]:
        model = DDP(model, delay_allreduce=True)

    # Training branch
    if user_config["resume_mode"] == "train" and local_rank == 0:
        # Resume the model to continue training.
        start_epoch = resume_checkpoint(
            model=model,
            optimizer=optimizer,
            amp=amp if user_config["use_amp"] else None,
            exp_name=exp_name,
            load_path=path_config["final_full_net"],
            mode="all",
            local_rank=local_rank,
        )
    else:
        # Train a new model from scratch.
        start_epoch = 0

    loss_funcs = [
        BCEWithLogitsLoss(reduction=user_config["reduction"]).cuda(local_rank)
    ]
    if user_config["use_aux_loss"]:
        from loss.CEL import CEL

        loss_funcs.append(CEL().cuda(local_rank))

    train(
        model=model,
        start_epoch=start_epoch,
        end_epoch=user_config["epoch_num"],
        tr_loader=tr_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        loss_funcs=loss_funcs,
        train_sampler=train_sampler,
        local_rank=local_rank,
    )
    construct_print("End Training...")
    if user_config["is_distributed"]:
        destroy_process()
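# main_worker is shaped for torch.multiprocessing.spawn, which calls the target
# as fn(local_rank, *args). A minimal hedged launch sketch (single node assumed;
# user_config comes from the surrounding module):
import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    ngpus_per_node = torch.cuda.device_count()
    world_size = ngpus_per_node  # one process per GPU on a single node
    if user_config["is_distributed"]:
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, world_size))
    else:
        main_worker(local_rank=0, ngpus_per_node=1, world_size=1)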
def __init__(self, args, path):
    super(Solver, self).__init__()
    self.args = args
    self.path = path
    self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.to_pil = transforms.ToPILImage()

    self.exp_name = args["exp_name"]
    if "@" in self.exp_name:
        network_realname = self.exp_name.split("@")[0]
    else:
        network_realname = self.exp_name
    self.exp_name = self.exp_name.replace("@", "_")

    self.tr_data_path = self.args["rgb_data"]["tr_data_path"]
    self.te_data_list = self.args["rgb_data"]["te_data_list"]
    self.save_path = self.path["save"]
    self.save_pre = self.args["save_pre"]
    if self.args["tb_update"] > 0:
        self.tb = SummaryWriter(self.path["tb"])

    # Attributes that depend on the ones assigned above
    self.pth_path = self.path["final_state_net"]
    self.tr_loader = create_loader(
        data_path=self.tr_data_path,
        mode="train",
        get_length=False,
        prefix=self.args["prefix"],
        size_list=self.args["size_list"],
    )
    self.end_epoch = self.args["epoch_num"]
    self.iter_num = self.end_epoch * len(self.tr_loader)

    if hasattr(network_lib, network_realname):
        self.net = getattr(network_lib, network_realname)().to(self.dev)
    else:
        raise AttributeError(f"network_lib has no model named {network_realname}")
    pprint(self.args)

    self.opti = make_optimizer(
        model=self.net,
        optimizer_type=self.args["optim"],
        optimizer_info=dict(
            lr=self.args["lr"],
            momentum=self.args["momentum"],
            weight_decay=self.args["weight_decay"],
            nesterov=self.args["nesterov"],
        ),
    )
    self.sche = make_scheduler(
        optimizer=self.opti,
        total_num=self.iter_num if self.args["sche_usebatch"] else self.end_epoch,
        scheduler_type=self.args["lr_type"],
        scheduler_info=dict(
            lr_decay=self.args["lr_decay"],
            warmup_epoch=self.args["warmup_epoch"],
        ),
    )

    if self.args["resume_mode"] == "train":
        # Resume the model to continue training.
        self.start_epoch = resume_checkpoint(
            model=self.net,
            optimizer=self.opti,
            scheduler=self.sche,
            exp_name=self.exp_name,
            load_path=self.path["final_full_net"],
            mode="all",
        )
        self.only_test = False
    elif self.args["resume_mode"] == "test":
        # Resume the model only to test it; self.start_epoch is useless here.
        resume_checkpoint(
            model=self.net,
            load_path=self.pth_path,
            mode="onlynet",
        )
        self.only_test = True
    elif not self.args["resume_mode"]:
        # Train a new model from scratch.
        self.start_epoch = 0
        self.only_test = False
    else:
        raise NotImplementedError(f"resume_mode={self.args['resume_mode']} is not supported")

    if not self.only_test:
        # Loss functions
        self.loss_funcs = [BCELoss(reduction=self.args["reduction"]).to(self.dev)]
        if self.args["use_aux_loss"]:
            self.loss_funcs.append(CEL().to(self.dev))
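# resume_checkpoint is project code not shown here. A hedged sketch of the two
# modes used above ("all" restores the full training state and returns the next
# epoch, "onlynet" restores weights only); the checkpoint key names are
# assumptions, not the project's actual format:
import torch

def resume_checkpoint_sketch(model, load_path, mode="all", optimizer=None, scheduler=None):
    checkpoint = torch.load(load_path, map_location="cpu")
    if mode == "onlynet":
        # final_state_net is assumed to hold a bare state_dict.
        model.load_state_dict(checkpoint)
        return None
    model.load_state_dict(checkpoint["net_state"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["opti_state"])
    if scheduler is not None:
        scheduler.load_state_dict(checkpoint["sche_state"])
    return checkpoint["epoch"]  # used as start_epoch for the resumed run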