Example #1
    def __init__(self, args, path):
        super(Solver, self).__init__()
        self.args = args
        self.path = path
        self.dev = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.to_pil = transforms.ToPILImage()
        self.exp_name = args["exp_name"]
        if "@" in self.exp_name:
            network_realname = self.exp_name.split("@")[0]
        else:
            network_realname = self.exp_name
        self.exp_name = self.exp_name.replace("@", "_")

        self.tr_data_path = self.args["rgb_data"]["tr_data_path"]
        self.te_data_path = self.args["rgb_data"]["te_data_path"]
        self.te_data_list = self.args["rgb_data"]["te_data_list"]

        self.save_path = self.path["save"]
        self.save_pre = self.args["save_pre"]
        if self.args["tb_update"] > 0:
            self.tb = SummaryWriter(self.path["tb"])

        # Attributes that depend on the attributes set above
        self.pth_path = self.path["final_state_net"]
        self.tr_loader = create_loader(
            data_path=self.tr_data_path,
            mode="train",
            get_length=False,
            size_list=self.args["size_list"],
        )
        self.te_loader, self.te_length = create_loader(
            data_path=self.te_data_path, mode="test", get_length=True)

        if hasattr(network_lib, network_realname):
            self.net = getattr(network_lib, network_realname)().to(self.dev)
        else:
            raise AttributeError(
                f"network_lib has no network named {network_realname}")
        pprint(self.args)

        if self.args["resume"]:
            self.resume_checkpoint(load_path=self.path["final_full_net"],
                                   mode="all")
        else:
            self.start_epoch = 0
        self.end_epoch = self.args["epoch_num"]
        self.only_test = self.start_epoch == self.end_epoch

        if not self.only_test:
            self.iter_num = self.end_epoch * len(self.tr_loader)
            self.opti = self.make_optim()
            self.sche = self.make_scheduler()

            # Loss functions
            self.loss_funcs = [
                BCELoss(reduction=self.args["reduction"]).to(self.dev)
            ]
            if self.args["use_aux_loss"]:
                self.loss_funcs.append(CEL().to(self.dev))
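
The keys read in Example #1 imply roughly the following shape for the `args` and `path` dictionaries. This is an illustrative sketch; all values below are placeholders, not taken from the original project:

args = {
    "exp_name": "Net@v1",  # "@" splits off the network class name; it is replaced by "_" afterwards
    "rgb_data": {
        "tr_data_path": "path/to/train",
        "te_data_path": "path/to/test",
        "te_data_list": ["dataset_a", "dataset_b"],
    },
    "save_pre": True,   # save predictions to disk
    "tb_update": 10,    # TensorBoard logging interval; <= 0 disables the writer
    "size_list": None,  # multi-scale training sizes, if any
    "resume": False,
    "epoch_num": 40,
    "reduction": "mean",
    "use_aux_loss": True,
}
path = {
    "save": "output/results",
    "tb": "output/tb",
    "final_state_net": "output/state_final.pth",          # weights-only checkpoint
    "final_full_net": "output/checkpoint_final.pth.tar",  # full training state
}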
Example #2
    def __init__(self, args, path):
        super(Solver, self).__init__()
        self.args = args
        self.path = path
        self.dev = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.to_pil = transforms.ToPILImage()
        self.model_name = args[args['NET']]['exp_name']

        self.tr_data_path = self.args['rgb_data']['tr_data_path']
        self.te_data_path = self.args['rgb_data']['te_data_path']
        self.te_data_list = self.args['rgb_data']['te_data_list']

        self.save_path = self.path["save"]
        self.save_pre = self.args["save_pre"]
        if self.args["tb_update"] > 0:
            self.tb = SummaryWriter(self.path["tb"])

        # Attributes that depend on the attributes set above
        self.pth_path = self.path["final_state_net"]
        self.tr_loader = create_loader(data_path=self.tr_data_path,
                                       mode='train',
                                       get_length=False)
        self.te_loader, self.te_length = create_loader(
            data_path=self.te_data_path, mode='test', get_length=True)

        self.net = self.args[self.args["NET"]]["net"]().to(self.dev)
        pprint(self.args)

        if self.args['resume']:
            self.resume_checkpoint(load_path=self.path['final_full_net'],
                                   mode='all')
        else:
            self.start_epoch = 0
        self.end_epoch = self.args["epoch_num"]
        self.only_test = self.start_epoch == self.end_epoch

        if not self.only_test:
            self.iter_num = self.end_epoch * len(self.tr_loader)
            self.opti = self.make_optim()
            self.sche = self.make_scheduler()

            # Loss functions
            self.loss_funcs = [
                BCELoss(reduction=self.args['reduction']).to(self.dev)
            ]
            if self.args['use_aux_loss']:
                self.loss_funcs.append(CEL().to(self.dev))
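
Unlike Example #1, the network class here is stored directly in the config: `args["NET"]` names the active sub-config and `args[args["NET"]]["net"]` holds the class object itself. An illustrative (hypothetical) shape:

args = {
    "NET": "MINet",  # selects the active sub-config below
    "MINet": {
        "exp_name": "MINet_demo",
        "net": MINet,  # the class object itself, not a string
    },
    # ...remaining keys as in Example #1...
}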
Example #3
    def __init__(self, exp_name: str, arg_dict: dict, path_dict: dict):
        super(Solver, self).__init__()
        self.exp_name = exp_name
        self.arg_dict = arg_dict
        self.path_dict = path_dict

        self.dev = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.to_pil = transforms.ToPILImage()

        self.tr_data_path = self.arg_dict["rgb_data"]["tr_data_path"]
        self.te_data_list = self.arg_dict["rgb_data"]["te_data_list"]

        self.save_path = self.path_dict["save"]
        self.save_pre = self.arg_dict["save_pre"]

        if self.arg_dict["tb_update"] > 0:
            self.tb_recorder = TBRecorder(tb_path=self.path_dict["tb"])
        if self.arg_dict["xlsx_name"]:
            self.xlsx_recorder = XLSXRecoder(xlsx_path=self.path_dict["xlsx"])

        # Attributes that depend on the attributes set above
        self.tr_loader = create_loader(
            data_path=self.tr_data_path,
            training=True,
            size_list=self.arg_dict["size_list"],
            prefix=self.arg_dict["prefix"],
            get_length=False,
        )
        self.end_epoch = self.arg_dict["epoch_num"]
        self.iter_num = self.end_epoch * len(self.tr_loader)

        if hasattr(network_lib, self.arg_dict["model"]):
            self.net = getattr(network_lib,
                               self.arg_dict["model"])().to(self.dev)
        else:
            raise AttributeError(
                f"network_lib has no network named {self.arg_dict['model']}")
        pprint(self.arg_dict)

        if self.arg_dict["resume_mode"] == "test":
            # Load only the network weights for testing;
            # self.start_epoch is not needed in this mode.
            resume_checkpoint(
                model=self.net,
                load_path=self.path_dict["final_state_net"],
                mode="onlynet",
            )
            return

        self.loss_funcs = [
            torch.nn.BCEWithLogitsLoss(
                reduction=self.arg_dict["reduction"]).to(self.dev)
        ]
        if self.arg_dict["use_aux_loss"]:
            self.loss_funcs.append(CEL().to(self.dev))

        self.opti = make_optimizer(
            model=self.net,
            optimizer_type=self.arg_dict["optim"],
            optimizer_info=dict(
                lr=self.arg_dict["lr"],
                momentum=self.arg_dict["momentum"],
                weight_decay=self.arg_dict["weight_decay"],
                nesterov=self.arg_dict["nesterov"],
            ),
        )
        self.sche = make_scheduler(
            optimizer=self.opti,
            total_num=self.iter_num
            if self.arg_dict["sche_usebatch"] else self.end_epoch,
            scheduler_type=self.arg_dict["lr_type"],
            scheduler_info=dict(lr_decay=self.arg_dict["lr_decay"],
                                warmup_epoch=self.arg_dict["warmup_epoch"]),
        )

        # AMP
        if self.arg_dict["use_amp"]:
            construct_print("Now, we will use the amp to accelerate training!")
            from apex import amp

            self.amp = amp
            self.net, self.opti = self.amp.initialize(self.net,
                                                      self.opti,
                                                      opt_level="O1")
        else:
            self.amp = None

        if self.arg_dict["resume_mode"] == "train":
            # Resume the full training state to continue training.
            self.start_epoch = resume_checkpoint(
                model=self.net,
                optimizer=self.opti,
                scheduler=self.sche,
                amp=self.amp,
                exp_name=self.exp_name,
                load_path=self.path_dict["final_full_net"],
                mode="all",
            )
        else:
            # Otherwise, train a new model from scratch.
            self.start_epoch = 0
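
The `amp` module imported above comes from NVIDIA's apex package; apex amp has since been deprecated in favor of the built-in `torch.cuda.amp`. A minimal sketch of the equivalent built-in pattern, not part of the original code (`net`, `opti`, `loss_fn`, and `tr_loader` stand in for the attributes built above):

import torch

scaler = torch.cuda.amp.GradScaler()
for data, target in tr_loader:
    opti.zero_grad()
    # Run the forward pass in mixed precision.
    with torch.cuda.amp.autocast():
        loss = loss_fn(net(data), target)
    # Scale the loss to avoid gradient underflow in fp16.
    scaler.scale(loss).backward()
    scaler.step(opti)
    scaler.update()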
Example #4
def main_worker(local_rank, ngpus_per_node, world_size):
    global total_iter_num, batch_size_single_gpu

    if local_rank == 0:
        construct_print(user_config)
        construct_print(f"Project Root: {user_config['proj_root']}")
        construct_print(
            f"Training on: {user_config['rgb_data']['tr_data_path']}")

    # https://github.com/tczhangzhi/pytorch-distributed/issues/4
    init_seed(seed=0)
    init_cudnn(benchmark=(user_config["size_list"] is None))

    if user_config["is_distributed"]:
        init_process(ip=args.ip,
                     port=args.port,
                     rank=local_rank,
                     world_size=world_size)
    torch.cuda.set_device(local_rank)
    batch_size_single_gpu = user_config["batch_size"] // ngpus_per_node

    train_set = ImageFolder(
        root=user_config["rgb_data"]["tr_data_path"],
        in_size=user_config["input_size"],
        training=True,
    )
    train_sampler = (data_dist.DistributedSampler(train_set)
                     if user_config["is_distributed"] else None)
    tr_loader = create_loader(
        data_set=train_set,
        size_list=user_config["size_list"],
        batch_size=batch_size_single_gpu,
        shuffle=(train_sampler is None),
        num_workers=user_config["num_workers"],
        sampler=train_sampler,
        drop_last=True,
        pin_memory=True,
    )
    total_iter_num = user_config["epoch_num"] * len(tr_loader)

    model = getattr(network_lib, user_config["model"])().cuda(local_rank)

    # Test-only branch
    if user_config["resume_mode"] == "test":
        if local_rank == 0:
            # Load only the network weights for testing;
            # start_epoch is not needed in this mode.
            resume_checkpoint(
                model=model,
                load_path=path_config["final_full_net"],
                mode="onlynet",
            )
            test(model)
            construct_print("GPU:0 end testing...")
        else:
            construct_print("GPU:1 no testing...")
        return

    optimizer = make_optimizer(
        model=model,
        optimizer_type=user_config["optim"],
        optimizer_info=dict(
            lr=user_config["lr"],
            momentum=user_config["momentum"],
            weight_decay=user_config["weight_decay"],
            nesterov=user_config["nesterov"],
        ),
    )
    scheduler = CustomScheduler(
        optimizer=optimizer,
        total_num=total_iter_num
        if user_config["sche_usebatch"] else user_config["epoch_num"],
        scheduler_type=user_config["lr_type"],
        scheduler_info=dict(lr_decay=user_config["lr_decay"],
                            warmup_epoch=user_config["warmup_epoch"]),
    )
    if local_rank == 0:
        construct_print(f"optimizer = {optimizer}")
        construct_print(f"scheduler = {scheduler}")

    if user_config["is_distributed"]:
        model = convert_syncbn_model(model)
    if user_config["use_amp"]:
        assert cudnn.enabled, "Amp requires cudnn backend to be enabled."
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    if user_config["is_distributed"]:
        model = DDP(model, delay_allreduce=True)

    # Training branch
    if user_config["resume_mode"] == "train" and local_rank == 0:
        # Resume the full training state to continue training.
        start_epoch = resume_checkpoint(
            model=model,
            optimizer=optimizer,
            amp=amp if user_config["use_amp"] else None,
            exp_name=exp_name,
            load_path=path_config["final_full_net"],
            mode="all",
            local_rank=local_rank,
        )
    else:
        # Otherwise, train a new model from scratch.
        start_epoch = 0

    loss_funcs = [
        BCEWithLogitsLoss(reduction=user_config["reduction"]).cuda(local_rank)
    ]
    if user_config["use_aux_loss"]:
        from loss.CEL import CEL

        loss_funcs.append(CEL().cuda(local_rank))

    train(
        model=model,
        start_epoch=start_epoch,
        end_epoch=user_config["epoch_num"],
        tr_loader=tr_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        loss_funcs=loss_funcs,
        train_sampler=train_sampler,
        local_rank=local_rank,
    )
    construct_print("End Training...")

    if user_config["is_distributed"]:
        destroy_process()
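
A `main_worker` with this signature is typically launched via `torch.multiprocessing.spawn`, which supplies the process index (`local_rank` here) as the first positional argument. A minimal launch sketch; the single-node assumption is ours, not from the original code:

import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    ngpus_per_node = torch.cuda.device_count()
    world_size = ngpus_per_node  # assumes a single node
    # spawn() passes the process index as the first argument of main_worker.
    mp.spawn(main_worker,
             nprocs=ngpus_per_node,
             args=(ngpus_per_node, world_size))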
Example #5
    def __init__(self, args, path):
        super(Solver, self).__init__()
        self.args = args
        self.path = path
        self.dev = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.to_pil = transforms.ToPILImage()
        self.exp_name = args["exp_name"]
        if "@" in self.exp_name:
            network_realname = self.exp_name.split("@")[0]
        else:
            network_realname = self.exp_name
        self.exp_name = self.exp_name.replace("@", "_")

        self.tr_data_path = self.args["rgb_data"]["tr_data_path"]
        self.te_data_list = self.args["rgb_data"]["te_data_list"]

        self.save_path = self.path["save"]
        self.save_pre = self.args["save_pre"]
        if self.args["tb_update"] > 0:
            self.tb = SummaryWriter(self.path["tb"])

        # Attributes that depend on the attributes set above
        self.pth_path = self.path["final_state_net"]
        self.tr_loader = create_loader(
            data_path=self.tr_data_path,
            mode="train",
            get_length=False,
            prefix=self.args["prefix"],
            size_list=self.args["size_list"],
        )
        self.end_epoch = self.args["epoch_num"]
        self.iter_num = self.end_epoch * len(self.tr_loader)

        if hasattr(network_lib, network_realname):
            self.net = getattr(network_lib, network_realname)().to(self.dev)
        else:
            raise AttributeError(
                f"network_lib has no network named {network_realname}")
        pprint(self.args)

        self.opti = make_optimizer(
            model=self.net,
            optimizer_type=self.args["optim"],
            optimizer_info=dict(
                lr=self.args["lr"],
                momentum=self.args["momentum"],
                weight_decay=self.args["weight_decay"],
                nesterov=self.args["nesterov"],
            ),
        )
        self.sche = make_scheduler(
            optimizer=self.opti,
            total_num=self.iter_num
            if self.args["sche_usebatch"] else self.end_epoch,
            scheduler_type=self.args["lr_type"],
            scheduler_info=dict(lr_decay=self.args["lr_decay"],
                                warmup_epoch=self.args["warmup_epoch"]),
        )

        if self.args["resume_mode"] == "train":
            # Resume the full training state to continue training.
            self.start_epoch = resume_checkpoint(
                model=self.net,
                optimizer=self.opti,
                scheduler=self.sche,
                exp_name=self.exp_name,
                load_path=self.path["final_full_net"],
                mode="all",
            )
            self.only_test = False
        elif self.args["resume_mode"] == "test":
            # Load only the network weights for testing;
            # self.start_epoch is not needed in this mode.
            resume_checkpoint(
                model=self.net,
                load_path=self.pth_path,
                mode="onlynet",
            )
            self.only_test = True
        elif not self.args["resume_mode"]:
            # Train a new model from scratch.
            self.start_epoch = 0
            self.only_test = False
        else:
            raise NotImplementedError

        if not self.only_test:
            # Loss functions
            self.loss_funcs = [
                BCELoss(reduction=self.args["reduction"]).to(self.dev)
            ]
            if self.args["use_aux_loss"]:
                self.loss_funcs.append(CEL().to(self.dev))
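
Note that Examples #1, #2, and #5 use `BCELoss`, which expects probabilities that have already passed through a sigmoid, while Examples #3 and #4 use `BCEWithLogitsLoss`, which takes raw logits and applies the sigmoid internally in a more numerically stable way. A self-contained check of the equivalence:

import torch
from torch.nn import BCELoss, BCEWithLogitsLoss

logits = torch.randn(4, 1)
target = torch.randint(0, 2, (4, 1)).float()

# BCELoss on sigmoid(logits) matches BCEWithLogitsLoss on raw logits.
loss_bce = BCELoss()(torch.sigmoid(logits), target)
loss_bcewl = BCEWithLogitsLoss()(logits, target)
assert torch.allclose(loss_bce, loss_bcewl, atol=1e-6)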