def main(gpu, ngpus_per_node, args):
    # copy gpu into args
    args.gpu = gpu

    if args.distributed:
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    rank = args.rank

    # read the config file; cfg_file is the required name of the model to train
    cfg_file = os.path.join("./configs", args.cfg_file + ".json")
    with open(cfg_file, "r") as f:
        config = json.load(f)

    # add an entry to the "system" section of the per-model config loaded above
    config["system"]["snapshot_name"] = args.cfg_file
    # update the parameter configuration
    system_config = SystemConfig().update_config(config["system"])

    # import the module named after the model and build the model
    model_file = "core.models.{}".format(args.cfg_file)
    model_file = importlib.import_module(model_file)
    model = model_file.model()

    # parameters taken from the system config
    train_split = system_config.train_split
    val_split = system_config.val_split

    print("Process {}: loading all datasets...".format(rank))
    # number of workers to use
    dataset = system_config.dataset
    workers = args.workers
    print("Process {}: using {} workers".format(rank, workers))
    training_dbs = [
        datasets[dataset](config["db"], split=train_split, sys_config=system_config)
        for _ in range(workers)
    ]
    validation_db = datasets[dataset](config["db"], split=val_split, sys_config=system_config)

    if rank == 0:
        print("system config...")
        pprint.pprint(system_config.full)

        print("db config...")
        pprint.pprint(training_dbs[0].configs)

        print("len of db: {}".format(len(training_dbs[0].db_inds)))
        print("distributed: {}".format(args.distributed))

    # call the train function
    train(training_dbs, validation_db, system_config, model, args)
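A typical launcher for this distributed main spawns one process per local GPU with torch.multiprocessing. This is a minimal sketch, not part of the original source: parse_args is a hypothetical argument parser, and args is assumed to already carry rank, world_size, dist_url, dist_backend, and distributed, as the function above expects.

import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    args = parse_args()  # hypothetical; must set rank, world_size, dist_url, etc.
    ngpus_per_node = torch.cuda.device_count()
    if args.distributed:
        # one training process per local GPU; each derives its global rank
        # inside main() as args.rank * ngpus_per_node + gpu
        mp.spawn(main, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        main(None, ngpus_per_node, args)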
def Setup(self):
    distributed = self.system_dict["model"]["params"]["distributed"]
    world_size = self.system_dict["model"]["params"]["world_size"]
    ngpus_per_node = torch.cuda.device_count()

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cfg_file = os.path.join(current_dir, "configs",
                            self.system_dict["model"]["params"]["cfg_file"] + ".json")
    with open(cfg_file, "r") as f:
        self.system_dict["local"]["config"] = json.load(f)

    # point the db config at the training dataset
    self.system_dict["local"]["config"]["db"]["root_dir"] = self.system_dict["dataset"]["train"]["root_dir"]
    self.system_dict["local"]["config"]["db"]["coco_dir"] = self.system_dict["dataset"]["train"]["coco_dir"]
    self.system_dict["local"]["config"]["db"]["img_dir"] = self.system_dict["dataset"]["train"]["img_dir"]
    self.system_dict["local"]["config"]["db"]["set_dir"] = self.system_dict["dataset"]["train"]["set_dir"]

    # derive the number of categories from classes.txt (one class per line)
    with open(self.system_dict["dataset"]["train"]["root_dir"] + "/" +
              self.system_dict["dataset"]["train"]["coco_dir"] + "/annotations/classes.txt") as f:
        lines = f.readlines()
    self.system_dict["local"]["config"]["db"]["categories"] = len(lines)

    # propagate training hyperparameters into the system config
    self.system_dict["local"]["config"]["system"]["batch_size"] = self.system_dict["dataset"]["params"]["batch_size"]
    self.system_dict["local"]["config"]["system"]["chunk_sizes"] = [self.system_dict["dataset"]["params"]["batch_size"]]
    self.system_dict["local"]["config"]["system"]["max_iter"] = self.system_dict["training"]["params"]["total_iterations"]
    self.system_dict["local"]["config"]["system"]["snapshot_name"] = self.system_dict["model"]["params"]["cfg_file"]

    self.system_dict["local"]["system_config"] = SystemConfig().update_config(
        self.system_dict["local"]["config"]["system"])

    self.system_dict["local"]["training_dbs"] = [
        datasets[self.system_dict["local"]["system_config"].dataset](
            self.system_dict["local"]["config"]["db"],
            sys_config=self.system_dict["local"]["system_config"])
        for _ in range(self.system_dict["dataset"]["params"]["workers"])
    ]

    if self.system_dict["dataset"]["val"]["status"]:
        # repoint the db config at the validation dataset
        self.system_dict["local"]["config"]["db"]["root_dir"] = self.system_dict["dataset"]["val"]["root_dir"]
        self.system_dict["local"]["config"]["db"]["coco_dir"] = self.system_dict["dataset"]["val"]["coco_dir"]
        self.system_dict["local"]["config"]["db"]["img_dir"] = self.system_dict["dataset"]["val"]["img_dir"]
        self.system_dict["local"]["config"]["db"]["set_dir"] = self.system_dict["dataset"]["val"]["set_dir"]

        self.system_dict["local"]["validation_db"] = datasets[self.system_dict["local"]["system_config"].dataset](
            self.system_dict["local"]["config"]["db"],
            sys_config=self.system_dict["local"]["system_config"])

    # cache directories for model snapshots
    if not os.path.isdir("cache/"):
        os.mkdir("cache")
    if not os.path.isdir("cache/nnet"):
        os.mkdir("cache/nnet/")
    if not os.path.isdir("cache/nnet/" + self.system_dict["model"]["params"]["cfg_file"]):
        os.mkdir("cache/nnet/" + self.system_dict["model"]["params"]["cfg_file"])

    model_file = "core.models.{}".format(self.system_dict["model"]["params"]["cfg_file"])
    print("Loading Model - {}".format(model_file))
    model_file = importlib.import_module(model_file)
    self.system_dict["local"]["model"] = model_file.model(self.system_dict["local"]["config"]["db"]["categories"])
    print("Model Loaded")
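Setup only reads from self.system_dict, so its expected shape can be reconstructed from the accesses above. A minimal sketch of that nesting; the keys come from the code, but the concrete values are illustrative assumptions.

system_dict = {
    "local": {},  # populated by Setup (config, system_config, dbs, model)
    "model": {"params": {"cfg_file": "CornerNet_Saccade",   # assumed model name
                         "distributed": False,
                         "world_size": 1}},
    "dataset": {
        "train": {"root_dir": "./data", "coco_dir": "coco",
                  "img_dir": "images", "set_dir": "train"},
        "val": {"status": False},  # when True, also needs root_dir/coco_dir/img_dir/set_dir
        "params": {"batch_size": 4, "workers": 2},
    },
    "training": {"params": {"total_iterations": 10000}},
}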
def main(args):
    # the config file lives under the configs folder, with an optional suffix
    if args.suffix is None:
        cfg_file = os.path.join("./configs", args.cfg_file + ".json")
    else:
        cfg_file = os.path.join("./configs", args.cfg_file + "-{}.json".format(args.suffix))
    print("\033[1;36m cfg_file (model config file): \033[0m {} ".format(cfg_file))

    # read the JSON config file with json.load
    with open(cfg_file, "r") as f:
        config = json.load(f)

    # add the snapshot entry, then build the system config object from it
    config["system"]["snapshot_name"] = args.cfg_file
    system_config = SystemConfig().update_config(config["system"])

    # build the model module name, import it, and initialize the model
    model_file = "core.models.{}".format(args.cfg_file)
    model_file = importlib.import_module(model_file)
    model = model_file.model()

    # the train, validation, and test splits
    train_split = system_config.train_split
    val_split = system_config.val_split
    test_split = system_config.test_split

    # the validation split is used by default
    # print(train_split)
    # print(args.split)
    split = {
        "train": train_split,
        "valid": val_split,
        "test": test_split
    }[args.split]

    print("\033[0;36m loading all datasets... \033[0m ")
    dataset = system_config.dataset
    print("\033[1;36m split: \033[0m {}".format(split))
    testing_db = datasets[dataset](config["db"], split=split, sys_config=system_config)
    print("\033[1;36m dataset object: \033[0m {}".format(testing_db))

    print("\033[0;36m system config...\033[0m ")
    pprint.pprint(system_config.full)

    print("\033[0;36m db config...\033[0m ")
    pprint.pprint(testing_db.configs)

    test(testing_db, system_config, model, args)
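After json.load, config is a plain dict with "system" and "db" sections. An illustrative shape, inferred only from the fields these scripts read; the split names and values are assumptions, not taken from an actual configs/*.json file.

config = {
    "system": {
        "dataset": "COCO",           # key into the `datasets` registry
        "train_split": "trainval",   # assumed split names
        "val_split": "minival",
        "test_split": "testdev",
        "batch_size": 4,
        "chunk_sizes": [4],          # per-GPU sub-batch sizes
        "max_iter": 500000,
        # "snapshot_name" is injected at runtime from args.cfg_file
    },
    "db": {
        # dataset-specific options passed to datasets[dataset](...)
    },
}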
def main(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.distributed:
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    rank = args.rank

    cfg_file = os.path.join("./configs", args.cfg_file + ".json")
    with open(cfg_file, "r") as f:
        config = json.load(f)

    config["system"]["snapshot_name"] = args.cfg_file
    system_config = SystemConfig().update_config(config["system"])

    model_file = "core.models.{}".format(args.cfg_file)
    model_file = importlib.import_module(model_file)
    model = model_file.model()

    # VALIDATE
    # hg_model = model.hg
    # for param in hg_model.parameters():
    #     param.requires_grad = False
    # hg_model.eval()
    # print('TESTING MODEL END HERE')

    train_split = system_config.train_split
    val_split = system_config.val_split

    print("Process {}: loading all datasets...".format(rank))
    dataset = system_config.dataset
    workers = args.workers
    print("Process {}: using {} workers".format(rank, workers))
    training_dbs = [
        datasets[dataset](config["db"], split=train_split, sys_config=system_config)
        for _ in range(workers)
    ]
    validation_db = datasets[dataset](config["db"], split=val_split, sys_config=system_config)

    if rank == 0:
        print("system config...")
        pprint.pprint(system_config.full)

        print("db config...")
        pprint.pprint(training_dbs[0].configs)

        print("len of db: {}".format(len(training_dbs[0].db_inds)))
        print("distributed: {}".format(args.distributed))

    train(training_dbs, validation_db, system_config, model, args)
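The commented-out VALIDATE block above sketches freezing the hourglass backbone. Spelled out, the pattern looks like this; it assumes the model exposes its backbone as model.hg, as those comments suggest.

hg_model = model.hg
for param in hg_model.parameters():
    param.requires_grad = False  # keep backbone weights out of gradient updates
hg_model.eval()                  # fix BatchNorm running stats and disable dropout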
def main(args):
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    args.gpu = None

    cfg_file = os.path.join("./configs", args.cfg_file + ".json")
    with open(cfg_file, "r") as f:
        config = json.load(f)
    print("load cfg file: {}".format(cfg_file))

    # update fields in config.py with the values from the json file
    config["system"]["snapshot_name"] = args.cfg_file
    system_config = SystemConfig().update_config(config["system"])

    # initialize the model according to the config file name
    model_file = "core.models.{}".format(args.cfg_file)
    model_file = importlib.import_module(model_file)
    model = model_file.model()

    # set the train and val dataset split names
    train_split = system_config.train_split
    val_split = system_config.val_split

    print("loading all datasets...")
    dataset = system_config.dataset
    workers = args.workers
    print("using {} workers".format(workers))
    training_dbs = [
        datasets[dataset](config["db"], split=train_split, sys_config=system_config)
        for _ in range(workers)
    ]
    validation_db = datasets[dataset](config["db"], split=val_split, sys_config=system_config)

    print("system config...")
    pprint.pprint(system_config.full)

    print("db config...")
    pprint.pprint(training_dbs[0].configs)

    print("len of db: {}".format(len(training_dbs[0].db_inds)))

    train(training_dbs, validation_db, system_config, model, args)
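A plausible command-line entry point for this single-process variant. The flag names mirror the attributes the function reads (args.cfg_file, args.workers) but are otherwise an assumption about the original script.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train CornerNet")
    parser.add_argument("cfg_file",
                        help="config name under ./configs (without the .json suffix)")
    parser.add_argument("--workers", default=4, type=int,
                        help="number of parallel dataset readers")
    main(parser.parse_args())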
def main(args):
    # os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    if args.suffix is None:
        cfg_file = os.path.join("./configs", args.cfg_file + ".json")
    else:
        cfg_file = os.path.join("./configs", args.cfg_file + "-{}.json".format(args.suffix))
    print("cfg_file: {}".format(cfg_file))

    with open(cfg_file, "r") as f:
        config = json.load(f)

    config["system"]["snapshot_name"] = args.cfg_file
    system_config = SystemConfig().update_config(config["system"])

    model_file = "core.models.{}".format(args.cfg_file)
    model_file = importlib.import_module(model_file)
    model = model_file.model()

    train_split = system_config.train_split
    val_split = system_config.val_split
    test_split = system_config.test_split

    split = {
        "training": train_split,
        "validation": val_split,
        "testing": test_split
    }[args.split]

    print("loading all datasets...")
    dataset = system_config.dataset
    print("split: {}".format(split))
    testing_db = datasets[dataset](config["db"], split=split, sys_config=system_config)

    print("system config...")
    pprint.pprint(system_config.full)

    print("db config...")
    pprint.pprint(testing_db.configs)

    test(testing_db, system_config, model, args)
def main(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.distributed:
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    rank = args.rank

    cfg_file = os.path.join("./configs", args.cfg_file + ".json")
    with open(cfg_file, "r") as f:
        config = json.load(f)

    config["system"]["snapshot_name"] = args.cfg_file
    system_config = SystemConfig().update_config(config["system"])

    model_file = "core.models.{}".format(args.cfg_file)
    model_file = importlib.import_module(model_file)
    model = model_file.model(num_classes=config["db"]["categories"])

    train_split = system_config.train_split
    val_split = system_config.val_split

    # `date` is assumed to be defined at module scope (e.g., a run timestamp)
    ckpt_path = os.path.join('cache/nnet/', args.cfg_file, date)
    train_logger = pLogger(ckpt_path)
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    shutil.copyfile('{}'.format(cfg_file),
                    '{}/{}'.format(ckpt_path, args.cfg_file + ".json"))

    train_logger.train_logging("Process {}: loading all datasets...".format(rank))

    dataset = system_config.dataset
    workers = args.workers
    train_logger.train_logging("Process {}: using {} workers".format(rank, workers))
    training_dbs = [
        datasets[dataset](config["db"], split=train_split, sys_config=system_config)
        for _ in range(workers)
    ]
    validation_db = datasets[dataset](config["db"], split=val_split, sys_config=system_config)

    if rank == 0:
        print("system config...")
        pprint.pprint(system_config.full)
        train_logger.train_logging("system config...")
        train_logger.train_logging(system_config.full)

        print("db config...")
        pprint.pprint(training_dbs[0].configs)
        train_logger.train_logging("db config...")
        train_logger.train_logging(training_dbs[0].configs)

        train_logger.train_logging("len of db: {}".format(len(training_dbs[0].db_inds)))
        train_logger.train_logging("distributed: {}".format(args.distributed))

    train(train_logger, training_dbs, validation_db, system_config, model, args)
import torch

from core.models.CornerNet_Saccade import model
from core.paths import get_file_path
from core.base import load_cfg, load_nnet
from core.config import SystemConfig

cfg_path = get_file_path("..", "configs", "CornerNet_Saccade.json")
model_path = get_file_path("nnet", "CornerNet_Saccade_500000.pkl")

cfg_sys, cfg_db = load_cfg(cfg_path)
sys_cfg = SystemConfig().update_config(cfg_sys)
cornernet = load_nnet(sys_cfg, model())

# export the network to ONNX by tracing it with a dummy input
example = torch.rand(1, 3, 224, 224).cuda()
torch.onnx.export(cornernet.model, example, "test.onnx", verbose=True)
print("onnx done")
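A quick way to sanity-check the exported graph is to run it through onnxruntime. This sketch is not part of the original script; it assumes onnxruntime is installed, the export above succeeded, and the traced graph takes a single image tensor as input.

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("test.onnx")
input_name = sess.get_inputs()[0].name
dummy = np.random.rand(1, 3, 224, 224).astype(np.float32)
outputs = sess.run(None, {input_name: dummy})  # None -> return all graph outputs
print([o.shape for o in outputs])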