def parse_option():
    parser = argparse.ArgumentParser('PSNet scene-segmentation evaluating')
    parser.add_argument('--cfg', type=str, required=True, help='config file')
    parser.add_argument('--load_path', required=True, type=str, metavar='PATH',
                        help='path to latest checkpoint')
    parser.add_argument('--log_dir', type=str, default='log_eval', help='log dir [default: log_eval]')
    parser.add_argument('--data_root', type=str, default='data', help='root directory of dataset')
    parser.add_argument('--num_workers', type=int, default=4, help='num of workers to use')
    parser.add_argument('--batch_size', type=int, help='batch_size')
    parser.add_argument('--num_points', type=int, help='num_points')
    parser.add_argument('--num_steps', type=int, help='num_steps')
    parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel')
    parser.add_argument("--rng_seed", type=int, default=0, help='manual seed')
    args, unparsed = parser.parse_known_args()

    update_config(args.cfg)

    config.data_root = args.data_root
    config.num_workers = args.num_workers
    config.load_path = args.load_path
    config.rng_seed = args.rng_seed
    config.local_rank = args.local_rank

    ddir_name = args.cfg.split('.')[-2].split('/')[-1]
    config.log_dir = os.path.join(args.log_dir, 'psnet', f'{ddir_name}_{int(time.time())}')

    if args.batch_size:
        config.batch_size = args.batch_size
    if args.num_points:
        config.num_points = args.num_points
    if args.num_steps:
        config.num_steps = args.num_steps

    print(args)
    print(config)

    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    random.seed(args.rng_seed)
    np.random.seed(args.rng_seed)

    return args, config
def parse_option(): parser = argparse.ArgumentParser("Training and evaluating PartNet") parser.add_argument('--cfg', help='yaml file', type=str) parser.add_argument('--gpus', type=int, default=0, nargs='+', help='gpus to use [default: 0]') parser.add_argument('--num_threads', type=int, default=4, help='num of threads to use') parser.add_argument('--batch_size', type=int, help='batch_size') parser.add_argument('--base_learning_rate', type=float, help='base learning rate for batch size 8') # IO parser.add_argument('--log_dir', default='log', help='log dir [default: log]') parser.add_argument('--load_path', help='path to a check point file for load') parser.add_argument('--print_freq', type=int, help='print frequency') parser.add_argument('--save_freq', type=int, help='save frequency') parser.add_argument('--val_freq', type=int, help='val frequency') # Misc parser.add_argument('--save_memory', action='store_true', help='use memory_saving_gradients') parser.add_argument("--rng-seed", type=int, default=0, help='manual seed') args, _ = parser.parse_known_args() # Update config update_config(args.cfg) ddir_name = args.cfg.split('.')[-2].split('/')[-1] config.log_dir = os.path.join(args.log_dir, 'partnet', f'{ddir_name}_{int(time.time())}') config.load_path = args.load_path config.gpus = args.gpus if isinstance(args.gpus, list) else [args.gpus] config.num_gpus = len(config.gpus) if args.num_threads: config.num_threads = args.num_threads else: cpu_count = psutil.cpu_count() gpu_count = str(subprocess.check_output(["nvidia-smi", "-L"])).count('UUID') config.num_threads = config.num_gpus * cpu_count // gpu_count if args.batch_size: config.batch_size = args.batch_size if args.base_learning_rate: config.base_learning_rate = args.base_learning_rate if args.print_freq: config.print_freq = args.print_freq if args.save_freq: config.save_freq = args.save_freq if args.val_freq: config.save_freq = args.val_freq # Set manual seed tf.set_random_seed(args.rng_seed) np.random.seed(args.rng_seed) # If args.save_memory is True, use gradient-checkpointing to save memory if args.save_memory: # if save memory import utils.memory_saving_gradients tf.__dict__["gradients"] = utils.memory_saving_gradients.gradients_collection return args, config
def main(): update_config("configs/coco/resnet_v1_101_coco_trainval_fpn_dcn_end2end_ohem.yaml") log_init(filename=config.TRAIN.model_prefix + "train.log") msg = pprint.pformat(config) logging.info(msg) os.environ["MXNET_CUDNN_AUTOTUNE_DEFAULT"] = "0" os.environ["MXNET_GPU_MEM_POOL_TYPE"] = "Round" ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] ctx = ctx * config.network.IM_PER_GPU train_net(ctx, config.TRAIN.begin_epoch, config.TRAIN.lr, config.TRAIN.lr_step)
def update_config(self, config): r""" Update the object config with new inputs. Args: config (dict or BaseConfig) : fields of configuration to be updated Typically if config = {"learningRate": 0.1} only the learning rate will be changed. """ update_config(self.config, config) self.updateSolversDevice()
def main(): update_config( "configs/voc/resnet_v1_50_voc0712_rfcn_dcn_end2end_ohem_one_gpu.yaml") log_init(filename=config.TRAIN.model_prefix + "train.log") msg = pprint.pformat(config) logging.info(msg) os.environ["MXNET_CUDNN_AUTOTUNE_DEFAULT"] = "0" os.environ["MXNET_GPU_MEM_POOL_TYPE"] = "Round" ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] train_net(ctx, config.network.pretrained, config.network.pretrained_epoch, config.TRAIN.model_prefix, config.TRAIN.begin_epoch, config.TRAIN.end_epoch, config.TRAIN.lr, config.TRAIN.lr_step)
def parse_option(): parser = argparse.ArgumentParser("Evaluating S3DIS") parser.add_argument('--cfg', help='yaml file', type=str) parser.add_argument('--gpu', type=int, default=0, help='which gpu to use [default: 0]') parser.add_argument('--num_threads', type=int, default=4, help='num of threads to use') parser.add_argument('--batch_size', type=int, help='batch_size') parser.add_argument('--base_learning_rate', type=float, help='base learning rate for batch size 8') # IO parser.add_argument('--log_dir', default='log_eval', help='log dir [default: log]') parser.add_argument('--load_path', help='path to a check point file for load') # Misc parser.add_argument("--rng-seed", type=int, default=0, help='manual seed') args, _ = parser.parse_known_args() # Update config update_config(args.cfg) ddir_name = args.cfg.split('.')[-2].split('/')[-1] config.log_dir = os.path.join(args.log_dir, 's3dis', f'{ddir_name}_{int(time.time())}') config.load_path = args.load_path if args.num_threads: config.num_threads = args.num_threads else: cpu_count = psutil.cpu_count() config.num_threads = cpu_count if args.batch_size: config.batch_size = args.batch_size # Set manual seed tf.set_random_seed(args.rng_seed) np.random.seed(args.rng_seed) return args, config
def cli_main():
    parser = options.get_validation_parser()
    parser.add_argument(
        '--config', type=str, nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority')
    parser.add_argument(
        '--load-checkpoint', type=str,
        help='path to checkpoint to load (possibly composite) model from')
    pre_parsed_args = parser.parse_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)
    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)
    update_namespace(args, config_dict)

    main(args)
        # cocoEval.accumulate()
        # cocoEval.summarize()
        # mAP_eachclasses[catid2catbane[catId]] = cocoEval.stats[1]
    print(u"Evaluate all classes.")
    cocoEval.params.catIds = catIds
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()
    mAP_eachclasses[u"mAP@IoU=0.5"] = cocoEval.stats[1]
    print("************summary***************")
    for k in mAP_eachclasses.keys():
        print(k, mAP_eachclasses[k])


if __name__ == '__main__':
    update_config("configs/coco/resnet_v1_101_coco_trainval_fpn_dcn_end2end_ohem.yaml")
    backbone = SEResNext50_32x4d()
    net = PyramidRFCN(config, backbone)
    params_pretrained = mx.nd.load("output/fpn_coco-5-0.0.params")
    # Strip the "arg:"/"aux:" prefixes; iterate over a copy of the keys because
    # the dict is renamed (mutated) during the loop.
    for k in list(params_pretrained.keys()):
        params_pretrained[k.replace("arg:", "").replace("aux:", "")] = params_pretrained.pop(k)
    params = net.collect_params()
    for k in params.keys():
        if k in params_pretrained.keys():
            params[k]._load_init(params_pretrained[k], ctx=mx.cpu())
        else:
            print(k)
    results = {}
def parse_args():
    parser = argparse.ArgumentParser(
        description='Train and val for occlusion edge/order detection')
    parser.add_argument('--config', default='', required=False, type=str,
                        help='experiment configure file name')
    args, rest = parser.parse_known_args()
    update_config(args.config)  # update params with experiment config file

    parser.add_argument('--debug', action='store_true', help='debug mode')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--new_val', action='store_true',
                        help='new val with resumed model, re-calculate val perf')
    parser.add_argument('--out_dir', default='', type=str, metavar='PATH',
                        help='res output dir (default: output/date)')
    parser.add_argument('--evaluate', action='store_true',
                        help='test with best model in validation')
    parser.add_argument('--frequent', default=config.default.frequent, type=int,
                        help='frequency of logging')
    parser.add_argument('--gpus', help='specify the gpu to be used', default='3',
                        required=False, type=str)
    parser.add_argument('--cpu', default=False, required=False, type=bool,
                        help='whether to use cpu mode')
    parser.add_argument('-j', '--workers', default=2, type=int, metavar='N',
                        help='number of data loading workers')
    parser.add_argument('--vis', action='store_true', help='turn on visualization')
    parser.add_argument('--arch', '-a', metavar='ARCH', choices=model_names,
                        help='model architecture, overwritten if pretrained is specified: ' +
                             ' | '.join(model_names))
    args = parser.parse_args()
    return args
    criterion = MaskedCrossEntropy()
    return criterion


if __name__ == "__main__":
    # obtain config
    import argparse
    from utils.config import config, update_config

    parser = argparse.ArgumentParser('S3DIS semantic segmentation training')
    parser.add_argument('--cfg', type=str, default='project/cfgs/s3dis/pointnet.yaml',
                        help='config file')
    args, unparsed = parser.parse_known_args()
    # update config dict with the yaml file
    update_config(args.cfg)
    print(config)

    # create a model
    model = PointNetSemSeg(config, config.input_features_dim)
    print(model)

    # define a loss
    from losses import MaskedCrossEntropy
    criterion = MaskedCrossEntropy()

    # create a random input and then predict
    batch_size = 2  # config.batch_size
    num_points = config.num_points
    input_features_dim = config.input_features_dim
    xyz = torch.rand(batch_size, num_points, 3)
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config', type=str, nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name', type=str, default='', help='name of the experiment')
    parser.add_argument(
        '--debug', default=False, action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes', type=str, nargs='*', default=['task', 'arch', 'lr'])
    parser.add_argument(
        '--filter_best_last_ckpts', type=str, default=False,
        help='whether to filter out checkpoint_best and checkpoint_last from checkpoint list')
    parser.add_argument('--log_valid_progress', type=str, default=False,
                        help='whether to log validation progress')
    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)
    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)
    update_namespace(args, config_dict)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args,),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
        fetch_list=[avg_cost, auc_var],
        fetch_info=['Epoch {} cost: '.format(epoch + 1), ' - auc: '],
        print_period=cfg.log_interval,
        debug=False)
    end_time = time.time()
    logger.info("epoch %d finished, use time = %ds \n" % ((epoch + 1), end_time - start_time))
    if (epoch + 1) % cfg.save_interval == 0:
        model_path = os.path.join(str(cfg.save_path), model.name,
                                  model.name + "_epoch_" + str(epoch + 1))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        logger.info("saving model to %s \n" % (model_path))
        fluid.save(fluid.default_main_program(), os.path.join(model_path, "checkpoint"))
    logger.info("Done.")


def main():
    train()


if __name__ == '__main__':
    option = BaseOptions()
    args = option.initialize()
    update_config(cfg, args)
    print_config(cfg)
    main()
                        help='Logging with tensorboard', action='store_true')
    parser.add_argument('--debug', default=False, dest='debug',
                        help='Visualization debug', action='store_true')
    parser.add_argument('--map', default=True, dest='map',
                        help='Evaluate mAP per epoch', action='store_true')

    opt = parser.parse_args()

    cfg_file_name = os.path.basename(opt.cfg)
    cfg = update_config(opt.cfg)
    cfg['FILE_NAME'] = cfg_file_name
    cfg.TRAIN.DPG_STEP = [i - cfg.TRAIN.DPG_MILESTONE for i in cfg.TRAIN.DPG_STEP]
    opt.world_size = cfg.TRAIN.WORLD_SIZE
    opt.work_dir = './exp/{}-{}/'.format(opt.exp_id, cfg_file_name)
    opt.gpus = [i for i in range(torch.cuda.device_count())]
    opt.device = torch.device("cuda:" + str(opt.gpus[0]) if opt.gpus[0] >= 0 else "cpu")

    if not os.path.exists("./exp/{}-{}".format(opt.exp_id, cfg_file_name)):
        os.makedirs("./exp/{}-{}".format(opt.exp_id, cfg_file_name))

    filehandler = logging.FileHandler('./exp/{}-{}/training.log'.format(opt.exp_id, cfg_file_name))
    streamhandler = logging.StreamHandler()
def train(
        query_dataloader,
        retrieval_dataloader,
        code_length,
        args,
        # args.device,
        # lr,
        # args.max_iter,
        # args.max_epoch,
        # args.num_samples,
        # args.batch_size,
        # args.root,
        # dataset,
        # args.gamma,
        # topk,
):
    """
    Training model.

    Args
        query_dataloader, retrieval_dataloader(torch.utils.data.dataloader.DataLoader): Data loader.
        code_length(int): Hashing code length.
        args.device(torch.device): GPU or CPU.
        lr(float): Learning rate.
        args.max_iter(int): Number of iterations.
        args.max_epoch(int): Number of epochs.
        num_train(int): Number of sampled training data points.
        args.batch_size(int): Batch size.
        args.root(str): Path of dataset.
        dataset(str): Dataset name.
        args.gamma(float): Hyper-parameter.
        topk(int): Top-k for mAP.

    Returns
        mAP(float): Mean Average Precision.
    """
    # Initialization
    # model = alexnet.load_model(code_length).to(args.device)
    # model = resnet.resnet50(pretrained=args.pretrain, num_classes=code_length).to(args.device)
    update_config(config, args)

    # cudnn related settings
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = hrnet.get_cls_net(config, pretrained=args.pretrain,
                              num_classes=code_length).to(args.device)
    # print(model)

    if args.optim == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, args.lr_step)

    criterion = ADSH_Loss(code_length, args.gamma)

    num_retrieval = len(retrieval_dataloader.dataset)
    U = torch.zeros(args.num_samples, code_length).to(args.device)
    B = torch.randn(num_retrieval, code_length).to(args.device)
    retrieval_targets = retrieval_dataloader.dataset.get_onehot_targets().to(args.device)
    cnn_losses, hash_losses, quan_losses = AverageMeter(), AverageMeter(), AverageMeter()

    start = time.time()
    best_mAP = 0
    for it in range(args.max_iter):
        iter_start = time.time()
        # Sample training data for cnn learning
        train_dataloader, sample_index = sample_dataloader(
            retrieval_dataloader, args.num_samples, args.batch_size, args.root, args.dataset)

        # Create similarity matrix
        train_targets = train_dataloader.dataset.get_onehot_targets().to(args.device)
        S = (train_targets @ retrieval_targets.t() > 0).float()
        S = torch.where(S == 1, torch.full_like(S, 1), torch.full_like(S, -1))

        # Soft similarity matrix, benefits convergence
        r = S.sum() / (1 - S).sum()
        S = S * (1 + r) - r

        # Training CNN model
        for epoch in range(args.max_epoch):
            cnn_losses.reset()
            hash_losses.reset()
            quan_losses.reset()
            for batch, (data, targets, index) in enumerate(train_dataloader):
                data, targets, index = data.to(args.device), targets.to(args.device), index.to(args.device)
                optimizer.zero_grad()

                F = model(data)
                U[index, :] = F.data
                cnn_loss, hash_loss, quan_loss = criterion(F, B, S[index, :], sample_index[index])
                cnn_losses.update(cnn_loss.item())
                hash_losses.update(hash_loss.item())
                quan_losses.update(quan_loss.item())

                cnn_loss.backward()
                optimizer.step()
            logger.info('[epoch:{}/{}][cnn_loss:{:.6f}][hash_loss:{:.6f}][quan_loss:{:.6f}]'.format(
                epoch + 1, args.max_epoch, cnn_losses.avg, hash_losses.avg, quan_losses.avg))
        scheduler.step()

        # Update B
        expand_U = torch.zeros(B.shape).to(args.device)
        expand_U[sample_index, :] = U
        B = solve_dcc(B, U, expand_U, S, code_length, args.gamma)

        # Total loss
        iter_loss = calc_loss(U, B, S, code_length, sample_index, args.gamma)
        # logger.debug('[iter:{}/{}][loss:{:.2f}][iter_time:{:.2f}]'.format(
        #     it + 1, args.max_iter, iter_loss, time.time() - iter_start))
        logger.info('[iter:{}/{}][loss:{:.6f}][iter_time:{:.2f}]'.format(
            it + 1, args.max_iter, iter_loss, time.time() - iter_start))

        # Evaluate
        if (it + 1) % 1 == 0:
            query_code = generate_code(model, query_dataloader, code_length, args.device)
            mAP = evaluate.mean_average_precision(
                query_code.to(args.device),
                B,
                query_dataloader.dataset.get_onehot_targets().to(args.device),
                retrieval_targets,
                args.device,
                args.topk,
            )
            if mAP > best_mAP:
                best_mAP = mAP
                # Save checkpoints
                ret_path = os.path.join('checkpoints', args.info, str(code_length))
                # ret_path = 'checkpoints/' + args.info
                if not os.path.exists(ret_path):
                    os.makedirs(ret_path)
                torch.save(query_code.cpu(), os.path.join(ret_path, 'query_code.t'))
                torch.save(B.cpu(), os.path.join(ret_path, 'database_code.t'))
                # Call get_onehot_targets() so the tensor (not the bound method) is saved
                torch.save(query_dataloader.dataset.get_onehot_targets().cpu(),
                           os.path.join(ret_path, 'query_targets.t'))
                torch.save(retrieval_targets.cpu(), os.path.join(ret_path, 'database_targets.t'))
                torch.save(model.cpu(), os.path.join(ret_path, 'model.t'))
                model = model.to(args.device)
            logger.info('[iter:{}/{}][code_length:{}][mAP:{:.5f}][best_mAP:{:.5f}]'.format(
                it + 1, args.max_iter, code_length, mAP, best_mAP))
    logger.info('[Training time:{:.2f}]'.format(time.time() - start))

    return best_mAP
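# A small numeric illustration (with assumed values) of the "soft similarity" rescaling
# used in train() above: with r = S.sum() / (1 - S).sum(), entries equal to 1 stay at 1
# while entries equal to -1 become -1 - 2r, softening the negative pairs when r < 0.
import torch

S = torch.tensor([[1., -1., -1., -1.]])
r = S.sum() / (1 - S).sum()   # here r = -2 / 6 = -1/3
S_soft = S * (1 + r) - r
print(S_soft)                 # tensor([[ 1.0000, -0.3333, -0.3333, -0.3333]])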
from copy import deepcopy

from numpy import mean

from utils.config import Config, update_config
from utils.gridsearch import GridSearch
from utils.utils import get_kernel, get_classifier, get_sets, save_submission, kfold, split_train_val
from kernels.default import SimpleMKL
from utils.pca import PCA

glob_cfg = Config('config')

# Start by updating the glob_cfg set0, set1 and set2 with global glob_cfg
# Deepcopy is used because update_config has some side effects...
glob_cfg.set_(
    "set0",
    update_config(deepcopy(glob_cfg["global"].values_()), glob_cfg.set0.values_()))
glob_cfg.set_(
    "set1",
    update_config(deepcopy(glob_cfg["global"].values_().copy()), glob_cfg.set1.values_()))
glob_cfg.set_(
    "set2",
    update_config(deepcopy(glob_cfg["global"].values_().copy()), glob_cfg.set2.values_()))

gridsearch = GridSearch(glob_cfg)

total_accuracy = []
all_predictions = []
all_ids = []
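# The deepcopy above guards against update_config mutating its first argument.
# A hypothetical side-effect-free variant (assumed for illustration, not part of
# utils.config) would merge into a fresh dict instead, making the deepcopy at the
# call sites unnecessary:
from copy import deepcopy


def merged_config(base, new):
    """Return a new dict with `new` layered over `base`, leaving both inputs untouched."""
    out = deepcopy(base)
    out.update(new)
    return out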
    predicts = [evaluator(data[0].as_in_context(config.ctx))]
    predicts = [pred.argmax(1).asnumpy().squeeze() for pred in predicts]
    targets = [target.as_in_context(mx.cpu()).asnumpy().squeeze()
               for target in dsts]
    metric.update(targets, predicts)
    pixAcc, mIoU, IoUs = metric.get()
    iou_str = ""
    for ind, cur_class in enumerate(testset.classes):
        iou_str += "%s: %.3f\t" % (cur_class, IoUs[ind])
    logging.info('pixAcc: %.4f, mIoU: %.4f\n%s' % (pixAcc, mIoU, iou_str))


if __name__ == "__main__":
    args = parse_args()
    print("Using config file %s" % args.cfg)
    config.update_config(args.cfg)
    config.config.resume = args.resume
    config.config.gpu = args.gpu
    config.config.ctx = args.ctx
    config.config.test_batch_size = 1
    tag = args.cfg.split("/")[-1]
    tag = tag.replace(".yaml", "")
    config.config.tag = tag
    logging.basicConfig(filename=config.config.tag + "_eval.log", level=logging.INFO)
    console = logging.StreamHandler()
    logging.getLogger().addHandler(console)
    logging.info("Parameters:")
    logging.info(config.config)
if __name__ == '__main__':
    parser = options.get_training_parser()
    parser.add_argument(
        '--config', type=str, nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--torch-file-system', action='store_true')
    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)
    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)
    update_namespace(args, config_dict)

    # set sharing strategy to file_system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    main(args)
def parse_option():
    parser = argparse.ArgumentParser('PartNet part-segmentation training')
    parser.add_argument('--cfg', type=str, required=True, help='config file')
    parser.add_argument('--data_root', type=str, default='data', help='root directory of dataset')
    parser.add_argument('--num_workers', type=int, default=4, help='num of workers to use')
    parser.add_argument('--batch_size', type=int, help='batch_size')
    parser.add_argument('--base_learning_rate', type=float, help='base learning rate')
    parser.add_argument('--epochs', type=int, help='number of training epochs')
    parser.add_argument('--start_epoch', type=int, help='used for resume')
    # io
    parser.add_argument('--load_path', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--print_freq', type=int, default=10, help='print frequency')
    parser.add_argument('--save_freq', type=int, default=10, help='save frequency')
    parser.add_argument('--val_freq', type=int, default=10, help='val frequency')
    parser.add_argument('--log_dir', type=str, default='log', help='log dir [default: log]')
    # misc
    parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel')
    parser.add_argument("--rng_seed", type=int, default=0, help='manual seed')
    args, unparsed = parser.parse_known_args()

    update_config(args.cfg)

    config.data_root = args.data_root
    config.num_workers = args.num_workers
    config.load_path = args.load_path
    config.print_freq = args.print_freq
    config.save_freq = args.save_freq
    config.val_freq = args.val_freq
    config.rng_seed = args.rng_seed
    config.local_rank = args.local_rank

    ddir_name = args.cfg.split('.')[-2].split('/')[-1]
    config.log_dir = os.path.join(args.log_dir, 'partnet', ddir_name)

    if args.batch_size:
        config.batch_size = args.batch_size
    if args.base_learning_rate:
        config.base_learning_rate = args.base_learning_rate
    if args.epochs:
        config.epochs = args.epochs
    if args.start_epoch:
        config.start_epoch = args.start_epoch

    print(args)
    print(config)

    torch.manual_seed(args.rng_seed)
    torch.cuda.manual_seed_all(args.rng_seed)
    random.seed(args.rng_seed)
    np.random.seed(args.rng_seed)

    return args, config
def load_state_dict(self, in_state, loadG=True, loadD=True, loadConfig=True, finetuning=False):
    r"""
    Load a model saved with the @method save() function.

    Args:
        - in_state (dict): state dict containing the model
    """
    # Step one: load the configuration
    if loadConfig:
        update_config(self.config, in_state['config'])
        self.lossCriterion = getattr(
            base_loss_criterions, self.config.lossCriterion)(self.device)
        self.initializeClassificationCriterion()

    # Re-initialize G and D with the loaded configuration
    buildAvG = True

    if loadG:
        self.netG = self.getNetG()
        if finetuning:
            loadPartOfStateDict(self.netG, in_state['netG'], ["formatLayer"])
            self.getOriginalG().initFormatLayer(self.config.latentVectorDim)
        else:
            # Replace me by a standard load state dict for open-sourcing TODO
            loadStateDictCompatible(self.netG, in_state['netG'])

        if 'avgG' in in_state:
            print("Average network found !")
            self.buildAvG()
            # Replace me by a standard load state dict for open-sourcing
            if isinstance(self.avgG, nn.DataParallel):
                # loadStateDictCompatible(self.avgG.module, in_state['avgG'])
                # HACK TO BE ABLE TO LOAD THE MODELS TRAINED SO FAR
                loadStateDictCompatible(self.avgG.module, in_state['avgG'])
            else:
                loadStateDictCompatible(self.avgG, in_state['avgG'])
            buildAvG = False

    if loadD:
        self.netD = self.getNetD()
        if finetuning:
            loadPartOfStateDict(self.netD, in_state['netD'], ["decisionLayer"])
            self.getOriginalD().initDecisionLayer(
                self.lossCriterion.sizeDecisionLayer + self.config.categoryVectorDim)
        else:
            # Replace me by a standard load state dict for open-sourcing TODO
            loadStateDictCompatible(self.netD, in_state['netD'])
    elif 'tmp' in in_state.keys():
        self.trainTmp = in_state['tmp']

    # Don't forget to reset the machinery !
    self.updateSolversDevice(buildAvG)
def parse_config(): """load configs including parameters from dataset, model, training, etc. The basic process is: - load default settings based on the config dict in the utils/config.py - update the config dict using yaml file specified by an argparse argument(--cfg argument) - update the config dict using argparse arguments Returns: tuple: (args, config) contains config settings where args is argparse.Namespace object while config is a dict """ parser = argparse.ArgumentParser('S3DIS semantic segmentation training') parser.add_argument('--cfg', type=str, default='project/cfgs/s3dis/pointnet2_ssg.yaml', help='config file') # parser.add_argument('--model_name', type=str, default='', help='model name, pointnet, pointnet2ssg, pointnet2msg') parser.add_argument('--data_root', type=str, default='data', help='root director of dataset') parser.add_argument('--num_workers', type=int, default=4, help='num of workers to use') parser.add_argument('--batch_size', type=int, help='batch_size') parser.add_argument('--num_points', type=int, help='num_points') parser.add_argument('--num_steps', type=int, help='num_steps') parser.add_argument('--base_learning_rate', type=float, help='base learning rate') parser.add_argument('--weight_decay', type=float, help='weight_decay') parser.add_argument('--epochs', type=int, help='number of training epochs') parser.add_argument('--start_epoch', type=int, help='used for resume') # io parser.add_argument('--load_path', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--print_freq', type=int, default=10, help='print frequency') parser.add_argument('--save_freq', type=int, default=10, help='save frequency') parser.add_argument('--val_freq', type=int, default=10, help='val frequency') parser.add_argument('--log_dir', type=str, default='log', help='log dir [default: log]') # misc parser.add_argument("--local_rank", type=int, default=0, help='local rank for DistributedDataParallel') parser.add_argument("--rng_seed", type=int, default=0, help='manual seed') args, unparsed = parser.parse_known_args() # update config dict with the yaml file update_config(args.cfg) # update config dict with args arguments config.data_root = args.data_root config.num_workers = args.num_workers config.load_path = args.load_path config.print_freq = args.print_freq config.save_freq = args.save_freq config.val_freq = args.val_freq config.rng_seed = args.rng_seed config.local_rank = args.local_rank model_name = args.cfg.split('.')[-2].split('/')[ -1] # model name, e.g., pointnet # supports: pointnet,pointnet2{ssg,msg} config.model_name = model_name current_time = datetime.now().strftime( '%Y%m%d%H%M%S') #20210518221044 means 2021, 5.18, 22:10:44 config.log_dir = os.path.join(args.log_dir, 's3dis', f'{model_name}_{int(current_time)}' ) ## log_dir=log/s3dis/pointnet_time if args.batch_size: config.batch_size = args.batch_size if args.num_points: config.num_points = args.num_points if args.num_steps: config.num_steps = args.num_steps if args.base_learning_rate: config.base_learning_rate = args.base_learning_rate if args.weight_decay: config.weight_decay = args.weight_decay if args.epochs: config.epochs = args.epochs if args.start_epoch: config.start_epoch = args.start_epoch print(args) print(config) torch.manual_seed(args.rng_seed) torch.cuda.manual_seed_all(args.rng_seed) random.seed(args.rng_seed) np.random.seed(args.rng_seed) return args, config
    GANTrainer = trainerModule(model_name=exp_name,
                               gpu=GPU_is_available(),
                               loader=loader,
                               loss_plot_i=args.loss_i,
                               eval_i=args.eval_i,
                               checkpoint_dir=checkpoint_dir,
                               save_iter=args.save_i,
                               n_samples=args.n_samples,
                               config=model_config,
                               vis_manager=vis_manager)

    # If a checkpoint is found, load it
    if not args.restart and checkpoint_state is not None:
        train_config, model_path, tmp_data_path = checkpoint_state
        # if args.retrain:
        #     train_config_file = read_json(train_config)
        #     for k, v in config['model_config'].items():
        #         train_config_file[k] = v
        #     train_config = os.path.join(checkpoint_dir, f'{exp_name}_train_config.json')
        #     save_json(train_config_file, train_config)
        GANTrainer.load_saved_training(model_path, train_config, tmp_data_path)
        if args.finetune:
            GANTrainer.model.update_config(model_config)
            update_config(GANTrainer.modelConfig, model_config)
            # ipdb.set_trace()

    # save config file
    save_json(config, os.path.join(checkpoint_dir, f'{exp_name}_config.json'))

    GANTrainer.train()
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config', type=str, nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name', type=str, default='', help='name of the experiment')
    parser.add_argument(
        '--debug', default=False, action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes', type=str, nargs='*', default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')
    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)
    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)
    update_namespace(args, config_dict)

    # set sharing strategy to file_system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])
    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir', os.path.join(base_save_dir, 'tensorboard'))
    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args,),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)