def setup_env():
    """Prepare the process environment before a training or testing run.

    The main process creates ``cfg.OUT_DIR`` and dumps the config there.
    Logging is then configured, library versions, the OS environment and
    the config are logged, the numpy / torch / ``random`` RNGs are seeded
    from ``cfg.RNG_SEED`` and the cuDNN benchmark flag is set from
    ``cfg.CUDNN.BENCHMARK``.
    """
    # Only the main process touches the output directory
    is_main = dist.is_main_proc()
    if is_main:
        pathmgr.mkdirs(cfg.OUT_DIR)
        config.dump_cfg()

    logging.setup_logging()

    # Report the torch / cuda / cudnn versions in use
    torch_ver = torch.__version__
    cuda_ver = torch.version.cuda
    cudnn_ver = torch.backends.cudnn.version()
    logger.info(
        "PyTorch Version: torch={}, cuda={}, cudnn={}".format(
            torch_ver, cuda_ver, cudnn_ver
        )
    )

    # Report the full OS environment, one "KEY: value" entry per line
    env_lines = []
    for key, value in sorted(os.environ.items()):
        env_lines.append(f"{key}: {value}\n")
    env = "".join(env_lines)
    logger.info(f"os.environ:\n{env}")

    # Human-readable config only when verbose; the json form is always logged
    if cfg.VERBOSE:
        logger.info("Config:\n{}".format(cfg))
    logger.info(logging.dump_log_data(cfg, "cfg", None))

    # Seed every RNG from the same configured value
    # (see RNG comment in core/config.py for discussion)
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

    # cuDNN autotuner setting comes straight from the config
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def main():
    """Load and freeze the config, prepare the output dir, then train.

    Training is fanned out through ``dist.multi_proc_run`` when more than
    one GPU is configured; otherwise ``train_model`` runs in this process.
    """
    # Config comes from the command line and is frozen once validated
    config.load_cfg_fom_args("Train a classification model.")
    config.assert_and_infer_cfg()
    cfg.freeze()

    # The output dir must exist before the config is dumped
    os.makedirs(cfg.OUT_DIR, exist_ok=True)
    config.dump_cfg()

    num_gpus = cfg.NUM_GPUS
    if num_gpus > 1:
        # Multi-GPU: one process per GPU
        dist.multi_proc_run(num_proc=num_gpus, fun=train_model)
        return
    # Single GPU (or none): train in the current process
    train_model()
def main():
    """Execute the operation chosen on the command line.

    Supported modes are ``info``, ``train``, ``test``, ``time`` and
    ``scale``; any other mode value results in no action.
    """
    args = parse_args()
    config.load_cfg(args.cfg)
    cfg.merge_from_list(args.opts)
    config.assert_cfg()
    cfg.freeze()
    mode = args.mode

    # Modes that only launch a trainer entry point across cfg.NUM_GPUS procs
    trainer_entry_points = {
        "train": "train_model",
        "test": "test_model",
        "time": "time_model",
    }
    if mode in trainer_entry_points:
        entry_point = getattr(trainer, trainer_entry_points[mode])
        dist.multi_proc_run(num_proc=cfg.NUM_GPUS, fun=entry_point)
        return

    if mode == "info":
        # Print an instantiated model followed by its complexity
        model_instance = builders.get_model()()
        print(model_instance)
        print("complexity:", net.complexity(builders.get_model()))
        return

    if mode == "scale":
        # The config has to be mutable again for the scaler to rewrite it
        cfg.defrost()
        cx_orig = net.complexity(builders.get_model())
        scaler.scale_model()
        cx_scaled = net.complexity(builders.get_model())
        cfg_file = config.dump_cfg()
        print("Scaled config dumped to:", cfg_file)
        print("Original model complexity:", cx_orig)
        print("Scaled model complexity:", cx_scaled)
def setup_env():
    """Prepare the process environment before a training or testing run.

    The master process creates ``cfg.OUT_DIR`` and dumps the config.
    Logging is then configured, the config is logged, the numpy and torch
    RNGs are seeded from ``cfg.RNG_SEED`` and the cuDNN benchmark flag is
    set from ``cfg.CUDNN.BENCHMARK``.
    """
    # Only the master process touches the output directory
    is_master = dist.is_master_proc()
    if is_master:
        os.makedirs(cfg.OUT_DIR, exist_ok=True)
        config.dump_cfg()

    logging.setup_logging()
    config_text = "Config:\n{}".format(cfg)
    logger.info(config_text)

    # Seed both RNGs from the same configured value
    # (see RNG comment in core/config.py for discussion)
    seed = cfg.RNG_SEED
    np.random.seed(seed)
    torch.manual_seed(seed)

    # cuDNN autotuner setting comes straight from the config
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def main():
    """Scale the configured model and report complexity before and after.

    The scaled config is written out via ``config.dump_cfg`` and its
    location is printed together with both complexity measurements.
    """
    config.load_cfg_fom_args("Scale a model.")
    config.assert_and_infer_cfg()

    # Complexity must be measured on either side of the scaling step
    complexity_before = net.complexity(builders.get_model())
    scaler.scale_model()
    complexity_after = net.complexity(builders.get_model())

    dumped_cfg_path = config.dump_cfg()
    report = (
        ("Scaled config dumped to:", dumped_cfg_path),
        ("Original model complexity:", complexity_before),
        ("Scaled model complexity:", complexity_after),
    )
    for label, value in report:
        print(label, value)
def main():
    """Build the config from the command line, then launch training.

    Training is fanned out through ``mpu.multi_proc_run`` when more than
    one GPU is configured; otherwise ``single_proc_train`` runs in this
    process.
    """
    cli_args = parse_args()

    # File settings first, then command-line overrides on top
    cfg.merge_from_file(cli_args.cfg_file)
    cfg.merge_from_list(cli_args.opts)
    assert_and_infer_cfg()
    cfg.freeze()

    # The output dir must exist before the config is dumped
    os.makedirs(cfg.OUT_DIR, exist_ok=True)
    dump_cfg()

    num_gpus = cfg.NUM_GPUS
    if num_gpus > 1:
        # Multi-GPU: one process per GPU
        mpu.multi_proc_run(num_proc=num_gpus, fun=single_proc_train)
        return
    # Single GPU (or none): train in the current process
    single_proc_train()
def main(cfg):
    """Evaluate a saved model on the test split of the configured dataset.

    Relative paths in the config (``OUT_DIR``, ``DATASET.ROOT_DIR``,
    ``TEST.MODEL_PATH``) are resolved against the parent of the current
    working directory.

    Args:
        cfg: Experiment config. It is modified in place: ``OUT_DIR`` and
            ``DATASET.ROOT_DIR`` are rewritten as absolute paths and
            ``EXP_DIR`` is set to the experiment directory.
    """
    base_dir = os.path.abspath('..')

    # Getting the output directory ready. makedirs(exist_ok=True) avoids the
    # check-then-create race and also creates missing intermediate dirs.
    cfg.OUT_DIR = os.path.join(base_dir, cfg.OUT_DIR)
    os.makedirs(cfg.OUT_DIR, exist_ok=True)

    # Create "DATASET/MODEL TYPE" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    os.makedirs(dataset_out_dir, exist_ok=True)

    # Creating the experiment directory inside the dataset specific directory;
    # the dumped config and logs are stored there.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        # NOTE(review): fields are not zero-padded, so distinct times can map
        # to the same name (1:23:04 and 12:3:4 both give "1234").
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    try:
        os.mkdir(exp_dir)
    except FileExistsError:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    else:
        print("Experiment Directory is {}.\n".format(exp_dir))

    cfg.EXP_DIR = exp_dir

    # Save the config file in EXP_DIR
    dump_cfg(cfg)

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING TEST DATA ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(base_dir, cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    test_data, test_size = data_obj.getDataset(
        save_dir=cfg.DATASET.ROOT_DIR, isTrain=False, isDownload=True)

    print("\nDataset {} Loaded Sucessfully. Total Test Size: {}\n".format(
        cfg.DATASET.NAME, test_size))
    logger.info("Dataset {} Loaded Sucessfully. Total Test Size: {}\n".format(
        cfg.DATASET.NAME, test_size))

    # Preparing dataloaders for testing (the train batch size is reused here)
    test_loader = data_obj.getTestLoader(
        data=test_data,
        test_batch_size=cfg.TRAIN.BATCH_SIZE,
        seed_id=cfg.RNG_SEED)

    print("======== TESTING ========\n")
    logger.info("======== TESTING ========\n")
    model_path = os.path.join(base_dir, cfg.TEST.MODEL_PATH)
    test_acc = test_model(test_loader, model_path, cfg)
    # Console output is rounded; the log keeps the full-precision value
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))

    print('Check the test accuracy inside {}/stdout.log'.format(cfg.EXP_DIR))

    print("================================\n\n")
    logger.info("================================\n\n")
def main(cfg):
    """Run the active-learning loop: train, test, sample, repeat.

    Each episode trains on the current labeled set, tests the best
    checkpoint, and (except in the final episode) queries new samples from
    the unlabeled set and moves them into the labeled set. Relative paths
    in the config are resolved against the parent of the current working
    directory.

    Args:
        cfg: Experiment config. It is modified in place: ``RNG_SEED`` (when
            ``None``), ``OUT_DIR``, ``EXP_DIR``, ``EPISODE_DIR``,
            ``DATASET.ROOT_DIR`` and the ``ACTIVE_LEARNING.*SET_PATH``
            entries are written.
    """
    # Auto assign a RNG_SEED when not supplied a value
    if cfg.RNG_SEED is None:
        cfg.RNG_SEED = np.random.randint(100)

    base_dir = os.path.abspath('..')

    # Getting the output directory ready. makedirs(exist_ok=True) avoids the
    # check-then-create race and also creates missing intermediate dirs.
    cfg.OUT_DIR = os.path.join(base_dir, cfg.OUT_DIR)
    os.makedirs(cfg.OUT_DIR, exist_ok=True)

    # Create "DATASET/MODEL TYPE" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    os.makedirs(dataset_out_dir, exist_ok=True)

    # Creating the experiment directory inside the dataset specific directory;
    # all logs, labeled, unlabeled, validation sets are stored here.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        # NOTE(review): fields are not zero-padded, so distinct times can map
        # to the same name (1:23:04 and 12:3:4 both give "1234").
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    try:
        os.mkdir(exp_dir)
    except FileExistsError:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    else:
        print("Experiment Directory is {}.\n".format(exp_dir))

    cfg.EXP_DIR = exp_dir

    # Save the config file in EXP_DIR
    dump_cfg(cfg)

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(base_dir, cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(
        save_dir=cfg.DATASET.ROOT_DIR, isTrain=True, isDownload=True)
    test_data, test_size = data_obj.getDataset(
        save_dir=cfg.DATASET.ROOT_DIR, isTrain=False, isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    # Split the training data into labeled / unlabeled / validation partitions;
    # the partition files are written under EXP_DIR and then loaded back.
    lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(
        train_split_ratio=cfg.ACTIVE_LEARNING.INIT_L_RATIO,
        val_split_ratio=cfg.DATASET.VAL_RATIO,
        data=train_data,
        seed_id=cfg.RNG_SEED,
        save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(
        lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH,
        uSetPath=cfg.ACTIVE_LEARNING.USET_PATH,
        valSetPath=cfg.ACTIVE_LEARNING.VALSET_PATH)

    print(
        "Data Partitioning Complete. \nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n"
        .format(len(lSet), len(uSet), len(valSet)))
    logger.info(
        "Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(
            len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(
        indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(
        data=test_data,
        test_batch_size=cfg.TRAIN.BATCH_SIZE,
        seed_id=cfg.RNG_SEED)

    # Initialize the model.
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    print("AL Query Method: {}\nMax AL Episodes: {}\n".format(
        cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("AL Query Method: {}\nMax AL Episodes: {}\n".format(
        cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))

    # Episodes 0..MAX_ITER inclusive; the last one trains and tests only.
    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):

        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info(
            "======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        os.makedirs(episode_dir, exist_ok=True)
        cfg.EPISODE_DIR = episode_dir

        # Train model (the same model and optimizer objects are passed in
        # every episode)
        print("======== TRAINING ========")
        logger.info("======== TRAINING ========")

        best_val_acc, best_val_epoch, checkpoint_file = train_model(
            lSet_loader, valSet_loader, model, optimizer, cfg)

        print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(
            round(best_val_acc, 4), best_val_epoch))
        logger.info(
            "EPISODE {} Best Validation Accuracy: {}\tBest Epoch: {}\n".format(
                cur_episode, round(best_val_acc, 4), best_val_epoch))

        # Test best model checkpoint
        print("======== TESTING ========\n")
        logger.info("======== TESTING ========\n")
        test_acc = test_model(test_loader, checkpoint_file, cfg, cur_episode)
        print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
        logger.info("EPISODE {} Test Accuracy {}.\n".format(
            cur_episode, test_acc))

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            # Save current lSet, uSet in the final episode directory
            data_obj.saveSet(lSet, 'lSet', cfg.EPISODE_DIR)
            data_obj.saveSet(uSet, 'uSet', cfg.EPISODE_DIR)
            break

        # Active Sample: query with a fresh model loaded from the best
        # checkpoint of this episode
        print("======== ACTIVE SAMPLING ========\n")
        logger.info("======== ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_model = model_builder.build_model(cfg)
        clf_model = cu.load_checkpoint(checkpoint_file, clf_model)
        activeSet, new_uSet = al_obj.sample_from_uSet(
            clf_model, lSet, uSet, train_data)

        # Save the pre-sampling lSet and uSet plus the activeSet in the
        # episode directory (new_uSet itself is not passed to saveSets)
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloaders
        # for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(
            indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(
            indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        # NOTE(review): uSet_loader is never read in this function; the call
        # is kept in case getSequentialDataLoader has side effects — confirm
        # and drop if it does not.
        uSet_loader = data_obj.getSequentialDataLoader(
            indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print(
            "Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info(
            "Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")
def main(cfg):
    """Train and test an ensemble of models, then report the mean test accuracy.

    ``cfg.ENSEMBLE.NUM_MODELS`` models are built, each is trained on the
    same train/validation split and tested from its best checkpoint, and
    the average of the per-model test accuracies is printed and logged.
    Relative paths in the config are resolved against the parent of the
    current working directory.

    Args:
        cfg: Experiment config. It is modified in place: ``RNG_SEED`` (when
            ``None``), ``OUT_DIR``, ``EXP_DIR``, ``EPISODE_DIR`` and
            ``DATASET.ROOT_DIR`` are written.
    """
    # Auto assign a RNG_SEED when not supplied a value
    if cfg.RNG_SEED is None:
        cfg.RNG_SEED = np.random.randint(100)

    base_dir = os.path.abspath('..')

    # Getting the output directory ready. makedirs(exist_ok=True) avoids the
    # check-then-create race and also creates missing intermediate dirs.
    cfg.OUT_DIR = os.path.join(base_dir, cfg.OUT_DIR)
    os.makedirs(cfg.OUT_DIR, exist_ok=True)

    # Create "DATASET/MODEL TYPE" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    os.makedirs(dataset_out_dir, exist_ok=True)

    # Creating the experiment directory inside the dataset specific directory;
    # all logs and the train / validation sets are stored here.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        # NOTE(review): fields are not zero-padded, so distinct times can map
        # to the same name (1:23:04 and 12:3:4 both give "1234").
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    try:
        os.mkdir(exp_dir)
    except FileExistsError:
        print("Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n".format(exp_dir))
    else:
        print("Experiment Directory is {}.\n".format(exp_dir))

    cfg.EXP_DIR = exp_dir

    # Save the config file in EXP_DIR
    dump_cfg(cfg)

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(base_dir, cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(
        save_dir=cfg.DATASET.ROOT_DIR, isTrain=True, isDownload=True)
    test_data, test_size = data_obj.getDataset(
        save_dir=cfg.DATASET.ROOT_DIR, isTrain=False, isDownload=True)

    print("\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))
    logger.info("Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))

    # Split the training data into train / validation partitions; the
    # partition files are written under EXP_DIR and then loaded back.
    trainSet_path, valSet_path = data_obj.makeTVSets(
        val_split_ratio=cfg.DATASET.VAL_RATIO,
        data=train_data,
        seed_id=cfg.RNG_SEED,
        save_dir=cfg.EXP_DIR)

    trainSet, valSet = data_obj.loadTVPartitions(
        trainSetPath=trainSet_path, valSetPath=valSet_path)

    print("Data Partitioning Complete. \nTrain Set: {}, Validation Set: {}\n".format(len(trainSet), len(valSet)))
    logger.info("\nTrain Set: {}, Validation Set: {}\n".format(len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getIndexesDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(
        data=test_data,
        test_batch_size=cfg.TRAIN.BATCH_SIZE,
        seed_id=cfg.RNG_SEED)

    # Initialize the models
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = [model_builder.build_model(cfg) for _ in range(num_ensembles)]
    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train models
    print("======== ENSEMBLE TRAINING ========")
    logger.info("======== ENSEMBLE TRAINING ========")

    best_model_paths = []
    test_accs = []
    for model_num, model in enumerate(models, start=1):
        print("=== Training ensemble [{}/{}] ===".format(model_num, num_ensembles))

        # Construct the optimizer
        optimizer = optim.construct_optimizer(cfg, model)
        print("optimizer: {}\n".format(optimizer))
        logger.info("optimizer: {}\n".format(optimizer))

        # Each ensemble gets its own output directory. The name carries no
        # trailing whitespace: such directory names are not portable.
        cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR, 'model_{}'.format(model_num))

        # Train the model
        best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(
            trainSet_loader, valSet_loader, model, optimizer, cfg)
        best_model_paths.append(checkpoint_file)

        print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".format(model_num, round(best_val_acc, 4), best_val_epoch))
        logger.info("Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n".format(model_num, round(best_val_acc, 4), best_val_epoch))

        # Test the model
        print("=== Testing ensemble [{}/{}] ===".format(model_num, num_ensembles))
        test_acc = ensemble_test_model(test_loader, checkpoint_file, cfg, cur_episode=0)
        test_accs.append(test_acc)

        print("Test Accuracy by Model {}: {}.\n".format(model_num, round(test_acc, 4)))
        logger.info("Test Accuracy by Model {}: {}.\n".format(model_num, test_acc))

        # Reset EPISODE_DIR so the next model's directory is not nested
        cfg.EPISODE_DIR = cfg.EXP_DIR

    # Report the average of the per-model test accuracies gathered above
    print("======== ENSEMBLE TESTING ========\n")
    logger.info("======== ENSEMBLE TESTING ========\n")

    mean_test_acc = np.mean(test_accs)
    print("Average Ensemble Test Accuracy: {}.\n".format(round(mean_test_acc, 4)))
    logger.info("Average Ensemble Test Accuracy: {}.\n".format(mean_test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")