def parse_args():
    parser = argparse.ArgumentParser('Get Test Result of Hateful Memes')
    parser.add_argument('--cfg', type=str, help='path to answer net config yaml')
    parser.add_argument('--ckpt', type=str, help='path to checkpoint of answer net')
    parser.add_argument('--bs', type=int)
    parser.add_argument('--gpus', type=int, nargs='+')
    parser.add_argument('--model-dir', type=str, help='root path to store checkpoint')
    parser.add_argument('--result-path', type=str, help='path to store test result file.')
    parser.add_argument('--result-name', type=str)
    parser.add_argument('--split', default='test')

    args = parser.parse_args()

    if args.cfg is not None:
        update_config(args.cfg)
    if args.bs is not None:
        config.TEST.BATCH_IMAGES = args.bs
    if args.gpus is not None:
        config.GPUS = ','.join([str(gpu) for gpu in args.gpus])
    if args.split is not None:
        config.DATASET.TEST_IMAGE_SET = args.split
    if args.model_dir is not None:
        config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH)

    return args, config
def tune_vl_bert(config_path, pl_ckpt_path, num_samples=10, num_epochs=10, gpus_per_trial=2):
    # scheduler = ASHAScheduler(
    #     metric="loss",
    #     mode="min",
    #     max_t=num_epochs,
    #     grace_period=1,
    #     reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["lr", "weight_decay", "warmup_factor", "max_epoch", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"])
    param_config = {
        "lr": 6.25e-7,
        "weight_decay": tune.loguniform(1e-5, 1e-2),
        "batch_size": 4,
        "max_epoch": tune.choice([4, 6, 8, 10]),
        "warmup_factor": tune.uniform(0, 1),
        "warmup_steps": tune.uniform(100, 800),
    }
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="mean_accuracy",
        mode="max",
        perturbation_interval=2,
        hyperparam_mutations={
            # bounds ordered low -> high; tune.loguniform expects lower < upper
            "lr": tune.loguniform(6.25e-8, 6.25e-6),
            "batch_size": [1, 2, 3, 4],
        })
    update_config(config_path)
    model_base_cfg = copy.deepcopy(config)
    tune.run(
        partial(
            _tune,
            vl_bert_config=model_base_cfg,
            pl_ckpt_path=pl_ckpt_path,
            num_gpus=gpus_per_trial,
        ),
        resources_per_trial={
            "cpu": 4,
            "gpu": gpus_per_trial,
        },
        config=param_config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_vl_bert")
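# A minimal usage sketch for tune_vl_bert(); the yaml and checkpoint paths below
# are hypothetical placeholders, not files shipped with this repo.
tune_vl_bert(
    config_path="cfgs/hateful_memes/base_4x14G_fp32.yaml",  # hypothetical config path
    pl_ckpt_path="checkpoints/vl-bert-pretrained.ckpt",     # hypothetical checkpoint
    num_samples=10,
    num_epochs=10,
    gpus_per_trial=1,
)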
def parse_args():
    parser = argparse.ArgumentParser('Train Cognition Network')
    parser.add_argument('--cfg', type=str, help='path to config file')
    parser.add_argument('--model-dir', type=str, help='root path to store checkpoint')
    parser.add_argument('--log-dir', type=str, help='tensorboard log dir')
    parser.add_argument('--dist', help='whether to use distributed training',
                        default=False, action='store_true')
    parser.add_argument('--slurm', help='whether this is a slurm job',
                        default=False, action='store_true')
    parser.add_argument('--do-test', help='whether to generate csv result on test set',
                        default=False, action='store_true')
    parser.add_argument('--cudnn-off', help='disable cudnn',
                        default=False, action='store_true')
    # easy test pretrain model
    parser.add_argument('--partial-pretrain', type=str)
    parser.add_argument('--k-fold-i', type=int)

    args = parser.parse_args()

    if args.cfg is not None:
        update_config(args.cfg)
    if args.model_dir is not None:
        config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH)
    if args.partial_pretrain is not None:
        config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain

    if args.slurm:
        # derive the torch.distributed rendezvous info from the SLURM environment
        proc_id = int(os.environ['SLURM_PROCID'])
        ntasks = int(os.environ['SLURM_NTASKS'])
        node_list = os.environ['SLURM_NODELIST']
        num_gpus = torch.cuda.device_count()
        addr = subprocess.getoutput(
            'scontrol show hostname {} | head -n1'.format(node_list))
        os.environ['MASTER_PORT'] = str(29500)
        os.environ['MASTER_ADDR'] = addr
        os.environ['WORLD_SIZE'] = str(ntasks)
        os.environ['RANK'] = str(proc_id)
        os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)

    return args, config
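# Sketch of how the SLURM-derived environment variables set above are typically
# consumed. The actual process-group setup lives elsewhere in this codebase, so
# the backend choice and the helper name below are assumptions, not the repo's API.
import os
import torch
import torch.distributed as dist

def init_distributed_from_env():
    # init_method='env://' reads MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE
    # from the environment populated by parse_args() when --slurm is passed.
    dist.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(int(os.environ['LOCAL_RANK']))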
def train(config_path, pl_ckpt_path, **kwargs):
    with logger.catch(reraise=True):
        update_config(config_path)
        cfg = copy.deepcopy(config)
        _train(cfg, pl_ckpt_path, **kwargs)
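# Minimal usage sketch for train(); the paths are hypothetical placeholders and
# any extra keyword arguments are forwarded unchanged to _train().
train(
    config_path='cfgs/hateful_memes/base_4x14G_fp32.yaml',  # hypothetical
    pl_ckpt_path='checkpoints/vl-bert-pretrained.ckpt',     # hypothetical
)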