def main():
    """Evaluate saved models on ImageNetAnimals10k with threshold-bias metrics."""
    parser = argparse.ArgumentParser()
    parser.add_argument("model", nargs="+")
    parser.add_argument("--nocache", action="store_true")
    parser.add_argument("--n_workers", type=int, default=12)
    parser.add_argument("--test", action="store_true")
    args = parser.parse_args()

    py_utils.add_stdout_logger()

    # Select the evaluation split (full split: no size cap).
    split = "test" if args.test else "dev"
    ds = ImageNetAnimals10k(split, None)

    evaluator = BiasThresholdEvaluator([0.05, 0.1, 0.15, 0.2, 0.3, 0.8, 1.0])
    evaluator_name = "text-thresh-eval-v2"
    evaluator.preprocess([ds])

    models = extract_models(args.model)
    total_runs = sum(len(runs) for _, runs in models.values())
    if total_runs == 0:
        print("No models selected")
        return

    get_evaluation(models, args.nocache, ds, evaluator, evaluator_name, 128,
                   sort=False, n_workers=args.n_workers)
def main():
    """Launch additional training runs for each given model output directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir", nargs="+")
    parser.add_argument("--n_processes", type=int, default=4)
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--seed", type=int)
    parser.add_argument("--nruns", type=int, default=1)
    args = parser.parse_args()

    py_utils.add_stdout_logger()

    for output_dir in args.output_dir:
        for _ in range(args.nruns):
            # Re-load the trainer/model configs for every run so each run
            # starts from a freshly-constructed state.
            trainer: Trainer = load_config.load_config(
                join(output_dir, "trainer.json"))
            model: Model = load_config.load_config(
                join(output_dir, "model.json"))
            subdir = train_utils.select_subdir(output_dir)
            logging.info(f"Starting run for directory {subdir}")
            trainer.training_run(model, subdir, args.seed, args.n_processes,
                                 fp16=args.fp16)
def main():
    """Evaluate saved MNLI models on dev/test/HANS, skipping cached results.

    Per-run evaluations are cached under `evaluator_name`, so re-running
    only scores models that have not been evaluated yet.
    """
    # Re-use if we are evaluating multiple models
    load_word_vectors.GLOBAL_CACHE = {}
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir", nargs="+")
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--dataset", choices=["hans", "dev", "test"], default="dev")
    args = parser.parse_args()
    py_utils.add_stdout_logger()

    if args.dataset == "test":
        ds = MnliDevUnmatched()
    elif args.dataset == "dev":
        ds = MnliDevMatched()
    else:
        ds = Hans()
    ds.cache = True

    models = extract_models(args.output_dir)
    if len(models) == 0:
        print("No models found")
        return

    # HANS uses the plain ensemble evaluator; dev/test use the hard/easy split
    if args.dataset == "hans":
        evaluator = ClfEnsembleEvaluator(output_format="{output}-{metric}")
        evaluator_name = "ensemble-eval-v1"
    else:
        evaluator = ClfHardEasyEvaluator(prefix_format="{output}-{metric}/{split}")
        evaluator_name = "hard-easy-eval-v1"

    models, all_stats = get_cached_evaluations(models, ds, evaluator_name)
    if len(models) == 0:
        print("All models were cached")
        return
    elif len(all_stats) == 0:
        logging.info("No models were cached")
    else:
        # BUG FIX: was `sum(len(x) for x in all_stats.items())`, which sums the
        # length of (key, value) pairs (always 2 each); count cached runs via
        # the values instead.
        n_cached_runs = sum(len(runs) for runs in all_stats.values())
        logging.info(f"{len(all_stats)} models ({n_cached_runs}) were cached")

    evaluator.preprocess([ds])
    for name, (model_dir, runs) in models.items():
        logging.info(f"Evaluating model: {name} ({len(runs)} runs)")
        for run in runs:
            run_evaluation(run, ds, args.batch_size, evaluator, evaluator_name,
                           cache=True, cache_model_output=True, n_processes=4)
def main():
    """Evaluate MNIST-bias models on in-distribution and OOD test sets.

    If --dataset is not given, the bias to evaluate on is inferred from the
    train dataset recorded in each model's trainer.json; all selected models
    must have been trained on the same dataset.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Directory of models of evaluate")
    parser.add_argument(
        "--dataset", choices=list(datasets_fns.keys()),
        help="Dataset to evaluate on, if not set if will be inferred"
             " based on the what the models were trained on")
    parser.add_argument("--nocache", action="store_true")
    parser.add_argument("--test", action="store_true",
                        help="Evaluated on the test data")
    args = parser.parse_args()
    is_test = args.test
    py_utils.add_stdout_logger()

    models = py_utils.extract_models(args.model)
    if len(models) == 0:
        logging.info("No models found")
        return 0

    if args.dataset is not None:
        fn = datasets_fns[args.dataset]
    else:
        # Infer the dataset from the train config saved with each model
        all_train_ds = None
        for model_dir, _ in models.values():
            # BUG FIX: `json.load(open(...))` leaked the file handle
            with open(join(model_dir, "trainer.json")) as f:
                trainer = json.load(f)
            train_ds = trainer["train_dataset"]["name"]
            if all_train_ds is None:
                all_train_ds = train_ds
            elif all_train_ds != train_ds:
                raise ValueError("No dataset given, and unable to infer seems "
                                 "models were trained on different datasets")
        logging.info(
            f"All models were trained on {all_train_ds}, so testing on the same bias"
        )
        # Map each recorded train-dataset name to its eval-set builder
        train_ds_to_fn = {
            "MNISTPatches": get_patch,
            "MNISTDependent": get_split,
            "MNISTBackgroundColor": get_background,
        }
        if all_train_ds not in train_ds_to_fn:
            # BUG FIX: was a bare `raise ValueError()` with no message
            raise ValueError(f"Unable to handle train dataset {all_train_ds}")
        fn = train_ds_to_fn[all_train_ds]

    # fn(is_test, in_distribution) -> dataset
    id_test, ood_test = fn(is_test, True), fn(is_test, False)
    evaluator = ClfEnsembleEvaluator()
    logging.info("Evaluating OOD Test")
    get_evaluation(models, args.nocache, ood_test, evaluator, "ensemble-eval-v1",
                   128, sort=False, progress_bar=False)
    logging.info("Evaluating ID Test")
    get_evaluation(models, args.nocache, id_test, evaluator, "ensemble-eval-v1",
                   128, sort=False, progress_bar=False)
def main():
    """Train a (possibly debiased) classifier on a synthetic MNIST bias task.

    The bias is chosen with --dataset (patch/split/background); --mode (from
    `add_train_args`) selects how the bias head is combined with the main
    high-capacity model (null head, adversary, oracle, or rescaled ensemble).
    Supports multiple runs per invocation via --nruns.
    """
    py_utils.add_stdout_logger()
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", choices=["patch", "split", "background"],
                        required=True, help="Bias to train on")
    add_train_args(parser, entropy_w=False, default_adv_penalty=None,
                   default_batch_size=1024, default_epochs=100,
                   default_entropy_penalty=None, lc_weight_default=None)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--nruns", type=int, default=1)
    args = parser.parse_args()
    dataset = args.dataset
    # Per-bias dataset class, number of classes, and image width
    if dataset == "patch":
        ds = MNISTPatches
        n_classes = 10
        w = 30
    elif dataset == "background":
        ds = MNISTBackgroundColor
        w = 28
        n_classes = 10
    elif dataset == "split":
        ds = MNISTDependent
        n_classes = 4
        w = 30
    else:
        raise NotImplementedError(f"Unknown dataset {dataset}")
    p = 0.9  # bias strength for the train/id sets; 1/n_classes means unbiased
    n_per_class = 200
    train = ds(p, True, (0, n_per_class))
    opt = SGD(args.lr, momentum=0.9)
    eval_sets = [
        # In-distribution eval: same bias strength as train
        EvalDataset(ds(p, True, (1400, 2400)),
                    TorchDataIterator(SubsetSampler(None, args.batch_size)), "id"),
        # Out-of-distribution eval: uniform (unbiased) label/bias pairing
        EvalDataset(ds(1. / n_classes, True, (1400, 2400)),
                    TorchDataIterator(SubsetSampler(None, args.batch_size)), "od"),
    ]
    train.cache = True
    # NOTE(review): this loop variable shadows the dataset class `ds`;
    # harmless here since `ds` is not used afterwards, but worth renaming.
    for ds in eval_sets:
        ds.cache = True

    def build_model():
        # Build a fresh model so each run starts from new parameters
        hc = get_high_capacity_model(w, n_classes)
        if args.mode == "none":
            # An ensemble with a Null predictor
            predictor = ClfArgminEnsemble(
                [
                    ClfHead(predictor=NullMNISTPredictor(n_classes), head_name="bias"),
                    ClfHead(predictor=hc, head_name="debiased")
                ],
                n_classes,
            )
        elif args.mode == "adv":
            # Adversarial debiasing; per-bias default hyper-parameters below
            if args.adversary_loss is None:
                if dataset == "patch":
                    adv_loss = 0.01
                elif dataset == "background":
                    adv_loss = 0.08
                elif dataset == "split":
                    adv_loss = 0.01
                else:
                    raise RuntimeError()
            else:
                adv_loss = args.adversary_loss
            if args.lc_weight is None:
                # Default depends on the bias
                if dataset == "patch":
                    lc_w = 0.7
                elif dataset == "background":
                    lc_w = 0.05
                elif dataset == "split":
                    lc_w = 0.02
                else:
                    raise RuntimeError()
            else:
                lc_w = args.lc_weight
            predictor = ClfBiAdversary(hc, get_low_capacity_model(w, n_classes),
                                       n_classes, adv_w=adv_loss, bias_loss=lc_w,
                                       main_loss=0.0, joint_loss=1.0,
                                       use_y_values=True, joint_adv=False)
        elif args.mode == "oracle":
            # An ensemble with a gold bias-predictor
            bias = FromBiasFeaturePredictor(p, n_classes)
            predictor = ClfArgminEnsemble(
                [
                    ClfHead(predictor=bias, head_name="bias"),
                    ClfHead(predictor=hc, head_name="debiased")
                ],
                n_classes,
            )
        else:
            # Remaining modes are ensemble variants that differ only in the
            # rescaler applied to each head's logits.
            if args.mode.startswith("mce"):
                rescaler = lambda: ArgminTransformFunction(
                    AffineNLL(
                        n_classes, n_classes, NumpyOptimizer(), residual=True,
                        penalty=L2NormPenalty(0.002), fix_last_bias_to_zero=True,
                    ))
            elif args.mode == "noci":
                rescaler = lambda: None
            elif args.mode == "nobp":
                # Same as mce, but without backprop through the argmin
                rescaler = lambda: ArgminTransformFunction(AffineNLL(
                    n_classes, n_classes, NumpyOptimizer(), residual=True,
                    penalty=L2NormPenalty(0.002), fix_last_bias_to_zero=True,
                ), backprop_argmin=False)
            else:
                raise ValueError("Unknown mode: " + args.mode)
            predictor = ClfArgminEnsemble([
                ClfHead(
                    predictor=get_low_capacity_model(w, n_classes),
                    head_name="bias",
                    rescaler=rescaler(),
                    nll_penalty=0.2 if args.lc_weight is None else args.lc_weight,
                ),
                ClfHead(
                    predictor=hc,
                    head_name="debiased",
                    rescaler=rescaler(),
                )
            ], n_classes)
        return ImageClfModel(predictor)

    evaluator = ClfEnsembleEvaluator()
    # NOTE(review): the rescaler branch above accepts any mode starting with
    # "mce", but this check is exact-match only — confirm modes like "mce-*"
    # should really train without the rescale-fitting hook.
    if args.mode in {"mce", "nobp"}:
        hook = FitRescaleParameters(1024, None, sort=False)
    else:
        hook = None
    trainer = Trainer(
        opt, train, eval_sets,
        train_eval_iterator=TorchDataIterator(
            SubsetSampler(None, args.batch_size)),
        train_iterator=TorchDataIterator(
            StratifiedSampler(args.batch_size, n_repeat=10)),
        num_train_epochs=args.epochs,
        evaluator=evaluator,
        pre_eval_hook=hook,
        tb_factor=args.batch_size / 256,
        save_each_epoch=False,
        progress_bar=True,
        eval_progress_bar=False,
        epoch_progress_bar=False,
        early_stopping_criteria=StoppingPoint("train", "nll/joint", 3e-4, 3),
        log_to_tb=False,
    )
    for r in range(args.nruns):
        if args.nruns > 1:
            print("")
            print("")
            print("*" * 10 + f" STARTING RUN {r+1}/{args.nruns} " + "*" * 10)
        # Build a model for each run to ensure it is fully reset
        model = build_model()
        if args.output_dir:
            if r == 0:
                # Only wipe/initialize the output directory on the first run
                train_utils.clear_if_nonempty(args.output_dir)
                train_utils.init_model_dir(args.output_dir, trainer, model)
            subdir = train_utils.select_subdir(args.output_dir)
        else:
            subdir = None
        if args.init_only:
            return
        if subdir is not None:
            logging.info(f"Start run for {subdir}")
        if args.time:
            t0 = perf_counter()
        else:
            t0 = None
        try:
            if subdir is not None:
                # Mirror console output into the run directory
                with open(join(subdir, "console.out"), "w") as f:
                    trainer.training_run(model, subdir, no_cuda=True, print_out=f)
            else:
                trainer.training_run(model, subdir, no_cuda=True)
        except Exception as e:
            # With multiple runs, log the failure and continue to the next run;
            # a single run (or Ctrl-C) re-raises.
            if args.nruns == 1 or isinstance(e, KeyboardInterrupt):
                raise e
            logging.warning("Error during training: " + str(e))
            continue
        if args.time:
            logging.info(f"Training took {perf_counter() - t0:.3f} seconds")
def main():
    """Train a BERT-based MNLI model with one of the supported debias modes.

    --mode (from `add_train_args`) selects how the low-capacity bias model is
    combined with the BERT main model: "adv" (adversary), "oracle"
    (hypothesis-only bias mixin), or an ensemble variant
    (mce/noci/nobp/none) that differs only in the rescaler.
    """
    parser = argparse.ArgumentParser()
    add_train_args(parser, default_entropy_penalty=0.1, default_adv_penalty=0.3,
                   default_batch_size=256, default_epochs=3,
                   lc_weight_default=None)
    args = parser.parse_args()

    lc_weight = args.lc_weight
    if lc_weight is None:
        # Default depends on the mode
        if args.mode == "adv":
            lc_weight = 0.3
        else:
            lc_weight = 0.2

    dbg = args.debug
    py_utils.add_stdout_logger()

    main_model = FromBertPredictor(FromPooled(FullyConnected(768, 3, None)))
    # Low-capacity (bias) model per mode
    if args.mode in {"mce", "noci", "nobp", "adv"}:
        lc_model = decatt_bias(150 if dbg else 400, 200)
    elif args.mode == "oracle":
        lc_model = None
    elif args.mode == "none":
        lc_model = FromEmbeddingPredictor(NullClfPredictor(3))
    else:
        raise NotImplementedError(args.mode)

    if args.mode == "adv":
        predictor = ClfBiAdversary(main_model, lc_model, 3, args.adversary_loss,
                                   joint_loss=1.0, bias_loss=lc_weight,
                                   use_y_values=True, joint_adv=False)
    elif args.mode == "oracle":
        predictor = ClfBiasMixinEnsemble(ExtractPooled(), 3,
                                         MnliHypothesisOnlyBias(), 768,
                                         args.entropy_penalty)
    else:
        if args.mode in {"mce", "nobp"}:
            # nobp: same rescaler but no backprop through the argmin
            rescaler = lambda: ArgminTransformFunction(AffineNLL(
                3, 3, NumpyOptimizer(), residual=True,
                penalty=L2NormPenalty(0.002), fix_last_bias_to_zero=True,
            ), backprop_argmin=args.mode == "mce")
        elif args.mode in {"noci", "none"}:
            rescaler = lambda: None
        else:
            raise RuntimeError()
        predictor = ClfArgminEnsemble(
            [
                ClfHead(
                    lc_model,
                    head_name="bias",
                    rescaler=rescaler(),
                    nll_penalty=lc_weight,
                ),
                ClfHead(
                    main_model,
                    head_name="debiased",
                    rescaler=rescaler(),
                )
            ],
            n_classes=3,
            add_prior=False,  # prior is uniform
            no_rescale_on_first_step=True)

    # Parameter group for the bias head / encoder with its own constant LR.
    # BUG FIX: the regex patterns were non-raw strings with `\.` escapes,
    # which are invalid escape sequences (SyntaxWarning on modern Python);
    # raw strings keep the identical pattern value without the warning.
    bias_set = [
        ParameterSet(
            "predictor",
            r"(encoder\..*)|(predictor\.heads\.0\..*)|(predictor\.(bias|main_to_bias).*)",
            dict(lr=1e-3, e=1e-8, weight_decay=0.0),
            ConstantLearningRate())
    ]
    enc = WordAndCharEncoder(
        "random" if dbg else "crawl-300d-2M",
        None,
        24,
        layers.Conv1D(24, 100, 5),
        MaxPooling(),
    )
    model = BertAndEmbedModel("bert-base-uncased", 128, NltkAndPunctTokenizer(),
                              enc, predictor)
    opt = Adam(
        lr=5e-5, e=1e-6, weight_decay=0.01, max_grad_norm=1.0,
        schedule=LinearTriangle(0.1),
        alternative_sets=bias_set + [
            ParameterSet("no-weight-decay", r".*(\.bias|LayerNorm\.weight)$",
                         dict(weight_decay=0.0))
        ])

    n_final_eval = 512 if dbg else 4096
    dev = MnliDevMatched(512 if dbg else None)
    train = MnliTrain(4096 if dbg else None)
    evaluator = ClfHardEasyEvaluator(prefix_format="{output}-{metric}/{split}")
    batch_size = args.batch_size
    trainer = Trainer(
        opt, train,
        [
            EvalDataset(dev, TorchDataIterator(SortedBatchSampler(batch_size)),
                        "dev"),
        ],
        pre_eval_hook=FitRescaleParameters(256, n_final_eval),
        train_eval_iterator=TorchDataIterator(
            SubsetSampler(None if dbg else 10000, batch_size, True)),
        train_iterator=TorchDataIterator(StratifiedSampler(batch_size)),
        num_train_epochs=args.epochs,
        evaluator=evaluator,
        tb_factor=batch_size / 256.,
        evals_to_print=[
            "bias-acc/ind", "bias-acc/ood", "debiased-acc/ind",
            "debiased-acc/ood", "joint-acc/ind", "joint-acc/ood"
        ])

    if args.init_only:
        train_utils.init_model_dir(args.output_dir, trainer, model)
    else:
        trainer.train(model, args.output_dir, args.seed, args.n_processes,
                      fp16=args.fp16, no_cuda=args.nocuda)
def main(args=None, init_only=False):
    """Train a bias ensemble ResNet on ImageNetAnimals10k.

    :param args: Optional argument list to parse instead of sys.argv
    :param init_only: If True, just initialize the model directory and return
    """
    py_utils.add_stdout_logger()
    parser = argparse.ArgumentParser()
    add_train_args(parser, default_entropy_penalty=0.1,
                   default_adv_penalty=0.005, default_epochs=38,
                   default_batch_size=512, lc_weight_default=None)
    parser.add_argument("--n_workers", type=int, default=12,
                        help="N workers to use when loading images")
    args = parser.parse_args(args)

    if args.lc_weight is None:
        # Default low-capacity weight depends on the mode
        if args.mode == "adv":
            lc_weight = 0.3
        else:
            lc_weight = 0.2
    else:
        lc_weight = args.lc_weight

    dbg = args.debug
    n_classes = 6

    # BUG FIX: "nobp" was missing from this set, so that mode fell through to
    # the final `raise RuntimeError()` even though the rescaler below
    # explicitly supports it via `backprop_argmin=args.mode != "nobp"`.
    if args.mode in {"mce", "noci", "nobp"}:
        if args.mode == "noci":
            rescaler = lambda: None
        else:
            rescaler = lambda: ArgminTransformFunction(AffineNLL(
                n_classes, n_classes, NumpyOptimizer(), residual=True,
                penalty=L2NormPenalty(0.002), fix_last_bias_to_zero=True,
            ), backprop_argmin=args.mode != "nobp")
        predictor = ClfArgminEnsemble([
            ClfHead(
                get_low_capacity(),
                head_name="bias",
                rescaler=rescaler(),
                nll_penalty=lc_weight
            ),
            ClfHead(
                get_high_capacity(),
                head_name="debiased",
                rescaler=rescaler(),
            )
        ], n_classes, add_prior=False)
    elif args.mode == "none":
        # Ensemble with a null bias head (no effective debiasing)
        predictor = ClfArgminEnsemble([
            ClfHead(NullResnetPredictor(6), head_name="bias", rescaler=None),
            ClfHead(get_high_capacity(), head_name="debiased", rescaler=None)
        ], n_classes, add_prior=False)
    elif args.mode == "oracle":
        predictor = ClfBiasMixinEnsemble(
            ExtractLastEmbeddings(), 6, ResnetOracleBias(), 512,
            args.entropy_penalty
        )
    elif args.mode == "adv":
        predictor = ClfBiAdversary(
            get_high_capacity(), get_low_capacity(), n_classes,
            args.adversary_loss, joint_loss=1.0, bias_loss=lc_weight,
            use_y_values=True, joint_adv=False,
        )
    else:
        # BUG FIX: was a bare `raise RuntimeError()` with no message
        raise RuntimeError(f"Unknown mode {args.mode}")

    evaluator = ClfEnsembleEvaluator()
    model = ResNetModel(
        predictor,
        arch="resnet18",
        from_pretrained=False,
        resize=256,
        eval_transform=ImageNetEvalTransform(224, resize=False),
        train_transform=RandomCropTransform(224),
    )
    opt = SGD(0.02, momentum=0.9,
              schedule=PiecewiseLinear([args.epochs - 5], 0.3))

    num_workers = args.n_workers
    n_train_workers = num_workers
    test_batch_size = 512
    train = ImageNetAnimals10k("train", 300 if args.debug else None)
    dev = ImageNetAnimals10k("dev", 150 if args.debug else None)
    eval_sets = [
        EvalDataset(
            dev,
            TorchDataIterator(
                SubsetSampler(None if args.debug else 12000, test_batch_size),
                pin_memory=True, num_workers=num_workers),
            "dev"),
    ]
    trainer = Trainer(
        opt, train, eval_sets,
        train_eval_iterator=TorchDataIterator(
            SubsetSampler(None if args.debug else 8000, args.batch_size),
            num_workers=n_train_workers, pin_memory=True),
        # NOTE(review): the train iterator uses `test_batch_size` (512) rather
        # than `args.batch_size`; identical with the default batch size, but
        # confirm this is intended when --batch_size is overridden.
        train_iterator=TorchDataIterator(
            StratifiedSampler(test_batch_size, n_repeat=2),
            pin_memory=True, num_workers=n_train_workers),
        num_train_epochs=3 if dbg else args.epochs,
        evaluator=evaluator,
        tb_factor=args.batch_size / 256,
        pre_eval_hook=FitRescaleParameters(
            test_batch_size, None if args.debug else 4096, sort=False),
        save_best_model=("dev", "acc/joint"),
        eval_on_epochs=2,
        split_then_collate=True
    )

    if init_only or args.init_only:
        train_utils.init_model_dir(args.output_dir, trainer, model)
    else:
        trainer.train(model, args.output_dir, args.seed, args.n_processes,
                      fp16=args.fp16, no_cuda=args.nocuda)