def main(args):
    if args.wandb:
        import wandb

        wandb.init()
        logdir = args.logdir + "/" + wandb.run.name
    else:
        logdir = args.logdir
    set_global_seed(args.seed)
    datasets = load_dataset(args.dataset)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    datasets = datasets.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=128),
        batched=True,
    )
    datasets = datasets.map(lambda e: {"labels": e["label"]}, batched=True)
    datasets.set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
    )
    loaders = {
        "train": DataLoader(datasets["train"], batch_size=args.batch_size, shuffle=True),
        "valid": DataLoader(datasets["test"], batch_size=args.batch_size),
    }
    metric_callback = LoaderMetricCallback(
        metric=HFMetric(metric=load_metric("accuracy")),
        input_key="logits",
        target_key="labels",
    )
    teacher_model = AutoModelForSequenceClassification.from_pretrained(
        args.model, num_labels=args.num_labels
    )
    callbacks = [metric_callback, OptimizerCallback(metric_key="loss")]
    runner = HFRunner()
    runner.train(
        model=teacher_model,
        loaders=loaders,
        optimizer=torch.optim.Adam(teacher_model.parameters(), lr=args.lr),
        callbacks=callbacks,
        num_epochs=args.num_epochs,
        valid_metric="accuracy",
        minimize_valid_metric=False,
        logdir=logdir,
        valid_loader="valid",
        verbose=args.verbose,
        seed=args.seed,
    )
    if args.wandb:
        import csv

        with open(logdir + "/valid.csv") as fi:
            reader = csv.DictReader(fi)
            accuracy = []
            for row in reader:
                # The CSV log may repeat the header row; skip those lines.
                if row["accuracy"] == "accuracy":
                    continue
                accuracy.append(float(row["accuracy"]))
        wandb.log({"accuracy": max(accuracy[-args.num_epochs:])})
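# A minimal CLI entry point for main() above; a sketch, not the project's
# actual launcher. The flag names are inferred from the fields accessed on
# `args` (wandb, logdir, seed, dataset, model, batch_size, num_labels, lr,
# num_epochs, verbose); the defaults mirror the values used in test_hf in
# this file and are illustrative only.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="ag_news")
    parser.add_argument("--model", default="google/bert_uncased_L-4_H-128_A-2")
    parser.add_argument("--num_labels", type=int, default=4)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--num_epochs", type=int, default=3)
    parser.add_argument("--logdir", default="./logs")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--wandb", action="store_true")
    main(parser.parse_args())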
def get_callbacks(self, stage: str): return { "criterion": CriterionCallback(metric_key="loss", input_key="logits", target_key="targets"), "optimizer": OptimizerCallback(metric_key="loss"), # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"), "checkpoint": CheckpointCallback(self._logdir, loader_key="valid", metric_key="loss", minimize=True, save_n_best=3), "test_nn_module": ModuleTypeChecker(), "test_device": DeviceCheckCallback(self._device, logger=logger), "test_loss_minimization": LossMinimizationCallback("loss", logger=logger), "test_logits_type": TensorTypeChecker("logits"), # "loss_type_checker": TensorTypeChecker("loss", True), }
def test_pruning():
    import torch
    from torch.utils.data import DataLoader
    from torchvision.transforms import ToTensor

    from catalyst.callbacks import (
        AccuracyCallback,
        CriterionCallback,
        OptimizerCallback,
        PruningCallback,
    )
    from catalyst.contrib.datasets import MNIST

    from compressors.models import MLP
    from compressors.pruning.runners import PruneRunner
    from compressors.utils.data import TorchvisionDatasetWrapper as Wrp

    model = MLP(num_layers=3)
    datasets = {
        "train": Wrp(MNIST("./data", train=True, download=True, transform=ToTensor())),
        "valid": Wrp(MNIST("./data", train=False, transform=ToTensor())),
    }
    loaders = {
        dl_key: DataLoader(dataset, shuffle=dl_key == "train", batch_size=32)
        for dl_key, dataset in datasets.items()
    }
    optimizer = torch.optim.Adam(model.parameters())
    runner = PruneRunner(num_sessions=10)
    runner.train(
        model=model,
        loaders=loaders,
        optimizer=optimizer,
        criterion=torch.nn.CrossEntropyLoss(),
        callbacks=[
            PruningCallback(
                pruning_fn="l1_unstructured",
                amount=0.2,
                remove_reparametrization_on_stage_end=False,
            ),
            OptimizerCallback(metric_key="loss"),
            CriterionCallback(input_key="logits", target_key="targets", metric_key="loss"),
            AccuracyCallback(input_key="logits", target_key="targets"),
        ],
        logdir="./pruned_model",
        valid_loader="valid",
        valid_metric="accuracy",
        minimize_valid_metric=False,
        check=True,
    )
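# What PruningCallback automates, done by hand with torch.nn.utils.prune; a
# minimal sketch, assuming the callback wraps these standard PyTorch calls.
# "l1_unstructured" zeroes the `amount` fraction of weights with the smallest
# L1 magnitude and installs a reparametrization (weight_orig + weight_mask);
# prune.remove() bakes the mask in permanently, which the test above
# deliberately skips via remove_reparametrization_on_stage_end=False.
import torch
import torch.nn.utils.prune as prune

linear = torch.nn.Linear(10, 10)
prune.l1_unstructured(linear, name="weight", amount=0.2)
assert float((linear.weight == 0).float().mean()) >= 0.2  # ~20% of weights masked
prune.remove(linear, "weight")  # make the pruned weights permanent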
def test_distil():
    from itertools import chain

    import torch
    from torch.utils.data import DataLoader
    from torchvision import transforms as T

    from catalyst.callbacks import AccuracyCallback, OptimizerCallback
    from catalyst.contrib.datasets import MNIST

    from compressors.distillation.runners import EndToEndDistilRunner
    from compressors.models import MLP
    from compressors.utils.data import TorchvisionDatasetWrapper as Wrp

    teacher = MLP(num_layers=4)
    student = MLP(num_layers=3)
    datasets = {
        "train": Wrp(MNIST("./data", train=True, download=True, transform=T.ToTensor())),
        "valid": Wrp(MNIST("./data", train=False, transform=T.ToTensor())),
    }
    loaders = {
        dl_key: DataLoader(dataset, shuffle=dl_key == "train", batch_size=32)
        for dl_key, dataset in datasets.items()
    }
    optimizer = torch.optim.Adam(chain(teacher.parameters(), student.parameters()))
    runner = EndToEndDistilRunner(hidden_state_loss="mse", num_train_teacher_epochs=5)
    runner.train(
        model=torch.nn.ModuleDict({"teacher": teacher, "student": student}),
        loaders=loaders,
        optimizer=optimizer,
        num_epochs=4,
        callbacks=[
            OptimizerCallback(metric_key="loss"),
            AccuracyCallback(input_key="logits", target_key="targets"),
        ],
        valid_metric="accuracy01",
        minimize_valid_metric=False,
        logdir="./logs",
        valid_loader="valid",
        criterion=torch.nn.CrossEntropyLoss(),
        check=True,
    )
def get_callbacks(self, stage: str): return { "criterion": CriterionCallback( metric_key="loss", input_key="logits", target_key="targets" ), "optimizer": OptimizerCallback(metric_key="loss"), # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"), "checkpoint": CheckpointCallback( self._logdir, loader_key="valid", metric_key="loss", minimize=True, save_n_best=3 ), "test_nn_parallel_data_parallel": DataParallelTypeChecker(), "test_loss_minimization": LossMinimizationCallback("loss", logger=logger), "test_logits_type": OPTTensorTypeChecker("logits", self._opt_level), }
def get_callbacks(self, stage: str): return { "criterion": CriterionCallback( metric_key="loss", input_key="logits", target_key="targets" ), "optimizer": OptimizerCallback(metric_key="loss"), # "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss"), "checkpoint": CheckpointCallback( self._logdir, loader_key="valid", metric_key="loss", minimize=True, save_n_best=3 ), "test_nn_parallel_distributed_data_parallel": DistributedDataParallelTypeChecker(), "test_loss_minimization": LossMinimizationCallback("loss", logger=logger), "test_world_size": WorldSizeCheckCallback(NUM_CUDA_DEVICES, logger=logger), }
def test_fast_zero_grad():
    model = nn.Linear(10, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss()
    batch_size = 3
    inp = torch.randn(batch_size, 10)
    target = torch.FloatTensor(batch_size, 2).uniform_()

    callback = OptimizerCallback(metric_key="loss", use_fast_zero_grad=True)

    loss1 = criterion(model(inp), target)
    loss1_value = loss1.detach().item()
    runner = DummyRunner(loss1, optimizer)
    callback.on_stage_start(runner)
    callback.on_epoch_start(runner)
    callback.on_batch_end(runner)

    loss2 = criterion(model(inp), target)
    loss2_value = loss2.detach().item()
    runner.batch_metrics = {"loss": loss2}
    callback.on_epoch_start(runner)
    callback.on_batch_end(runner)

    assert loss1_value > loss2_value
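# The use_fast_zero_grad flag exercised above presumably drops gradient
# tensors (setting .grad to None) instead of zero-filling them in place,
# which skips a memset per parameter; recent PyTorch exposes the same idea
# as optimizer.zero_grad(set_to_none=True). A minimal sketch of that idea,
# under this assumption; the helper name here is ours, not the library's.
import torch


def fast_zero_grad(model: torch.nn.Module) -> None:
    """Drop gradient tensors instead of zero-filling them in place."""
    for param in model.parameters():
        param.grad = None


model = torch.nn.Linear(10, 2)
model(torch.randn(3, 10)).sum().backward()
fast_zero_grad(model)
assert all(p.grad is None for p in model.parameters())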
def test_tracer_callback():
    """
    Tests a feature of `TracingCallback` for model tracing during training
    """
    logdir = "./logs"
    dataset_root = "./data"
    loaders = _get_loaders(root=dataset_root, batch_size=4, num_workers=1)
    images, targets = next(iter(loaders["train"]))
    _, c, h, w = images.shape
    input_shape = (c, h, w)

    model = _TracedNet(input_shape)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters())

    method_name = "forward"
    mode = "eval"
    requires_grad = False
    checkpoint_name = "best"
    opt_level = None

    trace_name = get_trace_name(
        method_name=method_name,
        mode=mode,
        requires_grad=requires_grad,
        additional_string=checkpoint_name,
    )
    tracing_path = Path(logdir) / "trace" / trace_name

    criterion_callback = CriterionCallback()
    optimizer_callback = OptimizerCallback()
    tracer_callback = TracingCallback(
        metric="loss",
        minimize=False,
        trace_mode=mode,
        mode=checkpoint_name,
        do_once=True,
        method_name=method_name,
        requires_grad=requires_grad,
        opt_level=opt_level,
    )
    test_callback = _OnStageEndCheckModelTracedCallback(
        path=tracing_path,
        inputs=images,
    )
    callbacks = collections.OrderedDict(
        loss=criterion_callback,
        optimizer=optimizer_callback,
        tracer_callback=tracer_callback,
        test_callback=test_callback,
    )

    runner = SupervisedRunner(input_key="x")
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        callbacks=callbacks,
        check=True,
        verbose=True,
    )
    shutil.rmtree(logdir)
def main(args):
    set_global_seed(args.seed)
    transform_train = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    )
    transform_test = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    )
    datasets = {
        "train": Wrp(CIFAR100(root=".", train=True, download=True, transform=transform_train)),
        "valid": Wrp(CIFAR100(root=".", train=False, transform=transform_test)),
    }
    loaders = {
        k: DataLoader(v, batch_size=args.batch_size, shuffle=k == "train", num_workers=2)
        for k, v in datasets.items()
    }
    teacher_model = NAME2MODEL[args.teacher](num_classes=100)
    if args.teacher_path is None:
        teacher_sd = load_state_dict_from_url(NAME2URL[args.teacher])
        teacher_model.load_state_dict(teacher_sd)
    else:
        unpack_checkpoint(torch.load(args.teacher_path), model=teacher_model)

    student_model = NAME2MODEL[args.student](num_classes=100)

    optimizer = torch.optim.SGD(
        student_model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4
    )
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [150, 180, 210], gamma=0.1)

    runner = DistilRunner(apply_probability_shift=args.probability_shift)
    runner.train(
        model={"teacher": teacher_model, "student": student_model},
        loaders=loaders,
        optimizer=optimizer,
        scheduler=scheduler,
        valid_metric="accuracy",
        minimize_valid_metric=False,
        logdir=args.logdir,
        callbacks=[
            ControlFlowCallback(AttentionHiddenStatesCallback(), loaders="train"),
            ControlFlowCallback(KLDivCallback(temperature=4), loaders="train"),
            CriterionCallback(input_key="s_logits", target_key="targets", metric_key="cls_loss"),
            ControlFlowCallback(
                MetricAggregationCallback(
                    prefix="loss",
                    metrics={
                        "attention_loss": args.beta,
                        "kl_div_loss": args.alpha,
                        "cls_loss": 1 - args.alpha,
                    },
                    mode="weighted_sum",
                ),
                loaders="train",
            ),
            AccuracyCallback(input_key="s_logits", target_key="targets"),
            OptimizerCallback(metric_key="loss", model_key="student"),
            SchedulerCallback(),
        ],
        valid_loader="valid",
        num_epochs=args.num_epochs,
        criterion=torch.nn.CrossEntropyLoss(),
        seed=args.seed,
    )
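# AttentionHiddenStatesCallback above presumably implements attention
# transfer in the style of Zagoruyko & Komodakis (2017): match normalized
# spatial attention maps derived from teacher and student feature maps.
# A minimal sketch of that loss under this assumption; the function names
# are ours, not the library's.
import torch
import torch.nn.functional as F


def attention_map(features: torch.Tensor) -> torch.Tensor:
    """Collapse a (B, C, H, W) feature map into an L2-normalized (B, H*W) attention map."""
    return F.normalize(features.pow(2).mean(dim=1).flatten(start_dim=1), dim=1)


def attention_loss(s_features: torch.Tensor, t_features: torch.Tensor) -> torch.Tensor:
    return (attention_map(s_features) - attention_map(t_features)).pow(2).mean()


s = torch.randn(8, 64, 16, 16)   # student feature map
t = torch.randn(8, 256, 16, 16)  # teacher feature map; channel counts may differ
print(attention_loss(s, t))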
def test_hf():
    """Tests the example HF distillation pipeline end to end."""
    datasets = load_dataset("ag_news")
    tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-4_H-128_A-2")
    datasets = datasets.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=128),
        batched=True,
    )
    datasets = datasets.map(lambda e: {"labels": e["label"]}, batched=True)
    datasets.set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
    )
    loaders = {
        "train": DataLoader(datasets["train"], batch_size=32, shuffle=True),
        "valid": DataLoader(datasets["test"], batch_size=32),
    }
    metric_callback = LoaderMetricCallback(
        metric=HFMetric(metric=load_metric("accuracy")),
        input_key="logits",
        target_key="labels",
    )
    teacher_model = AutoModelForSequenceClassification.from_pretrained(
        "google/bert_uncased_L-4_H-128_A-2", num_labels=4
    )
    runner = HFRunner()
    runner.train(
        model=teacher_model,
        loaders=loaders,
        optimizer=torch.optim.Adam(teacher_model.parameters(), lr=1e-4),
        callbacks=[metric_callback],
        num_epochs=3,
        valid_metric="accuracy",
        minimize_valid_metric=False,
        check=True,
    )

    metric_callback = LoaderMetricCallback(
        metric=HFMetric(metric=load_metric("accuracy")),
        input_key="s_logits",
        target_key="labels",
    )
    slct_callback = ControlFlowCallback(
        HiddenStatesSelectCallback(hiddens_key="t_hidden_states", layers=[1, 3]),
        loaders="train",
    )
    lambda_hiddens_callback = ControlFlowCallback(
        LambdaPreprocessCallback(
            lambda s_hiddens, t_hiddens: (
                [c_s[:, 0] for c_s in s_hiddens],  # take only the [CLS] token
                [t_s[:, 0] for t_s in t_hiddens],
            )
        ),
        loaders="train",
    )
    mse_hiddens = ControlFlowCallback(MSEHiddenStatesCallback(), loaders="train")
    kl_div = ControlFlowCallback(KLDivCallback(), loaders="train")
    aggregator = ControlFlowCallback(
        MetricAggregationCallback(
            prefix="loss",
            metrics={"kl_div_loss": 0.2, "mse_loss": 0.2, "task_loss": 0.6},
            mode="weighted_sum",
        ),
        loaders="train",
    )
    runner = HFDistilRunner()
    student_model = AutoModelForSequenceClassification.from_pretrained(
        "google/bert_uncased_L-2_H-128_A-2", num_labels=4
    )
    runner.train(
        model=torch.nn.ModuleDict({"teacher": teacher_model, "student": student_model}),
        loaders=loaders,
        optimizer=torch.optim.Adam(student_model.parameters(), lr=1e-4),
        callbacks=[
            metric_callback,
            slct_callback,
            lambda_hiddens_callback,
            mse_hiddens,
            kl_div,
            aggregator,
            OptimizerCallback(metric_key="loss"),
        ],
        check=True,
        num_epochs=3,
        valid_metric="accuracy",
        minimize_valid_metric=False,
        valid_loader="valid",
    )
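# KLDivCallback above presumably computes the standard soft-target
# distillation loss (Hinton et al., 2015): the KL divergence between
# temperature-scaled teacher and student distributions, rescaled by T^2 so
# gradient magnitudes stay comparable across temperatures. A minimal sketch
# under that assumption; the function name is ours, not the library's.
import torch
import torch.nn.functional as F


def kl_div_loss(s_logits: torch.Tensor, t_logits: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
    return (
        F.kl_div(
            F.log_softmax(s_logits / temperature, dim=-1),
            F.softmax(t_logits / temperature, dim=-1),
            reduction="batchmean",
        )
        * temperature ** 2
    )


print(kl_div_loss(torch.randn(8, 4), torch.randn(8, 4), temperature=4.0))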
def main(args):
    if args.wandb:
        import wandb

        wandb.init()
        logdir = args.logdir + "/" + wandb.run.name
    else:
        logdir = args.logdir
    set_global_seed(args.seed)
    datasets = load_dataset(args.dataset)
    tokenizer = AutoTokenizer.from_pretrained(args.teacher_model)
    datasets = datasets.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=128),
        batched=True,
    )
    datasets = datasets.map(lambda e: {"labels": e["label"]}, batched=True)
    datasets.set_format(
        type="torch",
        columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
    )
    loaders = {
        "train": DataLoader(datasets["train"], batch_size=args.batch_size, shuffle=True),
        "valid": DataLoader(datasets["test"], batch_size=args.batch_size),
    }
    teacher_model = AutoModelForSequenceClassification.from_pretrained(
        args.teacher_model, num_labels=args.num_labels
    )
    unpack_checkpoint(torch.load(args.teacher_path), model=teacher_model)
    metric_callback = LoaderMetricCallback(
        metric=HFMetric(metric=load_metric("accuracy")),
        input_key="s_logits",
        target_key="labels",
    )
    layers = [int(layer) for layer in args.layers.split(",")]
    slct_callback = ControlFlowCallback(
        HiddenStatesSelectCallback(hiddens_key="t_hidden_states", layers=layers),
        loaders="train",
    )
    lambda_hiddens_callback = ControlFlowCallback(
        LambdaPreprocessCallback(
            lambda s_hiddens, t_hiddens: (
                [c_s[:, 0] for c_s in s_hiddens],  # take only the [CLS] token
                [t_s[:, 0] for t_s in t_hiddens],
            )
        ),
        loaders="train",
    )
    mse_hiddens = ControlFlowCallback(MSEHiddenStatesCallback(), loaders="train")
    kl_div = ControlFlowCallback(KLDivCallback(temperature=args.kl_temperature), loaders="train")
    runner = HFDistilRunner()
    student_model = AutoModelForSequenceClassification.from_pretrained(
        args.student_model, num_labels=args.num_labels
    )
    callbacks = [
        metric_callback,
        slct_callback,
        lambda_hiddens_callback,
        kl_div,
        OptimizerCallback(metric_key="loss"),
        CheckpointCallback(
            logdir=logdir,
            loader_key="valid",
            mode="model",
            metric_key="accuracy",
            minimize=False,
        ),
    ]
    if args.beta > 0:
        # The hidden-state MSE term is only included when its weight is nonzero.
        aggregator = ControlFlowCallback(
            MetricAggregationCallback(
                prefix="loss",
                metrics={
                    "kl_div_loss": args.alpha,
                    "mse_loss": args.beta,
                    "task_loss": 1 - args.alpha,
                },
                mode="weighted_sum",
            ),
            loaders="train",
        )
        callbacks.append(mse_hiddens)
        callbacks.append(aggregator)
    else:
        aggregator = ControlFlowCallback(
            MetricAggregationCallback(
                prefix="loss",
                metrics={"kl_div_loss": args.alpha, "task_loss": 1 - args.alpha},
                mode="weighted_sum",
            ),
            loaders="train",
        )
        callbacks.append(aggregator)
    runner.train(
        model=torch.nn.ModuleDict({"teacher": teacher_model, "student": student_model}),
        loaders=loaders,
        optimizer=torch.optim.Adam(student_model.parameters(), lr=args.lr),
        callbacks=callbacks,
        num_epochs=args.num_epochs,
        valid_metric="accuracy",
        logdir=logdir,
        minimize_valid_metric=False,
        valid_loader="valid",
        verbose=args.verbose,
        seed=args.seed,
    )
    if args.wandb:
        import csv
        import shutil

        with open(logdir + "/valid.csv") as fi:
            reader = csv.DictReader(fi)
            accuracy = []
            for row in reader:
                # The CSV log may repeat the header row; skip those lines.
                if row["accuracy"] == "accuracy":
                    continue
                accuracy.append(float(row["accuracy"]))
        wandb.log({"accuracy": max(accuracy[-args.num_epochs:])})
        shutil.rmtree(logdir)
def main(args):
    set_global_seed(args.seed)
    transform_train = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    )
    transform_test = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    )
    datasets = {
        "train": Wrp(CIFAR100(root=".", train=True, download=True, transform=transform_train)),
        "valid": Wrp(CIFAR100(root=".", train=False, transform=transform_test)),
    }
    loaders = {
        k: DataLoader(v, batch_size=args.batch_size, shuffle=k == "train", num_workers=2)
        for k, v in datasets.items()
    }
    model = NAME2MODEL[args.model](num_classes=100)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [150, 180, 210], gamma=0.1)
    runner = SupervisedRunner()
    runner.train(
        model=model,
        loaders=loaders,
        optimizer=optimizer,
        scheduler=scheduler,
        valid_metric="accuracy",
        minimize_valid_metric=False,
        logdir=args.logdir,
        callbacks=[
            CriterionCallback(input_key="logits", target_key="targets", metric_key="loss"),
            AccuracyCallback(input_key="logits", target_key="targets"),
            OptimizerCallback(metric_key="loss"),
            SchedulerCallback(),
        ],
        valid_loader="valid",
        num_epochs=args.num_epochs,
        criterion=torch.nn.CrossEntropyLoss(),
        seed=args.seed,
    )