def _test_score(
    metric: AccumulativeMetric,
    batch: Dict[str, torch.Tensor],
    true_values: Dict[str, float],
) -> None:
    """Check that the given metric computes the expected values."""
    metric.reset(num_batches=1, num_samples=len(batch["embeddings"]))
    metric.update(**batch)
    values = metric.compute_key_value()
    for key in true_values:
        assert key in values
        assert values[key] == true_values[key]
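
# The tests below rely on a ``generate_batched_data`` fixture that is not
# defined in this module (it presumably lives in a conftest.py). Below is a
# minimal, hypothetical sketch of what such a fixture could look like,
# assuming each item is a (fields_names, num_batches, num_samples, batches,
# true_values) tuple; the real fixture may build its data differently.
import pytest  # assumed available; normally imported at the top of the module


@pytest.fixture
def _example_generate_batched_data():
    fields_names = ["embeddings", "targets"]
    num_batches, batch_size = 4, 8
    batches = [
        {
            "embeddings": torch.rand(batch_size, 16),
            "targets": torch.randint(low=0, high=10, size=(batch_size,)),
        }
        for _ in range(num_batches)
    ]
    # per-field concatenation over all batches is what the metric should reproduce
    true_values = {key: torch.cat([b[key] for b in batches]) for key in fields_names}
    return [(fields_names, num_batches, num_batches * batch_size, batches, true_values)]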
def test_accumulation_reset(generate_batched_data):
    """Check if AccumulativeMetric accumulates all the data correctly with multiple resets"""
    for fields_names, num_batches, num_samples, batches, true_values in generate_batched_data:
        metric = AccumulativeMetric(keys=fields_names)
        for _ in range(5):
            metric.reset(num_batches=num_batches, num_samples=num_samples)
            for batch in batches:
                metric.update(**batch)
            for field_name in true_values:
                assert (true_values[field_name] == metric.storage[field_name]).all()
def test_accumulation(generate_batched_data) -> None:
    """Check if AccumulativeMetric accumulates all the data correctly along one loader"""
    for fields_names, num_batches, num_samples, batches, true_values in generate_batched_data:
        metric = AccumulativeMetric(keys=fields_names)
        metric.reset(num_batches=num_batches, num_samples=num_samples)
        for batch in batches:
            metric.update(**batch)
        for field_name in true_values:
            assert (true_values[field_name] == metric.storage[field_name]).all()
def test_accumulation_dtype():
    """Check if AccumulativeMetric accumulates all the data with correct dtypes"""
    batch_size = 10
    batch = {
        "field_int": torch.randint(low=0, high=5, size=(batch_size, 5)),
        "field_bool": torch.randint(low=0, high=2, size=(batch_size, 10), dtype=torch.bool),
        "field_float32": torch.rand(size=(batch_size, 4), dtype=torch.float32),
    }
    metric = AccumulativeMetric(keys=list(batch.keys()))
    metric.reset(num_samples=batch_size, num_batches=1)
    metric.update(**batch)
    for key in batch:
        assert (batch[key] == metric.storage[key]).all()
        assert batch[key].dtype == metric.storage[key].dtype
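
# A standalone sanity check, sketched here to make the reset/update/storage
# contract explicit without the external fixture; the key name and tensor
# sizes are illustrative only:
def test_accumulation_single_batch_roundtrip():
    batch = {"embeddings": torch.rand(8, 16)}
    metric = AccumulativeMetric(keys=["embeddings"])
    metric.reset(num_batches=1, num_samples=8)
    metric.update(**batch)
    # with a single batch, the accumulated storage must equal the input as-is
    assert (metric.storage["embeddings"] == batch["embeddings"]).all()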
class SklearnModelCallback(Callback):
    """Callback that trains a sklearn-style classifier on the train loader data
    and uses it to compute predictions on the valid loaders.

    Args:
        feature_key: key of the tensor that should be used as features
            for the classifier
        target_key: key of the tensor that should be used as targets
            for the classifier; pass ``None`` to fit the model
            on features only
        train_loader: name of the train loader
        valid_loaders: name(s) of the valid loaders where the model
            should compute predictions
        model_fn: factory that produces objects with ``fit`` and the chosen
            predict method; either a callable or a string such as
            ``"linear_model.LogisticRegression"`` that is resolved
            inside the ``sklearn`` package
        predict_method: name of the classifier method used for predictions
        predict_key: key under which the classifier predictions are stored
            in the ``runner.batch`` dictionary
        model_kwargs: additional parameters for ``model_fn``

    .. note::
        catalyst[ml] is required for this callback

    Examples:

    .. code-block:: python

        import os

        from sklearn.linear_model import LogisticRegression

        from torch.optim import Adam
        from torch.utils.data import DataLoader

        from catalyst import data, dl
        from catalyst.contrib import datasets, models, nn
        from catalyst.data.transforms import Compose, Normalize, ToTensor

        # 1. train and valid loaders
        transforms = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

        train_dataset = datasets.MnistMLDataset(
            root=os.getcwd(), download=True, transform=transforms
        )
        sampler = data.BalanceBatchSampler(
            labels=train_dataset.get_labels(), p=5, k=10
        )
        train_loader = DataLoader(
            dataset=train_dataset, sampler=sampler, batch_size=sampler.batch_size
        )

        valid_dataset = datasets.MNIST(
            root=os.getcwd(), transform=transforms, train=False
        )
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=1024)

        # 2. model and optimizer
        model = models.MnistSimpleNet(out_features=16)
        optimizer = Adam(model.parameters(), lr=0.001)

        # 3. criterion with triplets sampling
        sampler_inbatch = data.HardTripletsSampler(norm_required=False)
        criterion = nn.TripletMarginLossWithSampler(
            margin=0.5, sampler_inbatch=sampler_inbatch
        )

        # 4. training with catalyst Runner
        class CustomRunner(dl.SupervisedRunner):
            def handle_batch(self, batch) -> None:
                images, targets = batch["features"].float(), batch["targets"].long()
                features = self.model(images)
                self.batch = {"embeddings": features, "targets": targets}

        callbacks = [
            dl.ControlFlowCallback(
                dl.CriterionCallback(
                    input_key="embeddings", target_key="targets", metric_key="loss"
                ),
                loaders="train",
            ),
            dl.SklearnModelCallback(
                feature_key="embeddings",
                target_key="targets",
                train_loader="train",
                valid_loaders="valid",
                model_fn=LogisticRegression,
                predict_method="predict_proba",
                predict_key="sklearn_predict",
            ),
            dl.ControlFlowCallback(
                dl.AccuracyCallback(
                    target_key="targets",
                    input_key="sklearn_predict",
                    topk_args=(1, 3),
                ),
                loaders="valid",
            ),
        ]

        runner = CustomRunner(input_key="features", output_key="embeddings")
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            callbacks=callbacks,
            loaders={"train": train_loader, "valid": valid_loader},
            verbose=False,
            logdir="./logs",
            valid_loader="valid",
            valid_metric="accuracy",
            minimize_valid_metric=False,
            num_epochs=100,
        )

    The same setup works with ``model_fn`` passed as a string that is resolved
    inside ``sklearn``, so the explicit sklearn import is not needed:

    .. code-block:: python

        dl.SklearnModelCallback(
            feature_key="embeddings",
            target_key="targets",
            train_loader="train",
            valid_loaders="valid",
            model_fn="linear_model.LogisticRegression",
            predict_method="predict_proba",
            predict_key="sklearn_predict",
        )
    """

    def __init__(
        self,
        feature_key: str,
        target_key: Union[str, None],
        train_loader: str,
        valid_loaders: Union[str, List[str]],
        model_fn: Union[Callable, str],
        predict_method: str = "predict",
        predict_key: str = "sklearn_predict",
        **model_kwargs,
    ) -> None:
        super().__init__(order=CallbackOrder.Metric)
        if isinstance(model_fn, str):
            # resolve strings like "linear_model.LogisticRegression" inside sklearn
            base, clf = model_fn.split(".")
            base = f"sklearn.{base}"
            model_fn = getattr(importlib.import_module(base), clf)
        assert hasattr(
            model_fn(), predict_method
        ), f"The classifier must have a '{predict_method}' method!"
        self._train_loader = train_loader
        if isinstance(valid_loaders, str):
            self._valid_loaders = [valid_loaders]
        else:
            self._valid_loaders = valid_loaders
        self.model_fabric_fn = partial(model_fn, **model_kwargs)
        self.feature_key = feature_key
        self.target_key = target_key
        self.predict_method = predict_method
        self.predict_key = predict_key
        self.model = None
        if self.target_key is not None:
            self.storage = AccumulativeMetric(keys=[feature_key, target_key])
        else:
            self.storage = AccumulativeMetric(keys=[feature_key])

    def on_loader_start(self, runner: "IRunner") -> None:
        """Loader start hook: reset the storage for the train loader
        and check the loader order for the valid loaders.

        Args:
            runner: current runner
        """
        super().on_loader_start(runner)
        if runner.loader_key == self._train_loader:
            self.storage.reset(
                num_samples=runner.loader_sample_len,
                num_batches=runner.loader_batch_len,
            )
        if runner.loader_key in self._valid_loaders:
            assert self.model is not None, "The train loader has to be processed first!"
    def on_batch_end(self, runner: "IRunner") -> None:
        """Batch end hook: on the train loader, accumulate the runner's batch
        in the storage; on the valid loaders, predict with the trained model.

        Args:
            runner: current runner
        """
        if runner.loader_key == self._train_loader:
            self.storage.update(**runner.batch)
        if runner.loader_key in self._valid_loaders:
            features = runner.batch[self.feature_key].detach().cpu().numpy()
            # predict with the classifier and put the result back into the batch
            classifier_predict = getattr(self.model, self.predict_method)
            predictions = classifier_predict(features)
            runner.batch[self.predict_key] = torch.tensor(
                predictions, device=runner.engine.device
            )

    def on_loader_end(self, runner: "IRunner") -> None:
        """Loader end hook: fit the classifier on the data accumulated
        over the train loader.

        Args:
            runner: current runner
        """
        if runner.loader_key == self._train_loader:
            data = self.storage.compute_key_value()
            features = data[self.feature_key].detach().cpu().numpy()
            # build a fresh model and fit it; without targets, fit on features only
            self.model = self.model_fabric_fn()
            if self.target_key is None:
                self.model.fit(features)
            else:
                targets = data[self.target_key].detach().cpu().numpy()
                self.model.fit(features, targets)

    def on_epoch_end(self, runner: "IRunner") -> None:
        """Epoch end hook: drop the trained model.

        Args:
            runner: current runner
        """
        # Dropping the model enforces the loader order: within each epoch the
        # train loader has to be processed before any valid loader.
        self.model = None
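
# The docstring examples above cover only the supervised path. Below is a
# hedged sketch of the unsupervised path (``target_key=None``), in which the
# callback calls ``model.fit(features)`` without targets, so any sklearn
# estimator exposing ``fit`` plus a predict-like method works. The loader
# names, keys, and the helper function itself are illustrative, not part of
# the library API:
def _example_unsupervised_sklearn_callback() -> SklearnModelCallback:
    from sklearn.cluster import KMeans

    return SklearnModelCallback(
        feature_key="embeddings",
        target_key=None,  # accumulate and fit on features only
        train_loader="train",
        valid_loaders="valid",
        model_fn=KMeans,
        predict_method="predict",  # cluster ids land in runner.batch["sklearn_predict"]
        predict_key="sklearn_predict",
        n_clusters=10,  # forwarded to KMeans through **model_kwargs
    )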