def test_av_load_non_saved_layer(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        model_id = "dummy"
        with self.assertRaises(RuntimeError) as context:
            AV.load(tmpdir, model_id)
        self.assertTrue(
            (
                f"Activation vectors for model {model_id} "
                f"was not found at path {tmpdir}"
            )
            == str(context.exception)
        )
def test_av_load_all_layers_one_identifier(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        av_01 = torch.randn(36, 16)
        av_02 = torch.randn(16, 16)
        av_03 = torch.randn(4, 16)
        avs_0 = [av_01, av_02, av_03]

        av_11 = torch.randn(36, 16)
        av_12 = torch.randn(16, 16)
        av_13 = torch.randn(4, 16)
        avs_1 = [av_11, av_12, av_13]

        idf1, idf2 = "idf1", "idf2"
        AV.save(
            tmpdir,
            "dummy",
            idf1,
            ["layer1.0.conv1", "layer1.0.conv2", "layer1.1.conv1"],
            avs_0,
            "0",
        )
        dataloader = DataLoader(cast(Dataset, AV.load(tmpdir, "dummy")))
        self.assertEqual(len(dataloader), 3)

        AV.save(
            tmpdir,
            "dummy",
            idf2,
            ["layer1.0.conv1", "layer1.0.conv2", "layer1.1.conv1"],
            avs_1,
            "0",
        )
        dataloader = DataLoader(cast(Dataset, AV.load(tmpdir, "dummy")))
        self.assertEqual(len(dataloader), 6)

        # check activations for idf1
        dataloader_layer = DataLoader(
            cast(Dataset, AV.load(tmpdir, "dummy", identifier=idf1))
        )
        self.assertEqual(len(dataloader_layer), 3)
        for i, av in enumerate(dataloader_layer):
            assertTensorAlmostEqual(self, av, avs_0[i].unsqueeze(0))

        # check activations for idf2
        dataloader_layer = DataLoader(
            cast(Dataset, AV.load(tmpdir, "dummy", identifier=idf2))
        )
        self.assertEqual(len(dataloader_layer), 3)
        for i, av in enumerate(dataloader_layer):
            assertTensorAlmostEqual(self, av, avs_1[i].unsqueeze(0))
def test_TCAV_generate_all_activations(self) -> None:
    def forward_hook_wrapper(expected_act: Tensor):
        def forward_hook(module, inp, out=None):
            out = torch.reshape(out, (out.shape[0], -1))
            self.assertEqual(out.detach().shape[1:], expected_act.shape[1:])

        return forward_hook

    with tempfile.TemporaryDirectory() as tmpdirname:
        layers = ["conv1", "conv2", "fc1", "fc2"]
        tcav, concept_dict = init_TCAV(
            tmpdirname, CustomClassifier(), layers=layers
        )
        tcav.concepts = set(concept_dict.values())

        # generating all activations for given layers and concepts
        tcav.generate_all_activations()

        # verify that all activations exist and have correct shapes
        for layer in layers:
            for _, concept in concept_dict.items():
                self.assertTrue(
                    AV.exists(
                        tmpdirname, "default_model_id", concept.identifier, layer
                    )
                )

            concept_meta: Dict[int, int] = defaultdict(int)
            for _, concept in concept_dict.items():
                activations = AV.load(
                    tmpdirname, "default_model_id", concept.identifier, layer
                )

                def batch_collate(batch):
                    return torch.cat(batch)

                self.assertTrue(concept.data_iter is not None)
                assert activations is not None
                for activation in cast(
                    Iterable, DataLoader(activations, collate_fn=batch_collate)
                ):
                    concept_meta[concept.id] += activation.shape[0]

                layer_module = _get_module_from_name(tcav.model, layer)
                for data in cast(Iterable, concept.data_iter):
                    hook = layer_module.register_forward_hook(
                        forward_hook_wrapper(activation)
                    )
                    tcav.model(data)
                    hook.remove()

            # asserting the length of entire dataset for each concept
            for concept_meta_i in concept_meta.values():
                self.assertEqual(concept_meta_i, 100)
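# For reference, a minimal standalone sketch of the forward-hook pattern used
# by `forward_hook_wrapper` above: register a hook on a layer, flatten its
# output to (batch, -1), and inspect the shape. The toy module and all names
# here are illustrative only; they are not part of the test suite.
def _forward_hook_sketch() -> None:
    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(8, 4), nn.ReLU())
    captured = []

    def capture_hook(module, inp, out):
        # Flatten the layer output to (batch, -1), mirroring the test's hook.
        captured.append(torch.reshape(out, (out.shape[0], -1)).detach())

    handle = model[0].register_forward_hook(capture_hook)
    model(torch.randn(2, 8))
    handle.remove()  # remove the hook once done, as the test does

    assert captured[0].shape == (2, 4)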
def test_av_load_one_batch(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        av_0 = torch.randn(64, 16)
        av_1 = torch.randn(36, 16)
        avs = [av_0, av_1]

        # loading before anything is saved raises a RuntimeError
        model_id = "dummy"
        with self.assertRaises(RuntimeError) as context:
            AV.load(tmpdir, model_id)
        self.assertTrue(
            (
                f"Activation vectors for model {model_id} "
                f"was not found at path {tmpdir}"
            )
            == str(context.exception)
        )

        # add av_0 to the list of activations
        AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1", av_0, "0")
        dataset = AV.load(tmpdir, model_id, identifier=DEFAULT_IDENTIFIER)
        for i, av in enumerate(DataLoader(cast(Dataset, dataset))):
            assertTensorAlmostEqual(self, av, avs[i].unsqueeze(0))

        # a layer with no saved activations yields an empty dataset
        dataloader_2 = DataLoader(
            cast(
                Dataset,
                AV.load(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv2"),
            )
        )
        self.assertEqual(len(dataloader_2), 0)

        # add av_1 to the list of activations
        AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv2", av_1, "0")
        dataset = AV.load(tmpdir, "dummy", identifier=DEFAULT_IDENTIFIER)
        dataloader = DataLoader(cast(Dataset, dataset))
        self.assertEqual(len(dataloader), 2)
        for i, av in enumerate(dataloader):
            assertTensorAlmostEqual(self, av, avs[i].unsqueeze(0))
def test_av_load_all_identifiers_one_layer(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        av_0 = torch.randn(64, 16)
        av_1 = torch.randn(36, 16)
        av_2 = torch.randn(16, 16)
        av_3 = torch.randn(4, 16)
        avs = [av_1, av_2, av_3]

        idf1, idf2, idf3 = "idf1", "idf2", "idf3"

        AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1", av_0, "0")
        dataloader = DataLoader(
            cast(Dataset, AV.load(tmpdir, "dummy", identifier=DEFAULT_IDENTIFIER))
        )
        self.assertEqual(len(dataloader), 1)

        # add activations for another layer
        AV.save(tmpdir, "dummy", idf1, "layer1.0.conv2", av_1, "0")
        AV.save(tmpdir, "dummy", idf2, "layer1.0.conv2", av_2, "0")
        AV.save(tmpdir, "dummy", idf3, "layer1.0.conv2", av_3, "0")
        dataloader_layer = DataLoader(
            cast(
                Dataset,
                AV.load(
                    tmpdir,
                    "dummy",
                    layer="layer1.0.conv2",
                ),
            )
        )
        self.assertEqual(len(dataloader_layer), 3)
        for i, av in enumerate(dataloader_layer):
            assertTensorAlmostEqual(self, av, avs[i].unsqueeze(0))

        dataloader = DataLoader(cast(Dataset, AV.load(tmpdir, "dummy")))
        self.assertEqual(len(dataloader), 4)
def save_load_and_assert_batch(layer_path, total_num_batches, batch, n_batch_name):
    # save n-th batch and verify the number of saved batches
    AV.save(
        tmpdir,
        model_id,
        DEFAULT_IDENTIFIER,
        "layer1.0.conv1",
        batch,
        n_batch_name,
    )
    loaded_dataset = AV.load(
        tmpdir, model_id, DEFAULT_IDENTIFIER, "layer1.0.conv1", n_batch_name
    )
    assertTensorAlmostEqual(self, next(iter(loaded_dataset)), batch, 0.0)

    loaded_dataset_for_layer = AV.load(
        tmpdir, model_id, DEFAULT_IDENTIFIER, "layer1.0.conv1"
    )
    self.assertEqual(
        len(loaded_dataset_for_layer),
        total_num_batches,
    )
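# Note: `save_load_and_assert_batch` is a nested helper; `tmpdir`, `model_id`,
# and `self` come from an enclosing test that is not shown in this excerpt.
# A hypothetical driver might look like the sketch below; the test name,
# batch shapes, and batch names are illustrative, not the real test body.
def test_av_save_multiple_batches(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        model_id = "dummy"
        batches = [torch.randn(64, 16), torch.randn(36, 16)]

        # ... `save_load_and_assert_batch` defined here, as above ...

        for i, batch in enumerate(batches):
            # each call saves one more batch, so i + 1 batches exist so far
            save_load_and_assert_batch("layer1.0.conv1", i + 1, batch, str(i))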
def test_equal_activation(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        num_features = 4
        low, high = 0, 16
        mymodel = BasicLinearReLULinear(num_features)
        mydata = RangeDataset(low, high, num_features)
        layers: List[str] = [
            value[0] for value in mymodel.named_modules() if value[0]
        ]

        # generate activations for the final layer, so that the saved
        # activation should equal the model output
        test_input = mydata[1].unsqueeze(0)
        model_id = "id_1"
        identifier = "test"
        num_id = "0"
        AV._compute_and_save_activations(
            tmpdir, mymodel, model_id, layers[2], test_input, identifier, num_id
        )

        act_dataset = AV.load(tmpdir, model_id, identifier, layers[2], num_id)
        _layer_act = [act.squeeze(0) for act in DataLoader(act_dataset)]
        act = torch.cat(_layer_act)
        out = mymodel(test_input)
        assertTensorAlmostEqual(self, out, act)
def train_cav(
    model_id,
    concepts: List[Concept],
    layers: Union[str, List[str]],
    classifier: Classifier,
    save_path: str,
    classifier_kwargs: Dict,
) -> Dict[str, Dict[str, CAV]]:
    r"""
    A helper function for parallel CAV computations that can be called
    from a python process.

    Please see the TCAV class documentation for further information.

    Args:
        model_id (str): A unique identifier for the PyTorch model for which
                we would like to load the layer activations and train a
                model in order to compute CAVs.
        concepts (list[Concept]): A list of Concept objects that are used
                to train a classifier and learn decision boundaries between
                those concepts for each layer defined in the `layers`
                argument.
        layers (str, list[str]): A list of layer names or a single layer
                name that is used to compute the activations of all concept
                examples per concept and train a classifier using those
                activations.
        classifier (Classifier): A custom classifier class, such as the
                Sklearn "linear_model", that allows us to train a model
                using the activation vectors extracted for a layer per
                concept. It also allows us to access trained weights of the
                classifier and the list of prediction classes.
        save_path (str): The path for storing Concept Activation
                Vectors (CAVs) and Activation Vectors (AVs).
        classifier_kwargs (dict): Additional named arguments that are passed
                to the concept classifier's `train_and_eval` method.

    Returns:
        cavs (dict): A dictionary of CAV objects indexed by concept ids and
                layer names. It gives access to the weights of each concept
                in a given layer and model statistics such as accuracies
                that resulted in trained concept weights.
    """
    concepts_key = concepts_to_str(concepts)
    cavs: Dict[str, Dict[str, CAV]] = defaultdict()
    cavs[concepts_key] = defaultdict()
    layers = [layers] if isinstance(layers, str) else layers
    for layer in layers:
        # Create data loader to initialize the trainer.
        datasets = [
            AV.load(save_path, model_id, concept.identifier, layer)
            for concept in concepts
        ]
        labels = [concept.id for concept in concepts]

        labelled_dataset = LabelledDataset(
            cast(List[AV.AVDataset], datasets), labels
        )

        def batch_collate(batch):
            inputs, labels = zip(*batch)
            return torch.cat(inputs), torch.cat(labels)

        dataloader = DataLoader(labelled_dataset, collate_fn=batch_collate)

        classifier_stats_dict = classifier.train_and_eval(
            dataloader, **classifier_kwargs
        )
        classifier_stats_dict = (
            {} if classifier_stats_dict is None else classifier_stats_dict
        )

        weights = classifier.weights()
        assert (
            weights is not None and len(weights) > 0
        ), "Model weights cannot be None or empty"

        classes = classifier.classes()
        assert (
            classes is not None and len(classes) > 0
        ), "Classes cannot be None or empty"

        classes = (
            cast(torch.Tensor, classes).detach().numpy()
            if isinstance(classes, torch.Tensor)
            else classes
        )

        cavs[concepts_key][layer] = CAV(
            concepts,
            layer,
            {"weights": weights, "classes": classes, **classifier_stats_dict},
            save_path,
            model_id,
        )
        # Saving cavs on the disk
        cavs[concepts_key][layer].save()
    return cavs
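# A minimal usage sketch for `train_cav`. It assumes activations for each
# concept have already been saved under `save_path` with AV.save (e.g. via
# TCAV's generate_all_activations), and that a DefaultClassifier is available
# as in captum.concept._utils.classifier; concept ids, names, and the paths
# below are illustrative, not part of this module.
def _train_cav_usage_sketch() -> None:
    from captum.concept import Concept
    from captum.concept._utils.classifier import DefaultClassifier

    # data_iter is None here because train_cav reads activations from disk.
    concepts = [Concept(0, "stripes", None), Concept(1, "random", None)]

    cavs = train_cav(
        model_id="default_model_id",
        concepts=concepts,
        layers=["layer1.0.conv1"],
        classifier=DefaultClassifier(),
        save_path="./cav/",
        classifier_kwargs={},
    )

    # CAVs are keyed first by the concepts' string key, then by layer name;
    # stats holds the trained weights, classes, and any classifier metrics.
    key = concepts_to_str(concepts)
    cav = cavs[key]["layer1.0.conv1"]
    print(cav.stats["weights"].shape)  # one weight vector per concept class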