Example 1
    def test_av_load_non_saved_layer(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            model_id = "dummy"
            with self.assertRaises(RuntimeError) as context:
                AV.load(tmpdir, model_id)
            self.assertEqual(
                f"Activation vectors for model {model_id} "
                f"was not found at path {tmpdir}", str(context.exception))
Example 2
    def test_av_sort_files(self) -> None:
        files = [
            "resnet50-cifar-3000", "resnet50-cifar-1000", "resnet50-cifar-2000"
        ]
        exp_files = [
            "resnet50-cifar-1000",
            "resnet50-cifar-2000",
            "resnet50-cifar-3000",
        ]
        files = AV.sort_files(files)

        self.assertEqual(files, exp_files)

        files = [
            "resnet50-cifar-0900", "resnet50-cifar-0000", "resnet50-cifar-1000"
        ]
        exp_files = [
            "resnet50-cifar-0000",
            "resnet50-cifar-0900",
            "resnet50-cifar-1000",
        ]
        files = AV.sort_files(files)
        self.assertEqual(files, exp_files)

        files = [
            "resnet50-cifar-100", "resnet50-cifar-90", "resnet50-cifar-3000"
        ]
        exp_files = [
            "resnet50-cifar-90",
            "resnet50-cifar-100",
            "resnet50-cifar-3000",
        ]
        files = AV.sort_files(files)
        self.assertEqual(files, exp_files)

        files = [
            "av/pretrained-net-0/fc1-src10-710935.pt",
            "av/pretrained-net-0/fc1-src11-755317.pt",
            "av/pretrained-net-0/fc3-src2-655646.pt",
            "av/pretrained-net-0/fc1-src9-952381.pt",
            "av/pretrained-net-0/conv2-src7-811286.pt",
            "av/pretrained-net-0/fc1-src10-176141.pt",
            "av/pretrained-net-0/conv11-src9-384927.pt",
        ]
        exp_files = [
            "av/pretrained-net-0/conv2-src7-811286.pt",
            "av/pretrained-net-0/conv11-src9-384927.pt",
            "av/pretrained-net-0/fc1-src9-952381.pt",
            "av/pretrained-net-0/fc1-src10-176141.pt",
            "av/pretrained-net-0/fc1-src10-710935.pt",
            "av/pretrained-net-0/fc1-src11-755317.pt",
            "av/pretrained-net-0/fc3-src2-655646.pt",
        ]
        files = AV.sort_files(files)
        self.assertEqual(files, exp_files)
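
The orderings asserted above follow a natural sort: runs of digits are compared numerically rather than lexicographically, so "90" sorts before "100". A minimal sketch of such a key function (an illustration of the technique, not necessarily `AV.sort_files`' actual implementation):

import re

def natural_sort_key(s: str):
    # split into alternating non-digit/digit runs; compare digit runs as ints
    return [int(part) if part.isdigit() else part
            for part in re.split(r"(\d+)", s)]

files = ["resnet50-cifar-100", "resnet50-cifar-90", "resnet50-cifar-3000"]
assert sorted(files, key=natural_sort_key) == [
    "resnet50-cifar-90", "resnet50-cifar-100", "resnet50-cifar-3000"
]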
Example 3
    def test_TCAV_generate_all_activations(self) -> None:
        def forward_hook_wrapper(expected_act: Tensor):
            def forward_hook(module, inp, out=None):
                out = torch.reshape(out, (out.shape[0], -1))
                self.assertEqual(out.detach().shape[1:],
                                 expected_act.shape[1:])

            return forward_hook

        with tempfile.TemporaryDirectory() as tmpdirname:
            layers = ["conv1", "conv2", "fc1", "fc2"]
            tcav, concept_dict = init_TCAV(tmpdirname,
                                           CustomClassifier(),
                                           layers=layers)
            tcav.concepts = set(concept_dict.values())

            # generating all activations for given layers and concepts
            tcav.generate_all_activations()

            # verify that all activations exist and have correct shapes
            for layer in layers:
                for _, concept in concept_dict.items():
                    self.assertTrue(
                        AV.exists(tmpdirname, "default_model_id",
                                  concept.identifier, layer))

                concept_meta: Dict[int, int] = defaultdict(int)
                for _, concept in concept_dict.items():
                    activations = AV.load(tmpdirname, "default_model_id",
                                          concept.identifier, layer)

                    def batch_collate(batch):
                        return torch.cat(batch)

                    self.assertIsNotNone(concept.data_iter)
                    assert activations is not None
                    for activation in cast(
                            Iterable,
                            DataLoader(activations, collate_fn=batch_collate)):

                        concept_meta[concept.id] += activation.shape[0]

                        layer_module = _get_module_from_name(tcav.model, layer)

                        for data in cast(Iterable, concept.data_iter):
                            hook = layer_module.register_forward_hook(
                                forward_hook_wrapper(activation))
                            tcav.model(data)
                            hook.remove()

                # asserting the length of entire dataset for each concept
                for concept_meta_i in concept_meta.values():
                    self.assertEqual(concept_meta_i, 100)
Example 4
    def test_exists_without_version(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            av_0 = torch.randn(64, 16)
            self.assertFalse(
                AV.exists(tmpdir, "dummy", DEFAULT_IDENTIFIER,
                          "layer1.0.conv1"))

            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1",
                    av_0, "0")
            self.assertTrue(
                AV.exists(
                    tmpdir,
                    "dummy",
                    DEFAULT_IDENTIFIER,
                    "layer1.0.conv1",
                ))
Example 5
    def test_av_load_multiple_batches_per_layer(self) -> None:
        def save_load_and_assert_batch(layer_path, total_num_batches, batch,
                                       n_batch_name):
            # save n-th batch and verify the number of saved batches
            AV.save(
                tmpdir,
                model_id,
                DEFAULT_IDENTIFIER,
                "layer1.0.conv1",
                batch,
                n_batch_name,
            )
            loaded_dataset = AV.load(tmpdir, model_id, DEFAULT_IDENTIFIER,
                                     "layer1.0.conv1", n_batch_name)

            assertTensorAlmostEqual(self, next(iter(loaded_dataset)), batch,
                                    0.0)

            loaded_dataset_for_layer = AV.load(tmpdir, model_id,
                                               DEFAULT_IDENTIFIER,
                                               "layer1.0.conv1")
            self.assertEqual(
                len(loaded_dataset_for_layer),
                total_num_batches,
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            b0 = torch.randn(64, 16)
            b1 = torch.randn(64, 16)
            b2 = torch.randn(64, 16)

            model_id = "dummy"
            model_path = AV._assemble_model_dir(tmpdir, model_id)

            layer_path = AV._assemble_file_path(model_path, DEFAULT_IDENTIFIER,
                                                "layer1.0.conv1")

            # save first batch and verify the number of saved batches
            save_load_and_assert_batch(layer_path, 1, b0, "0")

            # save second batch and verify the number of saved batches
            save_load_and_assert_batch(layer_path, 2, b1, "1")

            # save third batch and verify the number of saved batches
            save_load_and_assert_batch(layer_path, 3, b2, "2")
Example 6
    def save_and_assert_batch(layer_path, total_num_batches, batch,
                              n_batch_name):
        # save n-th batch and verify the number of saved batches
        AV.save(
            tmpdir,
            model_id,
            DEFAULT_IDENTIFIER,
            "layer1.0.conv1",
            batch,
            n_batch_name,
        )
        self.assertEqual(
            len(glob.glob("/".join([layer_path, "*.pt"]))),
            total_num_batches,
        )
        self.assertTrue(
            AV.exists(tmpdir, model_id, DEFAULT_IDENTIFIER,
                      "layer1.0.conv1", n_batch_name))
Example 7
    def load_cavs(
        self, concepts: List[Concept]
    ) -> Tuple[List[str], Dict[Concept, List[str]]]:
        r"""
        This function loads CAVs as a dictionary of concept ids and
        layers. CAVs are stored in a directory located under the
        `self.save_path` path, in .pkl files with the format:
        <self.save_path>/<concept_ids>-<layer_name>.pkl. Ex.:
        "/cavs/0-1-2-inception4c.pkl", where 0, 1 and 2 are concept ids.

        It returns a list of layers and a dictionary mapping each concept
        to the layers for which its CAVs still need to be computed through
        training. This happens when CAVs aren't already pre-computed for a
        given list of concepts and layers.

        Args:
            concepts (list[Concept]): A list of Concept objects for which we want
                    to load the CAV.

        Returns:
            layers (list[layer]): A list of layers for which some CAVs still need
                    to be computed.
            concept_layers (dict[concept, layer]): A dictionary of concept-layers
                    mapping for which we need to perform CAV computation through
                    training.
        """

        concepts_key = concepts_to_str(concepts)

        layers = []
        concept_layers = defaultdict(list)

        for layer in self.layers:
            self.cavs[concepts_key][layer] = CAV.load(
                self.save_path, self.model_id, concepts, layer
            )

            # If CAVs aren't loaded
            if (
                concepts_key not in self.cavs
                or layer not in self.cavs[concepts_key]
                or not self.cavs[concepts_key][layer]
            ):

                layers.append(layer)
                # For all concepts in this experimental_set
                for concept in concepts:
                    # Collect not activated layers for this concept
                    if not AV.exists(
                        self.save_path, self.model_id, layer, concept.identifier
                    ):
                        concept_layers[concept].append(layer)
        return layers, concept_layers
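
For orientation, a minimal hypothetical sketch of calling `load_cavs` on a fresh save path; the toy model, concepts, and directory name below are illustrative assumptions, not part of the source above:

import torch
from torch.utils.data import DataLoader, TensorDataset
from captum.concept import Concept, TCAV

# toy model; nn.Sequential names its submodules "0", "1", ...
model = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU())
striped = Concept(0, "striped", DataLoader(TensorDataset(torch.randn(16, 8))))
random_ = Concept(1, "random", DataLoader(TensorDataset(torch.randn(16, 8))))

tcav = TCAV(model, layers="0", save_path="./toy_cavs/")
layers, concept_layers = tcav.load_cavs([striped, random_])

# on a fresh save path nothing is cached, so layer "0" needs CAV training
# and both concepts still need activations generated for it
print(layers)                   # ['0']
print(concept_layers[striped])  # ['0']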
Example 8
    def test_generate_activation(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            num_features = 4
            low, high = 0, 16
            mymodel = BasicLinearReLULinear(num_features)
            mydata = RangeDataset(low, high, num_features)
            layers: List[str] = [
                value[0] for value in mymodel.named_modules() if value[0]
            ]

            # First AV generation on last 2 layers
            inputs = torch.stack((mydata[1], mydata[8], mydata[14]))
            AV._compute_and_save_activations(tmpdir, mymodel, "model_id_1",
                                             layers[1:], inputs, "test", "0")

            av_test = AV._construct_file_search(tmpdir,
                                                "model_id_1",
                                                identifier="test")
            av_test = glob.glob(av_test)
            self.assertEqual(len(av_test), len(layers[1:]))

            # Second AV generation on first 2 layers.
            # Second layer overlaps with existing activations, should be loaded.
            inputs = torch.stack((mydata[0], mydata[7], mydata[13]))
            AV._compute_and_save_activations(tmpdir, mymodel, "model_id_1",
                                             layers[:2], inputs, "test", "0")

            av_test = AV._construct_file_search(tmpdir,
                                                "model_id_1",
                                                identifier="test")
            av_test = glob.glob(av_test)
            self.assertEqual(len(av_test), len(layers))
Example 9
    def test_av_load_all_layers_one_identifier(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            av_01 = torch.randn(36, 16)
            av_02 = torch.randn(16, 16)
            av_03 = torch.randn(4, 16)
            avs_0 = [av_01, av_02, av_03]

            av_11 = torch.randn(36, 16)
            av_12 = torch.randn(16, 16)
            av_13 = torch.randn(4, 16)
            avs_1 = [av_11, av_12, av_13]

            idf1, idf2 = "idf1", "idf2"

            AV.save(
                tmpdir,
                "dummy",
                idf1,
                ["layer1.0.conv1", "layer1.0.conv2", "layer1.1.conv1"],
                avs_0,
                "0",
            )
            dataloader = DataLoader(cast(Dataset, AV.load(tmpdir, "dummy")))
            self.assertEqual(len(dataloader), 3)

            AV.save(
                tmpdir,
                "dummy",
                idf2,
                ["layer1.0.conv1", "layer1.0.conv2", "layer1.1.conv1"],
                avs_1,
                "0",
            )
            dataloader = DataLoader(cast(Dataset, AV.load(tmpdir, "dummy")))
            self.assertEqual(len(dataloader), 6)

            # check activations for idf1
            dataloader_layer = DataLoader(
                cast(Dataset, AV.load(tmpdir, "dummy", identifier=idf1)))
            self.assertEqual(len(dataloader_layer), 3)

            for i, av in enumerate(dataloader_layer):
                assertTensorAlmostEqual(self, av, avs_0[i].unsqueeze(0))

            # check activations for idf2
            dataloader_layer = DataLoader(
                cast(Dataset, AV.load(tmpdir, "dummy", identifier=idf2)))
            self.assertEqual(len(dataloader_layer), 3)
            for i, av in enumerate(dataloader_layer):
                assertTensorAlmostEqual(self, av, avs_1[i].unsqueeze(0))
Example 10
    def test_exists_with_version(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            idf1 = str(int(datetime.now().microsecond))
            idf2 = "idf2"
            av_0 = torch.randn(64, 16)

            self.assertFalse(AV.exists(tmpdir, "dummy", idf1,
                                       "layer1.0.conv1"))
            self.assertFalse(AV.exists(tmpdir, "dummy", idf2,
                                       "layer1.0.conv1"))

            AV.save(tmpdir, "dummy", idf1, "layer1.0.conv1", av_0, "0")
            self.assertTrue(AV.exists(tmpdir, "dummy", idf1, "layer1.0.conv1"))
            self.assertFalse(AV.exists(tmpdir, "dummy", idf2,
                                       "layer1.0.conv1"))

            AV.save(tmpdir, "dummy", idf2, "layer1.0.conv1", av_0, "0")
            self.assertTrue(AV.exists(tmpdir, "dummy", idf2, "layer1.0.conv1"))
Example 11
    def test_av_save_multiple_batches_per_layer(self) -> None:
        def save_and_assert_batch(layer_path, total_num_batches, batch,
                                  n_batch_name):
            # save n-th batch and verify the number of saved batches
            AV.save(
                tmpdir,
                model_id,
                DEFAULT_IDENTIFIER,
                "layer1.0.conv1",
                batch,
                n_batch_name,
            )
            self.assertEqual(
                len(glob.glob("/".join([layer_path, "*.pt"]))),
                total_num_batches,
            )
            self.assertTrue(
                AV.exists(tmpdir, model_id, DEFAULT_IDENTIFIER,
                          "layer1.0.conv1", n_batch_name))

        with tempfile.TemporaryDirectory() as tmpdir:
            b0 = torch.randn(64, 16)
            b1 = torch.randn(64, 16)
            b2 = torch.randn(64, 16)

            model_id = "dummy"
            model_path = AV._assemble_model_dir(tmpdir, model_id)

            layer_path = AV._assemble_file_path(model_path, DEFAULT_IDENTIFIER,
                                                "layer1.0.conv1")

            # save first batch and verify the number of saved batches
            save_and_assert_batch(layer_path, 1, b0, "0")

            # save second batch and verify the number of saved batches
            save_and_assert_batch(layer_path, 2, b1, "1")

            # save third batch and verify the number of saved batches
            save_and_assert_batch(layer_path, 3, b2, "2")
Example 12
    def generate_activation(self, layers: Union[str, List], concept: Concept) -> None:
        r"""
        Computes layer activations for the specified `concept` and
        the list of layer(s) `layers`.

        Args:
            layers (str, list[str]): A list of layer names or a layer name
                    that is used to compute layer activations for the
                    specific `concept`.
            concept (Concept): A single Concept object that provides access
                    to concept examples using a data iterator.
        """
        layers = [layers] if isinstance(layers, str) else layers
        layer_modules = [_get_module_from_name(self.model, layer) for layer in layers]

        layer_act = LayerActivation(self.model, layer_modules)
        assert concept.data_iter is not None, (
            f"Data iterator for concept id: {concept.id} must be specified"
        )
        for i, examples in enumerate(concept.data_iter):
            activations = layer_act.attribute.__wrapped__(  # type: ignore
                layer_act,
                examples,
                attribute_to_layer_input=self.attribute_to_layer_input,
            )
            for activation, layer_name in zip(activations, layers):
                activation = torch.reshape(activation, (activation.shape[0], -1))
                AV.save(
                    self.save_path,
                    self.model_id,
                    concept.identifier,
                    layer_name,
                    activation.detach(),
                    str(i),
                )
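
A hedged usage sketch for `generate_activation`; the toy model, the tensor-yielding data iterator, and the save path are assumptions made for illustration:

import torch
from captum._utils.av import AV
from captum.concept import Concept, TCAV

model = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU())

# data_iter must yield plain tensors the model accepts; a list works here,
# although the Concept type hint expects a DataLoader
batches = [torch.randn(4, 8) for _ in range(3)]
concept = Concept(0, "stripes", batches)

tcav = TCAV(model, layers="0", save_path="./toy_cavs/")
tcav.generate_activation("0", concept)

# one .pt file per batch is written under roughly
# <save_path>/av/<model_id>/<concept.identifier>/
assert AV.exists("./toy_cavs/", tcav.model_id, concept.identifier, "0")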
Example 13
    def test_equal_activation(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            num_features = 4
            low, high = 0, 16
            mymodel = BasicLinearReLULinear(num_features)
            mydata = RangeDataset(low, high, num_features)
            layers: List[str] = [
                value[0] for value in mymodel.named_modules() if value[0]
            ]

            # Generate activations for the last layer only
            test_input = mydata[1].unsqueeze(0)
            model_id = "id_1"
            identifier = "test"
            num_id = "0"
            AV._compute_and_save_activations(tmpdir, mymodel, model_id,
                                             layers[2], test_input, identifier,
                                             num_id)
            act_dataset = AV.load(tmpdir, model_id, identifier, layers[2],
                                  num_id)
            _layer_act = [act.squeeze(0) for act in DataLoader(act_dataset)]
            act = torch.cat(_layer_act)
            out = mymodel(test_input)
            assertTensorAlmostEqual(self, out, act)
Example 14
        def save_load_and_assert_batch(layer_path, total_num_batches, batch,
                                       n_batch_name):
            # save n-th batch and verify the number of saved batches
            AV.save(
                tmpdir,
                model_id,
                DEFAULT_IDENTIFIER,
                "layer1.0.conv1",
                batch,
                n_batch_name,
            )
            loaded_dataset = AV.load(tmpdir, model_id, DEFAULT_IDENTIFIER,
                                     "layer1.0.conv1", n_batch_name)

            assertTensorAlmostEqual(self, next(iter(loaded_dataset)), batch,
                                    0.0)

            loaded_dataset_for_layer = AV.load(tmpdir, model_id,
                                               DEFAULT_IDENTIFIER,
                                               "layer1.0.conv1")
            self.assertEqual(
                len(loaded_dataset_for_layer),
                total_num_batches,
            )
Example 15
    def test_av_load_one_batch(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            av_0 = torch.randn(64, 16)
            av_1 = torch.randn(36, 16)
            avs = [av_0, av_1]

            # loading activations before anything is saved should fail
            model_id = "dummy"
            with self.assertRaises(RuntimeError) as context:
                AV.load(tmpdir, model_id)
            self.assertEqual(
                f"Activation vectors for model {model_id} "
                f"was not found at path {tmpdir}", str(context.exception))

            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1",
                    av_0, "0")
            model_id = "dummy"
            dataset = AV.load(tmpdir, model_id, identifier=DEFAULT_IDENTIFIER)

            for i, av in enumerate(DataLoader(cast(Dataset, dataset))):
                assertTensorAlmostEqual(self, av, avs[i].unsqueeze(0))

            # add av_1 to the list of activations
            dataloader_2 = DataLoader(
                cast(
                    Dataset,
                    AV.load(tmpdir, "dummy", DEFAULT_IDENTIFIER,
                            "layer1.0.conv2"),
                ))
            self.assertEqual(len(dataloader_2), 0)

            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv2",
                    av_1, "0")
            dataset = AV.load(tmpdir, "dummy", identifier=DEFAULT_IDENTIFIER)

            dataloader = DataLoader(cast(Dataset, dataset))
            self.assertEqual(len(dataloader), 2)
            for i, av in enumerate(dataloader):
                assertTensorAlmostEqual(self, av, avs[i].unsqueeze(0))
Example 16
    def test_av_save_two_layers(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            av_0 = torch.randn(64, 16)

            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1",
                    av_0, "0")
            self.assertTrue(
                AV.exists(tmpdir, "dummy", DEFAULT_IDENTIFIER,
                          "layer1.0.conv1"))
            self.assertFalse(
                AV.exists(tmpdir, "dummy", DEFAULT_IDENTIFIER,
                          "layer1.0.conv2"))

            # experimenting with adding to another layer
            av_1 = torch.randn(64, 16)
            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv2",
                    av_1, "0")
            self.assertTrue(
                AV.exists(tmpdir, "dummy", DEFAULT_IDENTIFIER,
                          "layer1.0.conv2"))
Example 17
    def test_model_ids_in_tcav(self) -> None:
        # creating concepts and mapping between concepts and their names
        concepts_dict = create_concepts()

        # defining experimental sets of different length
        experimental_set_list = [["striped", "random"], ["dotted", "random"]]
        experimental_sets = self._create_experimental_sets(
            experimental_set_list, concepts_dict)
        model = BasicModel_ConvNet()
        model.eval()
        layer = "conv2"
        inputs = 100 * get_inputs_tensor()

        with tempfile.TemporaryDirectory() as tmpdirname:
            tcav1 = TCAV(
                model,
                layer,
                model_id="my_basic_model1",
                classifier=CustomClassifier(),
                save_path=tmpdirname,
            )

            interpret1 = tcav1.interpret(inputs,
                                         experimental_sets=experimental_sets,
                                         target=0)

            tcav2 = TCAV(
                model,
                layer,
                model_id="my_basic_model2",
                classifier=CustomClassifier(),
                save_path=tmpdirname,
            )
            interpret2 = tcav2.interpret(inputs,
                                         experimental_sets=experimental_sets,
                                         target=0)

            # testing that different folders were created for two different
            # ids of the model
            self.assertTrue(
                AV.exists(
                    tmpdirname,
                    "my_basic_model1",
                    concepts_dict["striped"].identifier,
                    layer,
                ))
            self.assertTrue(
                AV.exists(
                    tmpdirname,
                    "my_basic_model2",
                    concepts_dict["striped"].identifier,
                    layer,
                ))
            for interpret1_elem, interpret2_elem in zip(
                    interpret1, interpret2):
                for interpret1_sub_elem, interpret2_sub_elem in zip(
                        interpret1[interpret1_elem],
                        interpret2[interpret2_elem]):
                    assertTensorAlmostEqual(
                        self,
                        interpret1[interpret1_elem][interpret1_sub_elem]
                        ["sign_count"],
                        interpret2[interpret2_elem][interpret2_sub_elem]
                        ["sign_count"],
                        0.0,
                    )
                    assertTensorAlmostEqual(
                        self,
                        interpret1[interpret1_elem][interpret1_sub_elem]
                        ["magnitude"],
                        interpret2[interpret2_elem][interpret2_sub_elem]
                        ["magnitude"],
                        0.0,
                    )
                    self.assertEqual(interpret1_sub_elem, interpret2_sub_elem)

                self.assertEqual(interpret1_elem, interpret2_elem)
Example 18
def train_cav(
    model_id,
    concepts: List[Concept],
    layers: Union[str, List[str]],
    classifier: Classifier,
    save_path: str,
    classifier_kwargs: Dict,
) -> Dict[str, Dict[str, CAV]]:
    r"""
    A helper function for parallel CAV computations that can be called
    from a python process.

    Please see the TCAV class documentation for further information.

    Args:
        model_id (str): A unique identifier for the PyTorch model for which
                we would like to load the layer activations and train a
                model in order to compute CAVs.
        concepts (list[Concept]): A list of Concept objects that are used
                to train a classifier and learn decision boundaries between
                those concepts for each layer defined in the `layers`
                argument.
        layers (str, list[str]): A list of layer names or a single layer
                name that is used to compute the activations of all concept
                examples per concept and train a classifier using those
                activations.
        classifier (Classifier): A custom classifier class, such as the
                Sklearn "linear_model" that allows us to train a model
                using the activation vectors extracted for a layer per concept.
                It also allows us to access trained weights of the classifier
                and the list of prediction classes.
        save_path (str): The path for storing Concept Activation
                Vectors (CAVs) and Activation Vectors (AVs).
        classifier_kwargs (dict): Additional named arguments that are passed to
                concept classifier's `train_and_eval` method.

    Returns:
        cavs (dict): A dictionary of CAV objects indexed by concept ids and
                layer names. It gives access to the weights of each concept
                in a given layer and model statistics such as accuracies
                that resulted in trained concept weights.
    """

    concepts_key = concepts_to_str(concepts)
    cavs: Dict[str, Dict[str, CAV]] = defaultdict()
    cavs[concepts_key] = defaultdict()
    layers = [layers] if isinstance(layers, str) else layers
    for layer in layers:

        # Create data loader to initialize the trainer.
        datasets = [
            AV.load(save_path, model_id, concept.identifier, layer)
            for concept in concepts
        ]

        labels = [concept.id for concept in concepts]

        labelled_dataset = LabelledDataset(cast(List[AV.AVDataset], datasets), labels)

        def batch_collate(batch):
            inputs, labels = zip(*batch)
            return torch.cat(inputs), torch.cat(labels)

        dataloader = DataLoader(labelled_dataset, collate_fn=batch_collate)

        classifier_stats_dict = classifier.train_and_eval(
            dataloader, **classifier_kwargs
        )
        classifier_stats_dict = (
            {} if classifier_stats_dict is None else classifier_stats_dict
        )

        weights = classifier.weights()
        assert (
            weights is not None and len(weights) > 0
        ), "Model weights connot be None or empty"

        classes = classifier.classes()
        assert (
            classes is not None and len(classes) > 0
        ), "Classes cannot be None or empty"

        classes = (
            cast(torch.Tensor, classes).detach().numpy()
            if isinstance(classes, torch.Tensor)
            else classes
        )
        cavs[concepts_key][layer] = CAV(
            concepts,
            layer,
            {"weights": weights, "classes": classes, **classifier_stats_dict},
            save_path,
            model_id,
        )
        # Saving cavs on the disk
        cavs[concepts_key][layer].save()

    return cavs
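
A hedged end-to-end sketch of `train_cav`: fake activations are pre-saved for two concepts, then a classifier is trained for one layer. `DefaultClassifier` delegates to scikit-learn, so sklearn must be installed; all names and shapes are illustrative assumptions:

import torch
from captum._utils.av import AV
from captum.concept import Concept
from captum.concept._utils.classifier import DefaultClassifier
from captum.concept._utils.common import concepts_to_str

save_path = "./toy_cavs/"
layer = "fc1"
striped = Concept(0, "striped", None)
random_ = Concept(1, "random", None)

# pre-save one batch of fake activations per concept for this layer
for concept in (striped, random_):
    AV.save(save_path, "toy_model", concept.identifier, layer,
            torch.randn(16, 8), "0")

cavs = train_cav("toy_model", [striped, random_], layer,
                 DefaultClassifier(), save_path, classifier_kwargs={})

key = concepts_to_str([striped, random_])  # e.g. "0-1"
print(cavs[key][layer].stats["weights"].shape)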
Example 19
    def influence(  # type: ignore[override]
        self,
        inputs: Union[Tensor, Tuple[Tensor, ...]],
        top_k: int = 1,
        additional_forward_args: Optional[Any] = None,
        load_src_from_disk: bool = True,
        **kwargs: Any,
    ) -> Dict:
        r"""
        Args:
            inputs (tensor or tuple of tensors): Batch of examples for which influential
                    instances are computed. They are passed to the forward_func. The
                    first dimension in `inputs` tensor or tuple of tensors corresponds
                    to the batch size. A tuple of tensors is only passed in if this
                    is the input form that `module` accepts.
            top_k (int): The number of top-matching activations to return
            additional_forward_args (optional):  Additional arguments that will be
                    passed to forward_func after inputs.
            load_src_from_disk (bool): Loads activations for `influence_src_dataset`
                    where possible. Setting to False would force regeneration of
                    activations.
            load_input_from_disk (bool): Regenerates activations for inputs by default
                    and removes previous `inputs` activations that are flagged with
                    `inputs_id`. Setting to True will load prior matching inputs
                    activations. Note that this could lead to unexpected behavior if
                    `inputs_id` is not configured properly and activations are loaded
                    for a different, prior `inputs`.
            inputs_id (str): Used to identify inputs for loading activations.

            **kwargs: Additional key-value arguments that are necessary for specific
                    implementation of `DataInfluence` abstract class.

        Returns:

            influences (dict): Returns the influential instances retrieved from
            `influence_src_dataset` for each test example represented through a
            tensor or a tuple of tensor in `inputs`. Returned influential
            examples are represented as dict, with keys corresponding to
            the layer names passed in `layers`. Each value in the dict is a
            tuple containing the indices and values for the top k similarities
            from `influence_src_dataset` by the chosen metric. The first value
            in the tuple corresponds to the indices corresponding to the top k
            most similar examples, and the second value is the similarity score.
            The batch dimension corresponds to the batch dimension of `inputs`.
            If inputs.shape[0] == 5, then dict[`layer_name`][0].shape[0] == 5.
            These tensors will be of shape (inputs.shape[0], top_k).
        """
        inputs_batch_size = (inputs[0].shape[0]
                             if isinstance(inputs, tuple) else inputs.shape[0])

        influences: Dict[str, Any] = {}

        layer_AVDatasets = AV.generate_dataset_activations(
            self.activation_dir,
            self.module,
            self.model_id,
            self.layers,
            DataLoader(self.influence_src_dataset,
                       self.batch_size,
                       shuffle=False),
            identifier="src",
            load_from_disk=load_src_from_disk,
            return_activations=True,
        )

        assert layer_AVDatasets is not None and not isinstance(
            layer_AVDatasets, AV.AVDataset)

        layer_modules = [
            common._get_module_from_name(self.module, layer)
            for layer in self.layers
        ]
        test_activations = LayerActivation(self.module,
                                           layer_modules).attribute(
                                               inputs, additional_forward_args)

        minmax = self.similarity_direction == "max"

        # av_inputs shape: (inputs_batch_size, *) e.g. (inputs_batch_size, N, C, H, W)
        # av_src shape: (self.batch_size, *) e.g. (self.batch_size, N, C, H, W)
        test_activations = (test_activations
                            if len(self.layers) > 1 else [test_activations])
        for i, (layer, layer_AVDataset) in enumerate(
                zip(self.layers, layer_AVDatasets)):
            topk_val, topk_idx = torch.Tensor(), torch.Tensor().long()
            zero_acts = torch.Tensor().long()

            av_inputs = test_activations[i]
            src_loader = DataLoader(layer_AVDataset)
            for j, av_src in enumerate(src_loader):
                av_src = av_src.squeeze(0)

                similarity = self.similarity_metric(av_inputs, av_src)
                msg = (
                    "Output of custom similarity does not meet required dimensions. "
                    f"Your output has shape {similarity.shape}.\nPlease ensure the "
                    "output shape matches (inputs_batch_size, src_dataset_batch_size), "
                    f"which should be {(inputs_batch_size, self.batch_size)}.")
                assert similarity.shape == (inputs_batch_size,
                                            av_src.shape[0]), msg
                if hasattr(self, "replace_nan"):
                    idx = (similarity == self.replace_nan).nonzero()
                    zero_acts = torch.cat((zero_acts, idx))
                r"""
                TODO: For models that can have tuples as activations, we should
                allow similarity metrics to accept tuples, support topk selection.
                """

                topk_batch = min(top_k, self.batch_size)
                values, indices = torch.topk(similarity,
                                             topk_batch,
                                             dim=1,
                                             largest=minmax)
                indices += int(j * self.batch_size)

                topk_val = torch.cat((topk_val, values), dim=1)
                topk_idx = torch.cat((topk_idx, indices), dim=1)

                # can modify how often to sort for efficiency? minor
                sort_idx = torch.argsort(topk_val, dim=1, descending=minmax)
                topk_val = torch.gather(topk_val, 1, sort_idx[:, :top_k])
                topk_idx = torch.gather(topk_idx, 1, sort_idx[:, :top_k])

            influences[layer] = (topk_idx, topk_val)

            if torch.numel(zero_acts) != 0:
                zero_warning = (
                    f"Layer {layer} has zero-vector activations for some inputs. This "
                    "may cause undefined behavior for cosine similarity. The indices "
                    "for the offending inputs will be included under the key "
                    f"'zero_acts-{layer}' in the output dictionary. Indices are "
                    "returned as a tensor with [inputs_idx, src_dataset_idx] pairs "
                    "which may have corrupted similarity scores.")
                warnings.warn(zero_warning, RuntimeWarning)
                key = "-".join(["zero_acts", layer])
                influences[key] = zero_acts

        return influences
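
Assuming this `influence` method belongs to captum's `SimilarityInfluence`, a minimal hypothetical driver follows; the toy model, dataset, and directory are illustrative, and the source dataset must yield plain tensors, since its batches are fed to the model directly:

import torch
from torch.utils.data import Dataset
from captum.influence import SimilarityInfluence

class ToyData(Dataset):
    def __init__(self) -> None:
        self.samples = torch.randn(16, 8)

    def __len__(self) -> int:
        return len(self.samples)

    def __getitem__(self, idx) -> torch.Tensor:
        return self.samples[idx]

model = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU())
sim = SimilarityInfluence(
    model,
    layers="0",                       # nn.Sequential names this Linear "0"
    influence_src_dataset=ToyData(),
    activation_dir="./toy_acts/",
    model_id="toy_model",
    batch_size=4,
)

influences = sim.influence(torch.randn(2, 8), top_k=3)
top_idx, top_val = influences["0"]  # each of shape (2, 3)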
Example 20
    def test_av_save_multi_layer(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            av_0 = torch.randn(64, 16)
            av_1 = torch.randn(64, 16)
            av_2 = torch.randn(64, 16)

            model_path = AV._assemble_model_dir(tmpdir, "dummy")

            # save first layer
            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1",
                    av_0, "0")
            self.assertEqual(len(glob.glob(model_path + "*")), 1)

            # add two new layers at once
            AV.save(
                tmpdir,
                "dummy",
                DEFAULT_IDENTIFIER,
                ["layer1.0.conv2", "layer1.1.conv1"],
                [av_1, av_2],
                "0",
            )

            self.assertEqual(len(glob.glob(model_path + "/*/*/*")), 3)

            # overwrite the first saved layer
            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1",
                    av_0, "0")
            self.assertEqual(len(glob.glob(model_path + "/*/*/*")), 3)

            # save a new version of the first layer
            idf1 = str(int(datetime.now().microsecond))
            self.assertFalse(AV.exists(tmpdir, "dummy", idf1,
                                       "layer1.0.conv1"))
            AV.save(tmpdir, "dummy", idf1, "layer1.0.conv1", av_0, "0")

            self.assertTrue(AV.exists(tmpdir, "dummy", idf1, "layer1.0.conv1"))
            self.assertEqual(len(glob.glob(model_path + "/*/*/*")), 4)
Example 21
    def test_generate_dataset_activations(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            num_features = 4
            low, high = 0, 16
            batch_size = high // 2
            mymodel = BasicLinearReLULinear(num_features)
            mydata = RangeDataset(low, high, num_features)
            layers: List[str] = [
                value[0] for value in mymodel.named_modules() if value[0]
            ]

            # First AV generation on last 2 layers
            layer_AVDatasets = AV.generate_dataset_activations(
                tmpdir,
                mymodel,
                "model_id1",
                layers[1:],
                DataLoader(mydata, batch_size, shuffle=False),
                "src",
                return_activations=True,
            )

            av_src = AV._construct_file_search(tmpdir,
                                               model_id="model_id1",
                                               identifier="src")
            av_src = glob.glob(av_src)
            self.assertEqual(len(av_src), high / batch_size * len(layers[1:]))

            self.assertTrue(isinstance(layer_AVDatasets, list))
            layer_AVDatasets = cast(list, layer_AVDatasets)
            self.assertEqual(len(layer_AVDatasets), len(layers[1:]))
            for layer_AVDataset in layer_AVDatasets:
                self.assertEqual(len(layer_AVDataset), high / batch_size)

            # Second AV generation on first 2 layers.
            # Second layer overlaps with existing activations, should be loaded.
            layer_AVDatasets = AV.generate_dataset_activations(
                tmpdir,
                mymodel,
                "model_id1",
                layers[:2],
                DataLoader(mydata, batch_size, shuffle=False),
                "src",
                return_activations=True,
            )

            av_src = AV._construct_file_search(tmpdir,
                                               model_id="model_id1",
                                               identifier="src")
            av_src = glob.glob(av_src)
            self.assertEqual(len(av_src), high / batch_size * len(layers))

            self.assertTrue(isinstance(layer_AVDatasets, list))
            layer_AVDatasets = cast(list, layer_AVDatasets)
            self.assertEqual(len(layer_AVDatasets), len(layers[:2]))
            for layer_AVDataset in layer_AVDatasets:
                self.assertEqual(len(layer_AVDataset), high / batch_size)

            # check that if return_activations is False, None is returned
            self.assertIsNone(
                AV.generate_dataset_activations(
                    tmpdir,
                    mymodel,
                    "model_id1",
                    layers[:2],
                    DataLoader(mydata, batch_size, shuffle=False),
                    "src",
                    return_activations=False,
                ))
Example 22
    def test_av_load_all_identifiers_one_layer(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            av_0 = torch.randn(64, 16)
            av_1 = torch.randn(36, 16)
            av_2 = torch.randn(16, 16)
            av_3 = torch.randn(4, 16)
            avs = [av_1, av_2, av_3]

            idf1, idf2, idf3 = "idf1", "idf2", "idf3"

            AV.save(tmpdir, "dummy", DEFAULT_IDENTIFIER, "layer1.0.conv1",
                    av_0, "0")
            dataloader = DataLoader(
                cast(Dataset,
                     AV.load(tmpdir, "dummy", identifier=DEFAULT_IDENTIFIER)))
            self.assertEqual(len(dataloader), 1)

            # add activations for another layer
            AV.save(tmpdir, "dummy", idf1, "layer1.0.conv2", av_1, "0")
            AV.save(tmpdir, "dummy", idf2, "layer1.0.conv2", av_2, "0")
            AV.save(tmpdir, "dummy", idf3, "layer1.0.conv2", av_3, "0")
            dataloader_layer = DataLoader(
                cast(
                    Dataset,
                    AV.load(
                        tmpdir,
                        "dummy",
                        layer="layer1.0.conv2",
                    ),
                ))

            self.assertEqual(len(dataloader_layer), 3)
            for i, av in enumerate(dataloader_layer):
                assertTensorAlmostEqual(self, av, avs[i].unsqueeze(0))

            dataloader = DataLoader(cast(Dataset, AV.load(tmpdir, "dummy")))
            self.assertEqual(len(dataloader), 4)