Example #1
  def test_tcav_sample_from_positive(self):
    # Tests the case where more concept examples are passed than non-concept
    # examples, so the concept set is sampled from the concept examples.

    random.seed(0)  # Sets seed since create_comparison_splits() uses random.

    # Basic test with dummy outputs from the model.
    examples = [
        {'sentence': 'a'},
        {'sentence': 'b'},
        {'sentence': 'c'},
        {'sentence': 'd'},
        {'sentence': 'e'},
        {'sentence': 'f'},
        {'sentence': 'g'},
        {'sentence': 'h'}]

    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                         indexed_examples=indexed_inputs)
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[3]['id'],
                            indexed_inputs[4]['id'],
                            indexed_inputs[7]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }
    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)

    self.assertLen(result, 1)
    expected = {
        'p_val': 0.80489,
        'random_mean': 0.53333,
        'result': {
            'score': 0.8,
            'cos_sim': [
                0.09527, -0.20442, 0.05141,
                0.14985, 0.06750, -0.28244,
                -0.11022, -0.14479
            ],
            'dot_prods': [
                152.48776, -335.64998, 82.99588,
                247.80113, 109.53684, -461.81805,
                -181.29095, -239.47817
            ],
            'accuracy': 1.0
        }
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])
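These TCAV snippets assume a test class whose setUp() builds the interpreter and a cached model. Below is a minimal sketch of that scaffolding, modeled on Example #4 further down; the BERT_TINY_PATH constant and the SST-2 model choice are assumptions, not part of the original snippet.

import random

from absl.testing import absltest
from lit_nlp.api import dataset as lit_dataset
from lit_nlp.components import tcav
from lit_nlp.examples.models import glue_models
from lit_nlp.lib import caching
from lit_nlp.lib import testing_utils

BERT_TINY_PATH = '/path/to/bert_tiny'  # Hypothetical checkpoint path.


class TCAVTest(absltest.TestCase):

  def setUp(self):
    super().setUp()
    self.tcav = tcav.TCAV()
    # Wrap the model so repeated predict calls are served from the cache.
    self.model = caching.CachingModelWrapper(
        glue_models.SST2Model(BERT_TINY_PATH), 'test')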
Example #2
  def test_tcav(self):
    random.seed(0)  # Sets seed since create_comparison_splits() uses random.

    # Basic test with dummy outputs from the model.
    examples = [
        {'sentence': 'a'},
        {'sentence': 'b'},
        {'sentence': 'c'},
        {'sentence': 'd'},
        {'sentence': 'e'},
        {'sentence': 'f'},
        {'sentence': 'g'},
        {'sentence': 'h'},
        {'sentence': 'i'}]

    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                         indexed_examples=indexed_inputs)
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[3]['id'],
                            indexed_inputs[7]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0,
        'dataset_name': 'test'
    }
    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)

    self.assertLen(result, 1)
    expected = {
        'p_val': 0.13311,
        'random_mean': 0.56667,
        'result': {
            'score': 0.33333,
            'cos_sim': [
                0.088691, -0.12179, 0.16013,
                0.24840, -0.09793, 0.05166,
                -0.21578, -0.06560, -0.14759
            ],
            'dot_prods': [
                189.085096, -266.36317, 344.350498,
                547.144949, -211.663965, 112.502439,
                -472.72066, -144.529598, -323.31888
            ],
            'accuracy': 0.66667
        }
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])
Example #3
    def setUp(self):
        super().setUp()
        dataset = lit_dataset.IndexedDataset(base=RegressionTestDataset(),
                                             id_fn=caching.input_hash)
        self._dataset = dataset
        self._model = RegressionTestModel()
        self._gen = minimal_targeted_counterfactuals.TabularMTC()

        self._example = {'x_1': 1.0, 'x_2': 1.0}

        self._config = {
            'Prediction key': 'score',
            'dataset_name': 'regression_test_dataset'
        }
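RegressionTestDataset and RegressionTestModel are helper classes defined elsewhere in this test file. A hedged sketch of what such helpers can look like; the example grid and the additive scoring rule are illustrative assumptions, not the real definitions.

from lit_nlp.api import dataset as lit_dataset
from lit_nlp.api import model as lit_model
from lit_nlp.api import types as lit_types


class RegressionTestDataset(lit_dataset.Dataset):

  def __init__(self):
    # A small grid of tabular examples (assumed shape).
    self._examples = [{'x_1': float(i), 'x_2': float(j)}
                      for i in range(5) for j in range(5)]

  def spec(self):
    return {'x_1': lit_types.Scalar(), 'x_2': lit_types.Scalar()}


class RegressionTestModel(lit_model.Model):

  def input_spec(self):
    return {'x_1': lit_types.Scalar(), 'x_2': lit_types.Scalar()}

  def output_spec(self):
    return {'score': lit_types.RegressionScore()}

  def predict_minibatch(self, inputs):
    # Toy scoring rule; the real model's function is an assumption here.
    return [{'score': ex['x_1'] + ex['x_2']} for ex in inputs]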
Example #4
    def setUp(self):
        super(ThresholderTest, self).setUp()
        self.thresholder = thresholder.Thresholder()
        self.model = caching.CachingModelWrapper(
            glue_models.SST2Model(BERT_TINY_PATH), 'test')
        examples = [{
            'sentence': 'a',
            'label': '1'
        }, {
            'sentence': 'b',
            'label': '1'
        }, {
            'sentence': 'c',
            'label': '1'
        }, {
            'sentence': 'd',
            'label': '1'
        }, {
            'sentence': 'e',
            'label': '1'
        }, {
            'sentence': 'f',
            'label': '0'
        }, {
            'sentence': 'g',
            'label': '0'
        }, {
            'sentence': 'h',
            'label': '0'
        }, {
            'sentence': 'i',
            'label': '0'
        }]

        self.indexed_inputs = [{
            'id': caching.input_hash(ex),
            'data': ex
        } for ex in examples]
        self.dataset = lit_dataset.IndexedDataset(
            id_fn=caching.input_hash,
            spec={
                'sentence': lit_types.TextSegment(),
                'label': lit_types.CategoryLabel(vocab=['0', '1'])
            },
            indexed_examples=self.indexed_inputs)
        self.model_outputs = list(
            self.model.predict_with_metadata(self.indexed_inputs,
                                             dataset_name='test'))
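With this fixture in place, a test typically calls the component's standard entry point. A minimal usage sketch, assuming the default config is acceptable:

result = self.thresholder.run_with_metadata(
    self.indexed_inputs, self.model, self.dataset,
    self.model_outputs, config=None)
# Expected shape (an assumption): one entry per classification head,
# carrying the candidate decision thresholds.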
Example #5
    def setUp(self):
        super().setUp()
        dataset = lit_dataset.IndexedDataset(base=ClassificationTestDataset(),
                                             id_fn=caching.input_hash)
        self._dataset = dataset
        self._model = ClassificationTestModel(self._dataset)
        self._gen = minimal_targeted_counterfactuals.TabularMTC()

        self._example = {
            'size': 'large',
            'weight': 1.2,
            'legs': False,
            'description': 'big water animal',
            'animal': 'whale'
        }

        self._config = {
            'Prediction key': 'preds',
            'dataset_name': 'classification_test_dataset'
        }
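A hedged usage sketch for the fixture above: LIT generators expose generate(example, model, dataset, config), so the tests exercise it roughly like this.

counterfactuals = self._gen.generate(
    self._example, self._model, self._dataset, self._config)
# Each returned dict is a candidate counterfactual for self._example,
# intended to change the prediction named by 'Prediction key'.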
Example #6
    def test_run_nn(self):
        examples = [
            {
                'segment': 'a'
            },
            {
                'segment': 'b'
            },
            {
                'segment': 'c'
            },
        ]
        indexed_inputs = [{
            'id': caching.input_hash(ex),
            'data': ex
        } for ex in examples]

        model = TestModelNearestNeighbors()
        dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                             indexed_examples=indexed_inputs)
        config = {
            'embedding_name': 'input_embs',
            'num_neighbors': 2,
        }
        result = self.nearest_neighbors.run_with_metadata([indexed_inputs[1]],
                                                          model,
                                                          dataset,
                                                          config=config)
        # IDs come from caching.input_hash, so reference them through
        # indexed_inputs rather than hard-coding positional strings.
        expected = {
            'nearest_neighbors': [{
                'id': indexed_inputs[1]['id'],
                'nn_distance': 0.0
            }, {
                'id': indexed_inputs[0]['id'],
                'nn_distance': 1.7320508075688772
            }]
        }

        self.assertLen(result, 1)
        testing_utils.assert_deep_almost_equal(self, expected, result[0])
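TestModelNearestNeighbors is defined elsewhere in the test file; it must expose an 'input_embs' output for the interpreter to compute distances over. A hedged sketch, with embedding vectors chosen (as an assumption) so that example 'b' is at distance 0.0 from itself and sqrt(3) ~ 1.732 from 'a', matching the expected output above.

import numpy as np

from lit_nlp.api import model as lit_model
from lit_nlp.api import types as lit_types


class TestModelNearestNeighbors(lit_model.Model):

  def input_spec(self):
    return {'segment': lit_types.TextSegment()}

  def output_spec(self):
    return {'input_embs': lit_types.TokenEmbeddings(align='tokens')}

  def predict_minibatch(self, inputs):
    # One 3-d embedding per single-token example (assumed values).
    embs = {'a': [0.0, 0.0, 0.0], 'b': [1.0, 1.0, 1.0], 'c': [5.0, 5.0, 5.0]}
    return [{'input_embs': np.array([embs[ex['segment']]])} for ex in inputs]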
Example #7
  def setUp(self):
    super(PdpTest, self).setUp()
    self.pdp = pdp.PdpInterpreter()
    self.reg_model = TestRegressionPdp()
    self.class_model = TestClassificationPdp()
    examples = [
        {
            'num': 1,
            'cats': 'One',
        },
        {
            'num': 10,
            'cats': 'None',
        },
        {
            'num': 5,
            'cats': 'One',
        },
    ]
    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    self.dataset = lit_dataset.IndexedDataset(
        spec=self.reg_model.input_spec(), id_fn=caching.input_hash,
        indexed_examples=indexed_inputs)
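A hedged sketch of how this fixture is then exercised; the 'feature' config key (naming the input field to sweep) is an assumption about the interpreter's interface.

result = self.pdp.run_with_metadata(
    self.dataset.indexed_examples, self.reg_model, self.dataset,
    config={'feature': 'num'})
# Expected shape (an assumption): a mapping from prediction key to a
# partial-dependence curve over values of 'num'.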
Example #8
  def test_relative_tcav(self):
    # Tests passing in a negative set.

    random.seed(0)  # Sets seed since create_comparison_splits() uses random.

    # Basic test with dummy outputs from the model.
    examples = [
        {'sentence': 'happy'},  # 0
        {'sentence': 'sad'},  # 1
        {'sentence': 'good'},  # 2
        {'sentence': 'bad'},  # 3
        {'sentence': 'pretty'},  # 4
        {'sentence': 'ugly'},  # 5
        {'sentence': 'sweet'},  # 6
        {'sentence': 'bitter'},  # 7
        {'sentence': 'well'},  # 8
        {'sentence': 'poor'},  # 9
        {'sentence': 'compelling'},  # 10
        {'sentence': 'boring'},  # 11
        {'sentence': 'pleasing'},  # 12
        {'sentence': 'gross'},  # 13
        {'sentence': 'blue'},  # 14
        {'sentence': 'red'},  # 15
        {'sentence': 'flower'},  # 16
        {'sentence': 'bee'},  # 17
        {'sentence': 'snake'},  # 18
        {'sentence': 'windshield'},  # 19
        {'sentence': 'plant'},  # 20
        {'sentence': 'scary'},  # 21
        {'sentence': 'pencil'},  # 22
        {'sentence': 'hello'}  # 23
    ]

    indexed_inputs = [{'id': caching.input_hash(ex), 'data': ex}
                      for ex in examples]
    dataset = lit_dataset.IndexedDataset(id_fn=caching.input_hash,
                                         indexed_examples=indexed_inputs)

    # This first example doesn't have enough examples for statistical testing,
    # so the returned p-value is None.
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[4]['id']],
        'negative_set_ids': [indexed_inputs[1]['id'],
                             indexed_inputs[3]['id'],
                             indexed_inputs[5]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }

    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)

    self.assertLen(result, 1)
    expected = {
        'result': {
            'score': 1.0,
            'cos_sim': [
                0.9999999581246426, 0.049332143689572144, 0.8987945047547466,
                -0.41858423757857954, 0.6908297036543664, -0.5167857909664919,
                0.8423017503220364, -0.005793079244916016, 0.8334491603894322,
                -0.4054645113448612, 0.7616102123736647, -0.4578596155267783,
                0.8366905563807711, -0.27390786544756535, 0.7325538474066896,
                0.5190287630768531, 0.8145227936096425, 0.02005592868363552,
                -0.1143256029298114, -0.1221480700842533, 0.6852995739227957,
                0.3984620730733816, 0.5211149530112407, 0.5909723902471223
            ],
            'dot_prods': [
                1385.1480610241554, 69.95638452724207, 1239.4947646060161,
                -595.253135700978, 971.5880156862692, -725.0749813217176,
                1182.8641913758102, -8.149647641120662, 1146.5803071544124,
                -576.4043054391316, 1038.3510704649307, -648.097269442522,
                1154.4720122394317, -378.32103870822493, 1024.066390571124,
                738.6959135414066, 1139.7963358416857, 28.691395032352318,
                -167.37808507284706, -176.4474746971391, 959.5159619261449,
                562.8772536987927, 716.7270332848395, 840.7031847912738
            ],
            'accuracy': 0.5
        },
        'p_val': None,
        'random_mean': 0.9285714285714286,
        'split_size': 3,
        'num_runs': 1
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])

    # This example has enough inputs for two runs of size 3.
    config = {
        'concept_set_ids': [
            indexed_inputs[1]['id'], indexed_inputs[2]['id'],
            indexed_inputs[4]['id'], indexed_inputs[5]['id'],
            indexed_inputs[10]['id'], indexed_inputs[9]['id']
        ],
        'negative_set_ids': [
            indexed_inputs[0]['id'], indexed_inputs[3]['id'],
            indexed_inputs[12]['id'], indexed_inputs[6]['id'],
            indexed_inputs[7]['id'], indexed_inputs[8]['id']
        ],
        'class_to_explain': '0',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }

    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)
    self.assertLen(result, 1)
    expected = {
        'result': {
            'score': 0.0,
            'cos_sim': [
                0.2731987606830683, 0.427838045403812, 0.3166440584420665,
                -0.1358964965831398, 0.5616614702946262, -0.16511808390168164,
                -0.05103355252438478, -0.16945565920473257, 0.28148962348967155,
                -0.18169036476392003, 0.33244873698665106, -0.13316476546155087,
                0.15226772288202886, -0.05534469666649352, 0.2886150002073456,
                0.33888135113008555, 0.12875301375254147, 0.046908665182593096,
                -0.052445114502024985, 0.088858405172313, 0.219517174438115,
                0.35833013079793435, 0.2291162415605806, 0.3635686086637199
            ],
            'dot_prods': [
                452.17220644153525, 724.9460578876271, 521.776546745851,
                -230.9170522777958, 943.8754747127095, -276.8190148523963,
                -85.63511897570154, -284.8487792023684, 462.71830216201926,
                -308.62790255581496, 541.5830529968077, -225.2299308998058,
                251.04716264718752, -91.33998249705493, 482.0991668852444,
                576.3029773313335, 215.28329927312336, 80.18458502795752,
                -91.74640483442752, 153.37559992294862, 367.2562273288043,
                604.8378479001944, 376.53473821563625, 618.003311205616
            ],
            'accuracy': 0.5
        },
        'p_val': 0.42264973081037427,
        'random_mean': 0.0,
        'split_size': 3,
        'num_runs': 2
    }

    testing_utils.assert_deep_almost_equal(self, expected, result[0])

    # This example has enough examples for three runs of size 3 and two runs of
    # size 5, and returns results with p-value < 0.05.
    config = {
        'concept_set_ids': [indexed_inputs[0]['id'],
                            indexed_inputs[1]['id'],
                            indexed_inputs[2]['id'],
                            indexed_inputs[3]['id'],
                            indexed_inputs[4]['id'],
                            indexed_inputs[5]['id'],
                            indexed_inputs[6]['id'],
                            indexed_inputs[7]['id'],
                            indexed_inputs[8]['id'],
                            indexed_inputs[9]['id']],
        'negative_set_ids': [indexed_inputs[10]['id'],
                             indexed_inputs[11]['id'],
                             indexed_inputs[12]['id'],
                             indexed_inputs[13]['id'],
                             indexed_inputs[14]['id'],
                             indexed_inputs[15]['id'],
                             indexed_inputs[16]['id'],
                             indexed_inputs[17]['id'],
                             indexed_inputs[18]['id'],
                             indexed_inputs[19]['id']],
        'class_to_explain': '1',
        'grad_layer': 'cls_grad',
        'random_state': 0
    }

    result = self.tcav.run_with_metadata(indexed_inputs, self.model, dataset,
                                         config=config)
    self.assertLen(result, 1)
    expected = [{
        'result': {
            'score': 0.42857142857142855,
            'cos_sim': [
                -0.1107393877916321, -0.0993967046974328, -0.2214985917242054,
                0.08132588965575606, -0.3590211572508748, 0.18708109817461333,
                0.000724498781128839, 0.09700473783330398, -0.25015742815240055,
                0.16108236033785076, -0.10283274286140846, 0.0972663321478731,
                -0.05924679176256152, -0.048499696342091746,
                -0.4357117016074766, -0.593245752003111, -0.3645147796989344,
                -0.5507605083253673, -0.27914997949782694, -0.30908550968594417,
                -0.5584676299422896, -0.16983339994284577, -0.42587740852240746,
                -0.37482298817032594
            ],
            'dot_prods': [
                -261.4389298435066, -240.23776409902007, -520.6275907607769,
                197.11495117497446, -860.6035066083074, 447.3775519523981,
                1.7341104803878409, 232.59170976304426, -586.5576327736542,
                390.2961568516803, -238.95427152619726, 234.6617547723058,
                -139.3334215524385, -114.17392512371171, -1038.149036709951,
                -1439.0663895591745, -869.3828698612926, -1342.899780229334,
                -696.569760699206, -760.9907977738051, -1332.7284530349625,
                -408.90435403478875, -998.3360993150825, -908.8111404537224
            ],
            'accuracy': 0.75
        },
        'p_val': 0.04400624968940752,
        'random_mean': 0.9642857142857143,
        'split_size': 5,
        'num_runs': 2
    }]
    testing_utils.assert_deep_almost_equal(self, expected, result)
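Every example above ends with testing_utils.assert_deep_almost_equal. The real helper ships with LIT; the following is only a hedged re-implementation showing the behavior the snippets rely on, namely recursive comparison with a tolerance on floats.

def assert_deep_almost_equal(testcase, expected, actual, places=4):
  # Illustrative re-implementation; the actual helper lives in
  # lit_nlp.lib.testing_utils and may differ in details.
  if isinstance(expected, dict):
    testcase.assertEqual(set(expected), set(actual))
    for key in expected:
      assert_deep_almost_equal(testcase, expected[key], actual[key], places)
  elif isinstance(expected, (list, tuple)):
    testcase.assertLen(actual, len(expected))
    for exp, act in zip(expected, actual):
      assert_deep_almost_equal(testcase, exp, act, places)
  elif isinstance(expected, float):
    testcase.assertAlmostEqual(actual, expected, places=places)
  else:
    testcase.assertEqual(actual, expected)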