Example #1
    def test_indexed_string_inverse_removing_tokenizer(self):
        s = 'This is a good movie. This, it is a great movie.'

        def tokenizer(string):
            return re.split(r'(?:\W+)|$', string)

        indexed_string = IndexedString(s, tokenizer)

        self.assertEqual(s, indexed_string.inverse_removing([]))
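
A minimal follow-on sketch (not part of the original test): with lime's defaults (bow=True, split_expression=r'\W+'), the ids passed to inverse_removing are vocabulary ids, so removing one id drops every occurrence of that word.

# Sketch, assuming lime.lime_text.IndexedString with its defaults.
from lime.lime_text import IndexedString

s = 'This is a good movie. This, it is a great movie.'
indexed = IndexedString(s)
print(indexed.word(0))                # 'This'
print(indexed.inverse_removing([0]))  # both occurrences of 'This' removed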
Example #2
    def test_indexed_string_regex(self):
        s = 'Please, take your time. Please'
        tokenized_string = np.array(
            ['Please', ', ', 'take', ' ', 'your', ' ', 'time', '. ', 'Please'])
        inverse_vocab = ['Please', 'take', 'your', 'time']
        start_positions = [0, 6, 8, 12, 13, 17, 18, 22, 24]
        positions = [[0, 8], [2], [4], [6]]
        indexed_string = IndexedString(s)

        self.assertTrue(np.array_equal(indexed_string.as_np, tokenized_string))
        self.assertTrue(
            np.array_equal(indexed_string.string_start, start_positions))
        self.assertTrue(indexed_string.inverse_vocab == inverse_vocab)
        self.assertTrue(np.array_equal(indexed_string.positions, positions))
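
To relate the arrays asserted above, a small sketch using the same IndexedString attributes the test exercises: positions[0] == [0, 8] because 'Please' (inverse_vocab id 0) occupies token slots 0 and 8 of the tokenized string, and string_start[8] == 24 is the character offset of the second 'Please'.

from lime.lime_text import IndexedString

s = 'Please, take your time. Please'
ix = IndexedString(s)
print(ix.word(0))          # 'Please'
print(ix.positions[0])     # [0, 8] -> token slots holding 'Please'
print(ix.string_start[8])  # 24 -> char offset of the second 'Please'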
Example #3
    def str_to_pair_of_tuples(self, strs):
        dataframe = pd.DataFrame(columns=self.columns)
        for row in strs:
            this_pair_row = defaultdict(str)
            row = IndexedString(row, split_expression=" ")
            for attr_index in self.schema.keys():
                left_tokens = self.__indexes_of_attr(
                    row, self.__make_attr(attr_index, tuple="left"))
                right_tokens = self.__indexes_of_attr(
                    row, self.__make_attr(attr_index, tuple="right"))

                this_pair_row[self.lprefix +
                              self.schema[attr_index]] = " ".join([
                                  self.__split_attr(row.word(t))[2]
                                  for t in left_tokens
                              ])
                this_pair_row[self.rprefix +
                              self.schema[attr_index]] = " ".join([
                                  self.__split_attr(row.word(t))[2]
                                  for t in right_tokens
                              ])

            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # idiomatic replacement
            dataframe = pd.concat(
                [dataframe, pd.DataFrame([this_pair_row])], ignore_index=True)
        return dataframe
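
A hedged sketch of the tokenization this method relies on. The private helpers (__indexes_of_attr, __make_attr, __split_attr) are not shown in this snippet, so the row format below -- space-separated '<tuple>_<attr>_<value>' tokens -- is an assumption inferred from the [2] index into __split_attr's result.

from lime.lime_text import IndexedString

# Assumed (hypothetical) row format: 'left_name_anne right_name_anna'.
row = IndexedString("left_name_anne right_name_anna", split_expression=" ")
print(row.word(0))                   # 'left_name_anne'
print(row.word(0).split("_", 2)[2])  # 'anne' -- the value part, matching
                                     # what __split_attr(...)[2] suggests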
Example #4
    def test_indexed_string_callable(self):
        s = 'aabbccddaa'

        def tokenizer(string):
            return [
                string[i] + string[i + 1]
                for i in range(0,
                               len(string) - 1, 2)
            ]

        tokenized_string = np.array(['aa', 'bb', 'cc', 'dd', 'aa'])
        inverse_vocab = ['aa', 'bb', 'cc', 'dd']
        start_positions = [0, 2, 4, 6, 8]
        positions = [[0, 4], [1], [2], [3]]
        indexed_string = IndexedString(s, tokenizer)

        self.assertTrue(np.array_equal(indexed_string.as_np, tokenized_string))
        self.assertTrue(
            np.array_equal(indexed_string.string_start, start_positions))
        self.assertTrue(indexed_string.inverse_vocab == inverse_vocab)
        self.assertTrue(np.array_equal(indexed_string.positions, positions))
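
Continuing from indexed_string above (a sketch, not in the original test): because bow defaults to True, vocabulary id 0 covers both 'aa' tokens (slots 0 and 4), so removing it should leave 'bbccdd'.

print(indexed_string.inverse_removing([0]))  # expected: 'bbccdd'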
Example #5
    def test_indexed_string_inverse_removing_regex(self):
        s = 'This is a good movie. This is a great movie'
        indexed_string = IndexedString(s)

        self.assertEqual(s, indexed_string.inverse_removing([]))
Example #6
    def data_labels(self, num_samples, classifier_fn, detection=False):
        '''
        Steps of this function:
            1. generate perturbed text features and image features
            2. in a loop, 1) use these features to build perturbed (text, image) pairs,
                          2) run predictions on these pairs and store the results in 'labels'
            3. concatenate text and image features into 'data',
                and append the original input and its prediction
            4. calculate distances

            TODO: add object detection: first run on the original image to create
                    feature components, then run on perturbed images to get the
                    corresponding values

        :param num_samples: size of the neighborhood to learn the linear model
        :param classifier_fn: classification function giving predictions for given texts and images
        :param detection: whether the object detection branch is invoked; defaults to False
        :return: (data, labels, distances, doc_size, n_img_features, segments,
                  domain_mapper, num_object_detection, ori_label, ratio_txt_img)
        '''

        ''' 1. make text features '''
        indexed_string = IndexedString(self.text, bow=True, split_expression=r'\W+', mask_string=None)
        domain_mapper = TextDomainMapper(indexed_string)

        doc_size = indexed_string.num_words()
        sample = self.random_state.randint(1, doc_size + 1, num_samples)                        # num_samples - 1
        data_txt = np.ones((num_samples, doc_size))
        # data[0] = np.ones(doc_size)
        features_range = range(doc_size)
        inverse_data_txt = []

        ''' 1. make image features '''
        random_seed = self.random_state.randint(0, high=1000)
        segmentation_fn = SegmentationAlgorithm('quickshift', kernel_size=4,
                                                max_dist=200, ratio=0.2,
                                                random_seed=random_seed)

        #segmentation_fn = SegmentationAlgorithm('felzenszwalb', scale=200, sigma=2, min_size=100)
        '''segmentation_fn = SegmentationAlgorithm('slic', n_segments=60, compactness=10, sigma=1,
                     start_label=1)'''

        segments = segmentation_fn(self.image)  # get segmentation
        n_img_features = np.unique(segments).shape[0]  # get num of superpixel features
        data_img = self.random_state.randint(0, 2, n_img_features * num_samples).reshape(
            (num_samples, n_img_features))
        data_img_rows = tqdm(data_img)
        imgs = []

        ''' 1. make object detection features 
        if detection:
            predictor, cfg = object_detection_predictor()
            ori_label = object_detection_obtain_label(predictor, cfg, self.image)
            num_object_detection = ori_label.shape[0]
            data_object_detection = np.zeros((num_samples,num_object_detection))'''
        
        # create fudged_image
        fudged_image = self.image.copy()
        for x in np.unique(segments):
            fudged_image[segments == x] = (
                np.mean(self.image[segments == x][:, 0]),
                np.mean(self.image[segments == x][:, 1]),
                np.mean(self.image[segments == x][:, 2]))

        # img_features[0, :] = 1  # the first sample is the full image                                # num_samples

        '''2. create data instances and make predictions'''
        labels = []
        for i, instance in enumerate(zip(sample, data_img_rows)):
            size_txt, row_img = instance

            # make text instance
            inactive = self.random_state.choice(features_range, size_txt,
                                                replace=False)
            data_txt[i, inactive] = 0
            inverse_data_txt.append(indexed_string.inverse_removing(inactive))

            # make image instance
            temp = copy.deepcopy(self.image)
            zeros = np.where(row_img == 0)[0]             # get segment numbers that are turned off in this instance
            mask = np.zeros(segments.shape).astype(bool)
            for zero in zeros:
                mask[segments == zero] = True
            temp[mask] = fudged_image[mask]

            '''if detection:
                label = object_detection_obtain_label(predictor, cfg, temp)
                label_diff = compare_labels(ori_label,label)
                data_object_detection[i] = label_diff'''
            imgs.append(temp)

            # make prediction and append result
            if len(imgs) == 10:
                preds = classifier_fn(self.pred_model, imgs, inverse_data_txt)
                labels.extend(preds)
                imgs = []
                inverse_data_txt = []

        if len(imgs) > 0:
            preds = classifier_fn(self.pred_model, imgs, inverse_data_txt)
            labels.extend(preds)

        '''3. concatenate and append features'''
        data = np.concatenate((data_txt, data_img), axis=1)

        # append the original input to the last
        orig_img_f = np.ones((n_img_features,))
        orig_txt_f = np.ones(doc_size)

        '''if detection:
            data = np.concatenate((data, data_object_detection),axis=1)
            orig_ot = np.ones(num_object_detection)
            data = np.vstack((data, np.concatenate((np.concatenate((orig_txt_f, orig_img_f)),orig_ot))))
        else:'''
        data = np.vstack((data, np.ones(data.shape[1])))
            
        labels.extend(classifier_fn(self.pred_model, [self.image], [self.text]))


        '''4. compute distances:
            use Platt scaling to get the relative importance of the text and image modalities
        '''

        labels = np.array(labels, dtype=float)

        # Modify MMF source code to zero out image / text attributes
        #dummy_label_image = np.array(classifier_fn([self.image], [self.text], zero_text=True))  # zero out text
        #dummy_label_text = np.array(classifier_fn([self.image], [self.text], zero_image=True))  # zero out image

        # perform calibration
        try:
            labels_for_calib = np.array(labels[:, 0] < 0.5, dtype=float)
            calibrated = CalibratedClassifierCV(cv=3)
            calibrated.fit(data[:,:doc_size + n_img_features], labels_for_calib)

            calib_data = np.ones((3, doc_size + n_img_features), dtype=float)
            calib_data[0][:doc_size] = 0        # zero out text
            calib_data[1][doc_size:] = 0        # zero out image
            calibrated_labels = calibrated.predict_proba(calib_data)

            delta_txt = abs(calibrated_labels[-1][0] - calibrated_labels[0][0])
            delta_img = abs(calibrated_labels[-1][0] - calibrated_labels[1][0])

            ratio_txt_img = max(min(10, delta_txt/delta_img), 0.1)
        except Exception:
            dummy_text = ""
            dummy_image = np.zeros_like(self.image)
            try:
                label_text_out = np.array(classifier_fn(self.pred_model, [self.image], [self.text], zero_text=True))  # zero out text
                label_image_out = np.array(classifier_fn(self.pred_model, [self.image], [self.text], zero_image=True))  # zero out image
            except Exception:
                label_text_out = np.array(classifier_fn(self.pred_model, [self.image], [dummy_text]))
                label_image_out = np.array(classifier_fn(self.pred_model, [dummy_image], [self.text]))

            delta_txt = abs(labels[-1][0] - label_text_out[0][0])
            delta_img = abs(labels[-1][0] - label_image_out[0][0])
            ratio_txt_img = max(min(10, delta_txt / delta_img), 0.1)

        # calculate distances
        distances_img = sklearn.metrics.pairwise_distances(
            data[:, doc_size:],
            data[-1, doc_size:].reshape(1, -1),
            metric='cosine'
        ).ravel()

        def distance_fn(x):
            return sklearn.metrics.pairwise.pairwise_distances(
                x, x[-1], metric='cosine').ravel()

        distances_txt = distance_fn(sp.sparse.csr_matrix(data[:, :doc_size]))

        distances = 1/(1 + ratio_txt_img) * distances_img + (1 - 1/(1 + ratio_txt_img)) * distances_txt

        # As required by lime_base, make the first element of data, labels, distances the original data point
        data[0] = data[-1]
        labels[0] = labels[-1]
        distances[0] = distances[-1]

        '''if not detection:'''
        num_object_detection = 0
        ori_label = None

        return data, labels, distances, doc_size, n_img_features, \
            segments, domain_mapper, num_object_detection, ori_label, ratio_txt_img
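
The final blend above is a convex combination of the two distance vectors; a toy sketch with made-up numbers (the weights, not the data, are the point): ratio_txt_img > 1 means text moves the prediction more than the image, so the text distances get the larger weight.

import numpy as np

ratio_txt_img = 4.0
distances_img = np.array([0.2, 0.5])
distances_txt = np.array([0.1, 0.9])
w_img = 1 / (1 + ratio_txt_img)  # 0.2
distances = w_img * distances_img + (1 - w_img) * distances_txt
print(distances)                 # [0.12 0.82]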
Example #7
    def explain_instance(self,
                         text_instance,
                         classifier_fn,
                         labels=(1, ),
                         top_labels=None,
                         num_features=25,
                         num_samples=5000,
                         distance_metric='cosine',
                         model_regressor=None):
        """Generates explanations for a prediction.

        First, we generate neighborhood data by randomly hiding features from
        the instance (see __data_labels_distance_mapping). We then learn
        locally weighted linear models on this neighborhood data to explain
        each of the classes in an interpretable way (see lime_base.py).

        Args:
            text_instance: raw tabular data to be explained.
            classifier_fn: classifier prediction probability function, which
                takes a list of d strings and outputs a (d, k) numpy array with
                prediction probabilities, where k is the number of classes.
                For ScikitClassifiers, this is classifier.predict_proba.
            labels: iterable with labels to be explained.
            top_labels: if not None, ignore labels and produce explanations for
                the K labels with highest prediction probabilities, where K is
                this parameter.
            num_features: maximum number of features present in explanation
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for sample weighting,
                defaults to cosine similarity
            model_regressor: sklearn regressor to use in explanation. Defaults
                to Ridge regression in LimeBase. Must have model_regressor.coef_
                and 'sample_weight' as a parameter to model_regressor.fit()
        Returns:
            An Explanation object (see explanation.py) with the corresponding
            explanations.
        """

        indexed_string = (IndexedCharacters(
            text_instance, bow=self.bow, mask_string=self.mask_string)
                          if self.char_level else IndexedString(
                              text_instance,
                              bow=self.bow,
                              split_expression=self.split_expression,
                              mask_string=self.mask_string))
        domain_mapper = TextDomainMapper(
            indexed_string)  # change table data to string
        data, yss, distances = self.__data_labels_distances(
            indexed_string,
            classifier_fn,
            num_samples,
            distance_metric=distance_metric)
        print('data shape: ', data.shape)  #(5000, 63)
        print('yss shape: ', yss.shape)  #(5000,2)
        print('distances shape: ', distances.shape)  #(5000,)
        if self.class_names is None:
            self.class_names = [str(x) for x in range(yss[0].shape[0])]
        ret_exp = explanation.Explanation(domain_mapper=domain_mapper,
                                          class_names=self.class_names,
                                          random_state=self.random_state)
        ret_exp.predict_proba = yss[0]
        if top_labels:
            labels = np.argsort(yss[0])[-top_labels:]
            ret_exp.top_labels = list(labels)
            ret_exp.top_labels.reverse()
        for label in labels:
            (ret_exp.intercept[label], ret_exp.local_exp[label], ret_exp.score,
             ret_exp.local_pred) = self.base.explain_instance_with_data(
                 data,
                 yss,
                 distances,
                 label,
                 num_features,
                 model_regressor=model_regressor,
                 feature_selection=self.feature_selection)
        return ret_exp
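
A hedged usage sketch against the stock lime API (LimeTextExplainer; predict_proba below is a hypothetical stand-in for a real model, not something defined in this file):

import numpy as np
from lime.lime_text import LimeTextExplainer

def predict_proba(texts):
    # hypothetical model: longer texts lean towards class 'pos'
    p = np.array([min(len(t) / 50.0, 1.0) for t in texts])
    return np.column_stack([1 - p, p])

explainer = LimeTextExplainer(class_names=['neg', 'pos'])
exp = explainer.explain_instance('This is a great movie.',
                                 predict_proba, num_features=6)
print(exp.as_list())  # [(word, weight), ...]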
Example #8
    def data_labels(self, num_samples, classifier_fn, detection=False):
        """
        Steps of this function:
            1. generate perturbed text features and image features
            2. in a loop, 1) use these features to build perturbed (text, image) pairs,
                          2) run predictions on these pairs and store the results in 'labels'
            3. concatenate text and image features into 'data',
                and append the original input and its prediction
            4. calculate distances
            Arguments:
                classifier_fn: classification function giving predictions for given texts and images
                num_samples: size of the neighborhood to learn the linear model
                detection: whether the object detection branch is invoked; defaults to False
            Returns:
                data: dense num_samples * num_superpixels array
                labels: prediction probability matrix
                distances: combined distance using the text/image ratio, where the
                    text and image distances are cosine distances between the
                    original instance and each perturbed instance (computed on
                    the binary 'data' matrix)
                doc_size: number of words in the indexed string
                n_img_features: number of superpixel features to include in the explanation
                segments: 2d numpy array with the output from skimage.segmentation
                domain_mapper: maps text feature ids to words or word positions
                num_object_detection: number of detected objects to include in the explanation
                ori_label: numpy array containing the objects detected in the original image
                ratio_txt_img: weight ratio between text and image features
        """

        """ 1. make text features """
        indexed_string = IndexedString(
            self.text, bow=True, split_expression=r"\W+", mask_string=None
        )
        domain_mapper = TextDomainMapper(indexed_string)

        doc_size = indexed_string.num_words()
        sample = self.random_state.randint(
            1, doc_size + 1, num_samples
        )  # num_samples - 1
        data_txt = np.ones((num_samples, doc_size))
        # data[0] = np.ones(doc_size)
        features_range = range(doc_size)
        inverse_data_txt = []

        """ 1. make image features """
        random_seed = self.random_state.randint(0, high=1000)
        segmentation_fn = SegmentationAlgorithm(
            "quickshift",
            kernel_size=4,
            max_dist=200,
            ratio=0.2,
            random_seed=random_seed,
        )

        # segmentation_fn = SegmentationAlgorithm('felzenszwalb', scale=200, sigma=2, min_size=100)
        """segmentation_fn = SegmentationAlgorithm('slic', n_segments=60, compactness=10, sigma=1,
                     start_label=1)"""

        segments = segmentation_fn(self.image)  # get segmentation
        n_img_features = np.unique(segments).shape[0]  # get num of superpixel features
        data_img = self.random_state.randint(
            0, 2, n_img_features * num_samples
        ).reshape((num_samples, n_img_features))
        data_img_rows = tqdm(data_img)
        imgs = []

        """ 1. make object detection features 
        if detection:
            predictor, cfg = object_detection_predictor()
            ori_label = object_detection_obtain_label(predictor, cfg, self.image)
            num_object_detection = ori_label.shape[0]
            data_object_detection = np.zeros((num_samples,num_object_detection))"""

        # create fudged_image
        fudged_image = self.image.copy()
        for x in np.unique(segments):
            fudged_image[segments == x] = (
                np.mean(self.image[segments == x][:, 0]),
                np.mean(self.image[segments == x][:, 1]),
                np.mean(self.image[segments == x][:, 2]),
            )

        # img_features[0, :] = 1  # the first sample is the full image                                # num_samples

        """2. create data instances and make predictions"""
        labels = []
        for i, instance in enumerate(zip(sample, data_img_rows)):
            size_txt, row_img = instance

            # make text instance
            inactive = self.random_state.choice(features_range, size_txt, replace=False)
            data_txt[i, inactive] = 0
            inverse_data_txt.append(indexed_string.inverse_removing(inactive))

            # make image instance
            temp = copy.deepcopy(self.image)
            zeros = np.where(row_img == 0)[
                0
            ]  # get segment numbers that are turned off in this instance
            mask = np.zeros(segments.shape).astype(bool)
            for zero in zeros:
                mask[segments == zero] = True
            temp[mask] = fudged_image[mask]

            """if detection:
                label = object_detection_obtain_label(predictor, cfg, temp)
                label_diff = compare_labels(ori_label,label)
                data_object_detection[i] = label_diff"""
            imgs.append(temp)

            # make prediction and append result
            if len(imgs) == 10:
                preds = classifier_fn(self.pred_model, imgs, inverse_data_txt)
                labels.extend(preds)
                imgs = []
                inverse_data_txt = []

        if len(imgs) > 0:
            preds = classifier_fn(self.pred_model, imgs, inverse_data_txt)
            labels.extend(preds)

        """3. concatenate and append features"""
        data = np.concatenate((data_txt, data_img), axis=1)

        # append the original input to the last
        orig_img_f = np.ones((n_img_features,))
        orig_txt_f = np.ones(doc_size)

        """if detection:
            data = np.concatenate((data, data_object_detection),axis=1)
            orig_ot = np.ones(num_object_detection)
            data = np.vstack((data, np.concatenate((np.concatenate((orig_txt_f, orig_img_f)),orig_ot))))
        else:"""
        data = np.vstack((data, np.ones(data.shape[1])))

        labels.extend(classifier_fn(self.pred_model, [self.image], [self.text]))

        """4. compute distance# distances[:, :(doc_size-1)] *= 100
            use platt scaling t get relative importance of text and image modalities
        """

        labels = np.array(labels, dtype=float)

        # Modify MMF source code to zero out image / text attributes
        # dummy_label_image = np.array(classifier_fn([self.image], [self.text], zero_text=True))  # zero out text
        # dummy_label_text = np.array(classifier_fn([self.image], [self.text], zero_image=True))  # zero out image

        # perform calibration
        try:
            labels_for_calib = np.array(labels[:, 0] < 0.5, dtype=float)
            calibrated = CalibratedClassifierCV(cv=3)
            calibrated.fit(data[:, : doc_size + n_img_features], labels_for_calib)

            calib_data = np.ones((3, doc_size + n_img_features), dtype=float)
            calib_data[0][:doc_size] = 0  # zero out text
            calib_data[1][doc_size:] = 0  # zero out image
            calibrated_labels = calibrated.predict_proba(calib_data)

            delta_txt = abs(calibrated_labels[-1][0] - calibrated_labels[0][0])
            delta_img = abs(calibrated_labels[-1][0] - calibrated_labels[1][0])

            ratio_txt_img = max(min(100, delta_txt / delta_img), 0.01)
        except Exception:
            dummy_text = ""
            dummy_image = np.zeros_like(self.image)
            label_text_out = np.array(
                classifier_fn(
                    self.pred_model, [self.image], [self.text], zero_text=True
                )
            )  # zero out text
            label_image_out = np.array(
                classifier_fn(
                    self.pred_model, [self.image], [self.text], zero_image=True
                )
            )  # zero out image

            delta_txt = abs(labels[-1][0] - label_text_out[0][0])
            delta_img = abs(labels[-1][0] - label_image_out[0][0])
            ratio_txt_img = max(min(10, delta_txt / delta_img), 0.1)

        # calculate distances
        distances_img = sklearn.metrics.pairwise_distances(
            data[:, doc_size:], data[-1, doc_size:].reshape(1, -1), metric="cosine"
        ).ravel()

        def distance_fn(x):
            return sklearn.metrics.pairwise.pairwise_distances(
                x, x[-1], metric="cosine"
            ).ravel()

        distances_txt = distance_fn(sp.sparse.csr_matrix(data[:, :doc_size]))

        distances = (
            1 / (1 + ratio_txt_img) * distances_img
            + (1 - 1 / (1 + ratio_txt_img)) * distances_txt
        )

        # As required by lime_base, make the first element of data, labels, distances the original data point
        data[0] = data[-1]
        labels[0] = labels[-1]
        distances[0] = distances[-1]

        """if not detection:"""
        num_object_detection = 0
        ori_label = None

        return (
            data,
            labels,
            distances,
            doc_size,
            n_img_features,
            segments,
            domain_mapper,
            num_object_detection,
            ori_label,
            ratio_txt_img,
        )
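
To isolate the calibration probe used above, a self-contained sketch with toy shapes (scikit-learn's CalibratedClassifierCV; the 5 'text' + 3 'image' column split is invented for illustration):

import numpy as np
from sklearn.calibration import CalibratedClassifierCV

rng = np.random.RandomState(0)
doc_size, n_img_features = 5, 3
X = rng.randint(0, 2, size=(60, doc_size + n_img_features)).astype(float)
y = (X[:, 0] < 0.5).astype(float)  # toy binary labels

calibrated = CalibratedClassifierCV(cv=3)
calibrated.fit(X, y)

probes = np.ones((3, doc_size + n_img_features))
probes[0][:doc_size] = 0  # zero out "text"
probes[1][doc_size:] = 0  # zero out "image"
p = calibrated.predict_proba(probes)

delta_txt = abs(p[-1][0] - p[0][0])
delta_img = abs(p[-1][0] - p[1][0])
print(max(min(100, delta_txt / max(delta_img, 1e-12)), 0.01))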