Example #1
    def generate(self,
                 example: JsonDict,
                 model: lit_model.Model,
                 dataset: lit_dataset.Dataset,
                 config: Optional[JsonDict] = None,
                 num_examples: int = 1) -> List[JsonDict]:
        """Use gradient to find/substitute the token with largest impact on loss."""
        # TODO(lit-team): This function is quite long. Consider breaking it
        # into small functions.
        del dataset  # Unused.

        assert model is not None, "Please provide a model for this generator."
        logging.info(r"W3lc0m3 t0 H0tFl1p \o/")
        logging.info("Original example: %r", example)

        # Find the classification prediction key.
        pred_keys = self.find_fields(model.output_spec(),
                                     types.MulticlassPreds, None)
        if len(pred_keys) == 0:  # pylint: disable=g-explicit-length-test
            # TODO(ataly): Add support for regression models.
            logging.warning("The model does not have a classification head."
                            "Cannot use HotFlip. :-(")
            return []  # Cannot generate examples.
        if len(pred_keys) > 1:
            # TODO(ataly): Use a config argument when there are multiple prediction
            # heads.
            logging.warning("Multiple classification heads found."
                            "Cannot use HotFlip. :-(")
            return []  # Cannot generate examples.
        pred_key = pred_keys[0]

        # Find gradient fields to use for HotFlip.
        input_spec = model.input_spec()
        output_spec = model.output_spec()
        grad_fields = self.find_fields(output_spec, types.TokenGradients,
                                       types.Tokens)
        logging.info("Found gradient fields for HotFlip use: %s",
                     str(grad_fields))
        if len(grad_fields) == 0:  # pylint: disable=g-explicit-length-test
            logging.info("No gradient fields found. Cannot use HotFlip. :-(")
            return []  # Cannot generate examples without gradients.

        # Get model outputs.
        logging.info(
            "Performing a forward/backward pass on the input example.")
        orig_output = list(model.predict([example]))[0]
        logging.info("Model outputs: %s", list(orig_output.keys()))

        # Get model word embeddings and vocab.
        inv_vocab, embed = model.get_embedding_table()
        assert len(
            inv_vocab) == embed.shape[0], "Vocab/embeddings size mismatch."
        logging.info("Vocab size: %d, Embedding size: %r", len(inv_vocab),
                     embed.shape)

        # Get the original prediction class.
        orig_probabilities = orig_output[pred_key]
        orig_prediction = np.argmax(orig_probabilities)

        # Perform a flip in each sequence for which we have gradients (separately).
        # Each sequence may give rise to multiple new examples, depending on how
        # many words we flip.
        # TODO(lit-team): make configurable how many new examples are desired.
        # TODO(lit-team): use only 1 sequence as input (configurable in UI).
        new_examples = []
        for grad_field in grad_fields:
            # Get the tokens and their gradient vectors.
            token_field = output_spec[grad_field].align  # pytype: disable=attribute-error
            tokens = orig_output[token_field]
            grads = orig_output[grad_field]
            token_emb_fields = self.find_fields(output_spec,
                                                types.TokenEmbeddings,
                                                types.Tokens)
            assert len(token_emb_fields) == 1, (
                "Expected exactly one token embeddings field.")
            token_embs = orig_output[token_emb_fields[0]]

            # Identify the token with the largest gradient attribution,
            # defined as the dot product between the token embedding and gradient
            # of the output wrt the embedding.
            assert token_embs.shape[0] == grads.shape[0]
            token_grad_attrs = np.sum(token_embs * grads, axis=-1)
            # Get a list of indices of input tokens, sorted by gradient attribution,
            # highest first. We will flip tokens in this order.
            sorted_by_grad_attrs = np.argsort(token_grad_attrs)[::-1]
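            # Shape note (inferred from the operations above): token_embs and
            # grads are both [num_tokens, emb_dim], token_grad_attrs is
            # [num_tokens], and argsort()[::-1] orders token positions from
            # most to least influential.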

            for i in range(min(num_examples, len(tokens))):
                token_id = sorted_by_grad_attrs[i]
                logging.info(
                    "Selected token: %s (pos=%d) with gradient attribution %f",
                    tokens[token_id], token_id, token_grad_attrs[token_id])
                token_grad = grads[token_id]

                # Take the dot product with all word embeddings and get the
                # smallest value. (We are looking for a replacement token that
                # will lower the score of the current class, thereby increasing
                # the chances of a label flip.)
                # TODO(lit-team): Can add criteria to the winner e.g. cosine distance.
                scores = np.dot(embed, token_grad)
                winner = np.argmin(scores)
                logging.info(
                    "Replacing [%s] (pos=%d) with option %d: [%s] (id=%d)",
                    tokens[token_id], token_id, i, inv_vocab[winner], winner)

                # Create a new input to the model.
                # TODO(iftenney, bastings): enforce somewhere that this field has the
                # same name in the input and output specs.
                input_token_field = token_field
                input_text_field = input_spec[input_token_field].parent  # pytype: disable=attribute-error
                new_example = copy.deepcopy(example)
                modified_tokens = copy.copy(tokens)
                modified_tokens[token_id] = inv_vocab[winner]
                new_example[input_token_field] = modified_tokens
                # TODO(iftenney, bastings): call a model-provided detokenizer here?
                # Though in general tokenization isn't invertible and it's possible for
                # HotFlip to produce wordpiece sequences that don't correspond to any
                # input string.
                new_example[input_text_field] = " ".join(modified_tokens)

                # Predict a new label for this example.
                new_output = list(model.predict([new_example]))[0]

                # Update label if multi-class prediction.
                # TODO(lit-dev): provide a general system for handling labels on
                # generated examples.
                probabilities = new_output[pred_key]
                new_prediction = np.argmax(probabilities)
                label_key = cast(types.MulticlassPreds,
                                 output_spec[pred_key]).parent
                label_names = cast(types.MulticlassPreds,
                                   output_spec[pred_key]).vocab
                new_label = label_names[new_prediction]
                new_example[label_key] = new_label
                logging.info("Updated example with new label: %s", new_label)

                if new_prediction != orig_prediction:
                    # HotFlip found.
                    new_examples.append(new_example)
                else:
                    # Use new_example as the new base example and continue
                    # with more token flips.
                    example = new_example
                    tokens = modified_tokens
        return new_examples
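
A minimal sketch of how this generator might be driven, assuming a hypothetical HotFlip class exposing the generate method above and a LIT-compatible model and dataset (the class name, the "sentence" field, and the objects here are illustrative, not taken from the example):

    # Hypothetical driver; HotFlip, my_model, my_dataset, and the "sentence"
    # field are placeholders and must match the model's input spec.
    generator = HotFlip()
    counterfactuals = generator.generate(
        example={"sentence": "a truly great movie"},
        model=my_model,
        dataset=my_dataset,  # unused by generate(); passed for API parity
        num_examples=3)
    for cf in counterfactuals:
        print(cf)
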
Example #2
    def generate(self,
                 example: JsonDict,
                 model: lit_model.Model,
                 dataset: lit_dataset.Dataset,
                 config: Optional[JsonDict] = None,
                 num_examples: int = 1) -> List[JsonDict]:
        """Use gradient to find/substitute the token with largest impact on loss."""
        del dataset  # Unused.

        assert model is not None, "Please provide a model for this generator."
        logging.info(r"W3lc0m3 t0 H0tFl1p \o/")
        logging.info("Original example: %r", example)

        # Find gradient fields to use for HotFlip.
        input_spec = model.input_spec()
        output_spec = model.output_spec()
        grad_fields = self.find_fields(output_spec)
        logging.info("Found gradient fields for HotFlip use: %s",
                     str(grad_fields))
        if len(grad_fields) == 0:  # pylint: disable=g-explicit-length-test
            logging.info("No gradient fields found. Cannot use HotFlip. :-(")
            return []  # Cannot generate examples without gradients.

        # Get model outputs.
        logging.info(
            "Performing a forward/backward pass on the input example.")
        model_output = model.predict_single(example)
        logging.info("Model outputs: %s", list(model_output.keys()))

        # Get model word embeddings and vocab.
        inv_vocab, embed = model.get_embedding_table()
        assert len(
            inv_vocab) == embed.shape[0], "Vocab/embeddings size mismatch."
        logging.info("Vocab size: %d, Embedding size: %r", len(inv_vocab),
                     embed.shape)

        # Perform a flip in each sequence for which we have gradients (separately).
        # Each sequence may give rise to multiple new examples, depending on how
        # many words we flip.
        # TODO(lit-team): make configurable how many new examples are desired.
        # TODO(lit-team): use only 1 sequence as input (configurable in UI).
        new_examples = []
        for grad_field in grad_fields:

            # Get the tokens and their gradient vectors.
            token_field = output_spec[grad_field].align  # pytype: disable=attribute-error
            tokens = model_output[token_field]
            grads = model_output[grad_field]

            # Identify the token with the largest gradient norm.
            # TODO(lit-team): consider normalizing across all grad fields or just
            # across each one individually.
            grad_norm = np.linalg.norm(grads, axis=1)
            # Normalize so the norms sum to 1, matching the scale of gradient
            # attribution values.
            grad_norm = grad_norm / np.sum(grad_norm)

            # Get a list of indices of input tokens, sorted by norm, highest first.
            sorted_by_grad_norm = np.argsort(grad_norm)[::-1]

            for i in range(min(num_examples, len(tokens))):
                token_id = sorted_by_grad_norm[i]
                logging.info(
                    "Selected token: %s (pos=%d) with gradient norm %f",
                    tokens[token_id], token_id, grad_norm[token_id])
                token_grad = grads[token_id]

                # Take the dot product with all word embeddings and get the
                # largest value.
                scores = np.dot(embed, token_grad)

                # TODO(lit-team): Can add criteria to the winner e.g. cosine distance.
                winner = np.argmax(scores)
                logging.info(
                    "Replacing [%s] (pos=%d) with option %d: [%s] (id=%d)",
                    tokens[token_id], token_id, i, inv_vocab[winner], winner)

                # Create a new input to the model.
                # TODO(iftenney, bastings): enforce somewhere that this field has the
                # same name in the input and output specs.
                input_token_field = token_field
                input_text_field = input_spec[input_token_field].parent  # pytype: disable=attribute-error
                new_example = copy.deepcopy(example)
                modified_tokens = copy.copy(tokens)
                modified_tokens[token_id] = inv_vocab[winner]
                new_example[input_token_field] = modified_tokens
                # TODO(iftenney, bastings): call a model-provided detokenizer here?
                # Though in general tokenization isn't invertible and it's possible for
                # HotFlip to produce wordpiece sequences that don't correspond to any
                # input string.
                new_example[input_text_field] = " ".join(modified_tokens)

                # Predict a new label for this example.
                new_output = model.predict_single(new_example)

                # Update label if multi-class prediction.
                # TODO(lit-dev): provide a general system for handling labels on
                # generated examples.
                for pred_key, pred_type in model.output_spec().items():
                    if isinstance(pred_type, types.MulticlassPreds):
                        probabilities = new_output[pred_key]
                        prediction = np.argmax(probabilities)
                        label_key = output_spec[pred_key].parent
                        label_names = output_spec[pred_key].vocab
                        new_label = label_names[prediction]
                        new_example[label_key] = new_label
                        logging.info("Updated example with new label: %s",
                                     new_label)

                new_examples.append(new_example)

        return new_examples
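
The main difference from Example #1 is the token-selection criterion: Example #1 ranks tokens by the embedding-gradient dot product and picks the vocabulary entry that minimizes the score, while this variant ranks tokens by normalized gradient norm and picks the entry that maximizes it. A self-contained sketch of the two rules with random placeholder arrays (shapes inferred from the code above):

    import numpy as np

    rng = np.random.default_rng(0)
    grads = rng.normal(size=(4, 8))       # [num_tokens, emb_dim]
    token_embs = rng.normal(size=(4, 8))  # [num_tokens, emb_dim]
    embed = rng.normal(size=(100, 8))     # [vocab_size, emb_dim]

    # Example #1: rank by embedding-gradient dot product; argmin over vocab.
    attrs = np.sum(token_embs * grads, axis=-1)
    pos = np.argsort(attrs)[::-1][0]
    winner_1 = np.argmin(np.dot(embed, grads[pos]))

    # Example #2: rank by normalized gradient norm; argmax over vocab.
    norms = np.linalg.norm(grads, axis=1)
    norms = norms / np.sum(norms)
    pos = np.argsort(norms)[::-1][0]
    winner_2 = np.argmax(np.dot(embed, grads[pos]))
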
Example #3
    def generate(self,
                 example: JsonDict,
                 model: lit_model.Model,
                 dataset: lit_dataset.Dataset,
                 config: Optional[JsonDict] = None) -> List[JsonDict]:
        """Identify minimal sets of token flips that alter the prediction."""
        del dataset  # Unused.

        config = config or {}
        num_examples = int(config.get(NUM_EXAMPLES_KEY, NUM_EXAMPLES_DEFAULT))
        max_flips = int(config.get(MAX_FLIPS_KEY, MAX_FLIPS_DEFAULT))
        tokens_to_ignore = config.get(TOKENS_TO_IGNORE_KEY,
                                      TOKENS_TO_IGNORE_DEFAULT)
        pred_key = config.get(PREDICTION_KEY, "")
        regression_thresh = float(
            config.get(REGRESSION_THRESH_KEY, REGRESSION_THRESH_DEFAULT))
        assert model is not None, "Please provide a model for this generator."

        input_spec = model.input_spec()
        output_spec = model.output_spec()
        assert pred_key, "Please provide the prediction key"
        assert pred_key in output_spec, "Invalid prediction key"

        is_regression = False
        if isinstance(output_spec[pred_key], types.RegressionScore):
            is_regression = True
        else:
            assert isinstance(output_spec[pred_key], types.MulticlassPreds), (
                "Only classification or regression models are supported")
        logging.info(r"W3lc0m3 t0 H0tFl1p \o/")
        logging.info("Original example: %r", example)

        # Get model outputs.
        orig_output = list(model.predict([example]))[0]

        # Check config for selected fields.
        selected_fields = list(config.get(FIELDS_TO_HOTFLIP_KEY, []))
        if not selected_fields:
            return []

        # Get tokens (corresponding to each text input field) and corresponding
        # gradients.
        tokens_and_gradients = self._get_tokens_and_gradients(
            input_spec, output_spec, orig_output, selected_fields)
        assert tokens_and_gradients, (
            "No token fields found. Cannot use HotFlip. :-(")

        # Copy tokens into input example.
        example = copy.deepcopy(example)
        for token_field, v in tokens_and_gradients.items():
            tokens, _ = v
            example[token_field] = tokens

        inv_vocab, embedding_matrix = model.get_embedding_table()
        assert len(inv_vocab) == embedding_matrix.shape[0], (
            "Vocab/embeddings size mismatch.")

        successful_cfs = []
        # TODO(lit-team): use only 1 sequence as input (configurable in UI).
        # TODO(lit-team): Refactor the following code so that it's not so deeply
        # nested (and easier to track loop state).
        for token_field, v in tokens_and_gradients.items():
            tokens, grads = v
            text_field = input_spec[token_field].parent  # pytype: disable=attribute-error
            logging.info("Identifying Hotflips for input field: %s",
                         str(text_field))
            direction = -1
            if is_regression:
                # We want the replacements to increase the prediction score if the
                # original score is below the threshold, and decrease otherwise.
                direction = (1 if orig_output[pred_key] <= regression_thresh
                             else -1)
            replacement_tokens = self._get_replacement_tokens(
                embedding_matrix, inv_vocab, grads, direction)

            successful_positions = []
            for token_idxs in self._gen_token_idxs_to_flip(
                    tokens, grads, max_flips, tokens_to_ignore):
                if len(successful_cfs) >= num_examples:
                    return successful_cfs
                # If a subset of the set of tokens have already been successful in
                # obtaining a flip, we continue. This ensures that we only consider
                # sets of token flips that are minimal.
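                # E.g., if flipping position {2} alone already flipped the
                # prediction, supersets such as {2, 5} are skipped.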
                if self._subset_exists(set(token_idxs), successful_positions):
                    continue

                # Create counterfactual.
                cf = self._create_cf(example, token_field, text_field, tokens,
                                     token_idxs, replacement_tokens)
                # Obtain model prediction.
                cf_output = list(model.predict([cf]))[0]

                if cf_utils.is_prediction_flip(cf_output, orig_output,
                                               output_spec, pred_key,
                                               regression_thresh):
                    # Prediction flip found!
                    cf_utils.update_prediction(cf, cf_output, output_spec,
                                               pred_key)
                    successful_cfs.append(cf)
                    successful_positions.append(set(token_idxs))
        return successful_cfs
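
Because this variant reads its settings from config, a caller must at minimum supply the prediction key and the fields to flip. A hedged sketch, reusing the module-level *_KEY constants referenced in the code (the generator class, model, prediction key, and field names are placeholders):

    # Hypothetical invocation; HotFlip, my_model, "probas", and
    # "tokens_sentence" are placeholders, while the *_KEY constants are the
    # ones the generate method reads from config.
    config = {
        PREDICTION_KEY: "probas",           # must exist in model.output_spec()
        FIELDS_TO_HOTFLIP_KEY: ["tokens_sentence"],
        NUM_EXAMPLES_KEY: 5,
        MAX_FLIPS_KEY: 3,
        TOKENS_TO_IGNORE_KEY: ["[CLS]", "[SEP]"],
    }
    cfs = HotFlip().generate(example, my_model, dataset=None, config=config)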