def generate(self,
             example: JsonDict,
             model: lit_model.Model,
             dataset: lit_dataset.Dataset,
             config: Optional[JsonDict] = None,
             num_examples: int = 1) -> List[JsonDict]:
  """Use gradient to find/substitute the token with largest impact on loss."""
  # TODO(lit-team): This function is quite long. Consider breaking it
  # into small functions.
  del dataset  # Unused.
  assert model is not None, "Please provide a model for this generator."

  logging.info(r"W3lc0m3 t0 H0tFl1p \o/")
  logging.info("Original example: %r", example)

  # Find classification prediction key.
  pred_keys = self.find_fields(model.output_spec(),
                               types.MulticlassPreds, None)
  if len(pred_keys) == 0:  # pylint: disable=g-explicit-length-test
    # TODO(ataly): Add support for regression models.
    logging.warning("The model does not have a classification head. "
                    "Cannot use HotFlip. :-(")
    return []  # Cannot generate examples.
  if len(pred_keys) > 1:
    # TODO(ataly): Use a config argument when there are multiple prediction
    # heads.
    logging.warning("Multiple classification heads found. "
                    "Cannot use HotFlip. :-(")
    return []  # Cannot generate examples.
  pred_key = pred_keys[0]

  # Find gradient fields to use for HotFlip.
  input_spec = model.input_spec()
  output_spec = model.output_spec()
  grad_fields = self.find_fields(output_spec, types.TokenGradients,
                                 types.Tokens)
  logging.info("Found gradient fields for HotFlip use: %s", str(grad_fields))
  if len(grad_fields) == 0:  # pylint: disable=g-explicit-length-test
    logging.info("No gradient fields found. Cannot use HotFlip. :-(")
    return []  # Cannot generate examples without gradients.

  # Get model outputs.
  logging.info("Performing a forward/backward pass on the input example.")
  orig_output = list(model.predict([example]))[0]
  logging.info(orig_output.keys())

  # Get model word embeddings and vocab.
  inv_vocab, embed = model.get_embedding_table()
  assert len(inv_vocab) == embed.shape[0], "Vocab/embeddings size mismatch."
  logging.info("Vocab size: %d, Embedding size: %r", len(inv_vocab),
               embed.shape)

  # Get original prediction class.
  orig_probabilities = orig_output[pred_key]
  orig_prediction = np.argmax(orig_probabilities)

  # Perform a flip in each sequence for which we have gradients (separately).
  # Each sequence may give rise to multiple new examples, depending on how
  # many words we flip.
  # TODO(lit-team): make configurable how many new examples are desired.
  # TODO(lit-team): use only 1 sequence as input (configurable in UI).
  new_examples = []
  for grad_field in grad_fields:
    # Get the tokens and their gradient vectors.
    token_field = output_spec[grad_field].align  # pytype: disable=attribute-error
    tokens = orig_output[token_field]
    grads = orig_output[grad_field]
    token_emb_fields = self.find_fields(output_spec, types.TokenEmbeddings,
                                        types.Tokens)
    assert len(token_emb_fields) == 1, "Found multiple token embeddings"
    token_embs = orig_output[token_emb_fields[0]]

    # Identify the token with the largest gradient attribution,
    # defined as the dot product between the token embedding and gradient
    # of the output wrt the embedding.
    assert token_embs.shape[0] == grads.shape[0]
    token_grad_attrs = np.sum(token_embs * grads, axis=-1)

    # Get a list of indices of input tokens, sorted by gradient attribution,
    # highest first. We will flip tokens in this order.
    sorted_by_grad_attrs = np.argsort(token_grad_attrs)[::-1]

    for i in range(min(num_examples, len(tokens))):
      token_id = sorted_by_grad_attrs[i]
      logging.info("Selected token: %s (pos=%d) with gradient attribution %f",
                   tokens[token_id], token_id, token_grad_attrs[token_id])
      token_grad = grads[token_id]

      # Take dot product with all word embeddings. Get smallest value.
      # (We are looking for a replacement token that will lower the score
      # of the current class, thereby increasing the chances of a label
      # flip.)
      # TODO(lit-team): Can add criteria to the winner, e.g. cosine distance.
      scores = np.dot(embed, token_grad)
      winner = np.argmin(scores)
      logging.info("Replacing [%s] (pos=%d) with option %d: [%s] (id=%d)",
                   tokens[token_id], token_id, i, inv_vocab[winner], winner)

      # Create a new input to the model.
      # TODO(iftenney, bastings): enforce somewhere that this field has the
      # same name in the input and output specs.
      input_token_field = token_field
      input_text_field = input_spec[input_token_field].parent  # pytype: disable=attribute-error
      new_example = copy.deepcopy(example)
      modified_tokens = copy.copy(tokens)
      modified_tokens[token_id] = inv_vocab[winner]
      new_example[input_token_field] = modified_tokens
      # TODO(iftenney, bastings): call a model-provided detokenizer here?
      # Though in general tokenization isn't invertible and it's possible for
      # HotFlip to produce wordpiece sequences that don't correspond to any
      # input string.
      new_example[input_text_field] = " ".join(modified_tokens)

      # Predict a new label for this example.
      new_output = list(model.predict([new_example]))[0]

      # Update label if multi-class prediction.
      # TODO(lit-dev): provide a general system for handling labels on
      # generated examples.
      probabilities = new_output[pred_key]
      new_prediction = np.argmax(probabilities)
      label_key = cast(types.MulticlassPreds, output_spec[pred_key]).parent
      label_names = cast(types.MulticlassPreds, output_spec[pred_key]).vocab
      new_label = label_names[new_prediction]
      new_example[label_key] = new_label
      logging.info("Updated example with new label: %s", new_label)

      if new_prediction != orig_prediction:
        # Hotflip found.
        new_examples.append(new_example)
      else:
        # We use new_example as our base example and continue with more
        # token flips.
        example = new_example
        tokens = modified_tokens

  return new_examples
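# Usage sketch for the method above (illustrative, not part of the module):
# assumes the enclosing class is named `HotFlip`, as the log messages suggest,
# and that `my_example`, `my_model`, and `my_dataset` are hypothetical stand-ins
# for an input dict and a lit_model.Model with a MulticlassPreds head plus
# TokenGradients/TokenEmbeddings output fields. The dataset argument is unused
# by this generator.
#
#   generator = HotFlip()
#   counterfactuals = generator.generate(
#       my_example, my_model, dataset=my_dataset, num_examples=3)
#   # Up to `num_examples` flipped copies per gradient field; tokens with the
#   # highest gradient attribution are substituted first.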
def generate(self,
             example: JsonDict,
             model: lit_model.Model,
             dataset: lit_dataset.Dataset,
             config: Optional[JsonDict] = None,
             num_examples: int = 1) -> List[JsonDict]:
  """Use gradient to find/substitute the token with largest impact on loss."""
  del dataset  # Unused.
  assert model is not None, "Please provide a model for this generator."

  logging.info(r"W3lc0m3 t0 H0tFl1p \o/")
  logging.info("Original example: %r", example)

  # Find gradient fields to use for HotFlip.
  input_spec = model.input_spec()
  output_spec = model.output_spec()
  grad_fields = self.find_fields(output_spec)
  logging.info("Found gradient fields for HotFlip use: %s", str(grad_fields))
  if len(grad_fields) == 0:  # pylint: disable=g-explicit-length-test
    logging.info("No gradient fields found. Cannot use HotFlip. :-(")
    return []  # Cannot generate examples without gradients.

  # Get model outputs.
  logging.info("Performing a forward/backward pass on the input example.")
  model_output = model.predict_single(example)
  logging.info(model_output.keys())

  # Get model word embeddings and vocab.
  inv_vocab, embed = model.get_embedding_table()
  assert len(inv_vocab) == embed.shape[0], "Vocab/embeddings size mismatch."
  logging.info("Vocab size: %d, Embedding size: %r", len(inv_vocab),
               embed.shape)

  # Perform a flip in each sequence for which we have gradients (separately).
  # Each sequence may give rise to multiple new examples, depending on how
  # many words we flip.
  # TODO(lit-team): make configurable how many new examples are desired.
  # TODO(lit-team): use only 1 sequence as input (configurable in UI).
  new_examples = []
  for grad_field in grad_fields:
    # Get the tokens and their gradient vectors.
    token_field = output_spec[grad_field].align  # pytype: disable=attribute-error
    tokens = model_output[token_field]
    grads = model_output[grad_field]

    # Identify the token with the largest gradient norm.
    # TODO(lit-team): consider normalizing across all grad fields or just
    # across each one individually.
    grad_norm = np.linalg.norm(grads, axis=1)
    grad_norm = grad_norm / np.sum(grad_norm)  # Match grad attribution value.

    # Get a list of indices of input tokens, sorted by norm, highest first.
    sorted_by_grad_norm = np.argsort(grad_norm)[::-1]

    for i in range(min(num_examples, len(tokens))):
      token_id = sorted_by_grad_norm[i]
      logging.info("Selected token: %s (pos=%d) with gradient norm %f",
                   tokens[token_id], token_id, grad_norm[token_id])
      token_grad = grads[token_id]

      # Take dot product with all word embeddings. Get largest value.
      scores = np.dot(embed, token_grad)
      # TODO(lit-team): Can add criteria to the winner, e.g. cosine distance.
      winner = np.argmax(scores)
      logging.info("Replacing [%s] (pos=%d) with option %d: [%s] (id=%d)",
                   tokens[token_id], token_id, i, inv_vocab[winner], winner)

      # Create a new input to the model.
      # TODO(iftenney, bastings): enforce somewhere that this field has the
      # same name in the input and output specs.
      input_token_field = token_field
      input_text_field = input_spec[input_token_field].parent  # pytype: disable=attribute-error
      new_example = copy.deepcopy(example)
      modified_tokens = copy.copy(tokens)
      modified_tokens[token_id] = inv_vocab[winner]
      new_example[input_token_field] = modified_tokens
      # TODO(iftenney, bastings): call a model-provided detokenizer here?
      # Though in general tokenization isn't invertible and it's possible for
      # HotFlip to produce wordpiece sequences that don't correspond to any
      # input string.
      new_example[input_text_field] = " ".join(modified_tokens)

      # Predict a new label for this example.
      new_output = model.predict_single(new_example)

      # Update label if multi-class prediction.
      # TODO(lit-dev): provide a general system for handling labels on
      # generated examples.
      for pred_key, pred_type in model.output_spec().items():
        if isinstance(pred_type, types.MulticlassPreds):
          probabilities = new_output[pred_key]
          prediction = np.argmax(probabilities)
          label_key = output_spec[pred_key].parent
          label_names = output_spec[pred_key].vocab
          new_label = label_names[prediction]
          new_example[label_key] = new_label
          logging.info("Updated example with new label: %s", new_label)

      new_examples.append(new_example)

  return new_examples
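# Worked sketch of the replacement rule above (standalone NumPy, illustrative
# values only): the chosen replacement is the vocab entry whose embedding has
# the largest dot product with the token gradient, a first-order proxy for the
# change in the prediction score.
#
#   import numpy as np
#   embed = np.array([[0.1, 0.2],
#                     [0.9, -0.4],
#                     [-0.3, 0.5]])       # toy 3-word vocab, embedding dim 2
#   token_grad = np.array([1.0, -1.0])    # d(score)/d(embedding) for a token
#   scores = embed @ token_grad           # [-0.1, 1.3, -0.8]
#   winner = int(np.argmax(scores))       # 1: largest linearized score change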
def generate(self,
             example: JsonDict,
             model: lit_model.Model,
             dataset: lit_dataset.Dataset,
             config: Optional[JsonDict] = None) -> List[JsonDict]:
  """Identify minimal sets of token flips that alter the prediction."""
  del dataset  # Unused.

  config = config or {}
  num_examples = int(config.get(NUM_EXAMPLES_KEY, NUM_EXAMPLES_DEFAULT))
  max_flips = int(config.get(MAX_FLIPS_KEY, MAX_FLIPS_DEFAULT))
  tokens_to_ignore = config.get(TOKENS_TO_IGNORE_KEY,
                                TOKENS_TO_IGNORE_DEFAULT)
  pred_key = config.get(PREDICTION_KEY, "")
  regression_thresh = float(
      config.get(REGRESSION_THRESH_KEY, REGRESSION_THRESH_DEFAULT))

  assert model is not None, "Please provide a model for this generator."
  input_spec = model.input_spec()
  output_spec = model.output_spec()
  assert pred_key, "Please provide the prediction key"
  assert pred_key in output_spec, "Invalid prediction key"

  is_regression = False
  if isinstance(output_spec[pred_key], types.RegressionScore):
    is_regression = True
  else:
    assert isinstance(output_spec[pred_key], types.MulticlassPreds), (
        "Only classification or regression models are supported")
  logging.info(r"W3lc0m3 t0 H0tFl1p \o/")
  logging.info("Original example: %r", example)

  # Get model outputs.
  orig_output = list(model.predict([example]))[0]

  # Check config for selected fields.
  selected_fields = list(config.get(FIELDS_TO_HOTFLIP_KEY, []))
  if not selected_fields:
    return []

  # Get tokens (corresponding to each text input field) and corresponding
  # gradients.
  tokens_and_gradients = self._get_tokens_and_gradients(
      input_spec, output_spec, orig_output, selected_fields)
  assert tokens_and_gradients, (
      "No token fields found. Cannot use HotFlip. :-(")

  # Copy tokens into input example.
  example = copy.deepcopy(example)
  for token_field, v in tokens_and_gradients.items():
    tokens, _ = v
    example[token_field] = tokens

  inv_vocab, embedding_matrix = model.get_embedding_table()
  assert len(inv_vocab) == embedding_matrix.shape[0], (
      "Vocab/embeddings size mismatch.")

  successful_cfs = []
  # TODO(lit-team): use only 1 sequence as input (configurable in UI).
  # TODO(lit-team): Refactor the following code so that it's not so deeply
  # nested (and easier to track loop state).
  for token_field, v in tokens_and_gradients.items():
    tokens, grads = v
    text_field = input_spec[token_field].parent  # pytype: disable=attribute-error
    logging.info("Identifying Hotflips for input field: %s", str(text_field))
    direction = -1
    if is_regression:
      # We want the replacements to increase the prediction score if the
      # original score is below the threshold, and decrease it otherwise.
      direction = (1 if orig_output[pred_key] <= regression_thresh else -1)
    replacement_tokens = self._get_replacement_tokens(
        embedding_matrix, inv_vocab, grads, direction)

    successful_positions = []
    for token_idxs in self._gen_token_idxs_to_flip(
        tokens, grads, max_flips, tokens_to_ignore):
      if len(successful_cfs) >= num_examples:
        return successful_cfs
      # If a subset of this set of tokens has already been successful in
      # obtaining a flip, we continue. This ensures that we only consider
      # sets of token flips that are minimal.
      if self._subset_exists(set(token_idxs), successful_positions):
        continue

      # Create counterfactual.
      cf = self._create_cf(example, token_field, text_field, tokens,
                           token_idxs, replacement_tokens)
      # Obtain model prediction.
      cf_output = list(model.predict([cf]))[0]

      if cf_utils.is_prediction_flip(cf_output, orig_output, output_spec,
                                     pred_key, regression_thresh):
        # Prediction flip found!
        cf_utils.update_prediction(cf, cf_output, output_spec, pred_key)
        successful_cfs.append(cf)
        successful_positions.append(set(token_idxs))
  return successful_cfs
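# Usage sketch for this config-driven variant (illustrative; `my_example`,
# `my_model`, `my_dataset`, and the field name strings are hypothetical, while
# the config keys are the module-level constants referenced above):
#
#   generator = HotFlip()
#   cfs = generator.generate(
#       my_example, my_model, my_dataset,
#       config={
#           PREDICTION_KEY: "probas",           # a MulticlassPreds field
#           FIELDS_TO_HOTFLIP_KEY: ["tokens"],  # token fields to perturb
#           NUM_EXAMPLES_KEY: 5,
#           MAX_FLIPS_KEY: 3,
#       })
#   # Returns at most NUM_EXAMPLES minimal counterfactuals whose predictions
#   # flip relative to the original example.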