def interpret_sentence(model, text, text_lengths, args, label=0):
    # Interpretable method
    if 'BERT' in args.model:
        PAD_IND = args.bert_tokenizer.pad_token_id
        lig = LayerIntegratedGradients(model, model.model.embeddings)
    else:
        PAD_IND = args.TEXT.vocab.stoi['<pad>']
        lig = LayerIntegratedGradients(model, model.embedding)
    token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)

    model.zero_grad()

    # predict
    start = time.time()
    pred = model(text, text_lengths).squeeze(0)
    print("time:", time.time() - start)
    pred_ind = torch.argmax(pred).item()

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(
        text.shape[1], device=args.device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig_1 = lig.attribute((text, text_lengths),
                                      (reference_indices, text_lengths),
                                      target=0,
                                      n_steps=100,
                                      return_convergence_delta=False)
    attributions_ig_2 = lig.attribute((text, text_lengths),
                                      (reference_indices, text_lengths),
                                      target=1,
                                      n_steps=100,
                                      return_convergence_delta=False)

    if 'BERT' in args.model:
        sentence = [
            args.bert_tokenizer.ids_to_tokens[int(word)]
            for word in text.squeeze(0).cpu().numpy()
            if int(word) != args.bert_tokenizer.pad_token_id
        ]
    else:
        sentence = [
            args.TEXT.vocab.itos[int(word)]
            for word in text.squeeze(0).cpu().numpy()
        ]
    # print(sentence)

    add_attributions_to_visualizer(attributions_ig_1, sentence, pred,
                                   pred_ind, label, args)
    add_attributions_to_visualizer(attributions_ig_2, sentence, pred,
                                   pred_ind, label, args)
def get_attributions(model, text):
    """
    Returns:
        - tokenized_text: a list of tokens
        - attributions: a tensor of attributions of the same length as
          `tokenized_text`, with attributions[i] belonging to tokenized_text[i]
        - delta: the convergence delta from integrated gradients
    """
    # tokenize text
    tokenized = tokenizer.encode_plus(text,
                                      pad_to_max_length=True,
                                      max_length=512)
    input_ids = torch.tensor(tokenized['input_ids']).to(device)
    input_ids = input_ids.view((1, -1))
    tokenized = [x for x in tokenized['input_ids'] if x != 0]
    tokenized_text = tokenizer.convert_ids_to_tokens(tokenized)

    lig = LayerIntegratedGradients(model, model.bert.embeddings)
    attributions, delta = lig.attribute(input_ids,
                                        internal_batch_size=10,
                                        return_convergence_delta=True)
    attributions = attributions.sum(dim=-1)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions[0][:len(tokenized)]
    return tokenized_text, attributions, delta
class CNNPredictionModel:
    def __init__(self, model_path="static/models/CNN-29", seq_length=256):
        self.seq_length = seq_length
        self.initialize(model_path)

    def initialize(self, model_path):
        print("initializing tokenizer...")
        self.tokenizer = DataIterator().tokenizer
        self.PAD_IND = self.tokenizer.vocab.stoi['<pad>']
        self.token_reference = TokenReferenceBase(
            reference_token_idx=self.PAD_IND)
        print("initializing inference model...")
        self.model = torch.load(model_path, map_location="cpu").eval()
        print("initializing attribution method...")
        self.lig = LayerIntegratedGradients(self.model, self.model.embedding)

    def predict(self, text):
        words = self.tokenizer.preprocess(clean_text(text))
        if len(words) < self.seq_length:
            words += ['<pad>'] * (self.seq_length - len(words))
        elif len(words) > self.seq_length:
            words = words[:self.seq_length]
        tokens = [self.tokenizer.vocab.stoi[word] for word in words]
        tokens = torch.LongTensor(tokens).unsqueeze(0)
        reference_tokens = self.token_reference.generate_reference(
            self.seq_length, device='cpu').unsqueeze(0)

        pred = self.model(tokens)
        plabel = int(torch.argmax(pred, 1))
        pred = pred.tolist()[0]

        unpad_index = [
            idx for idx, word in enumerate(words) if word != '<pad>'
        ]
        unpad_words = [word for word in words if word != '<pad>']
        attributions = []
        for label in range(len(pred)):
            attributions.append(
                list(
                    self.attribute(tokens, reference_tokens, label,
                                   unpad_index)))
        return unpad_words, pred, plabel, attributions

    def attribute(self, tokens, reference_tokens, target, unpad_index):
        attributions, delta = self.lig.attribute(
            tokens,
            reference_tokens,
            target=target,
            return_convergence_delta=True)
        attributions = attributions.sum(dim=2).squeeze(0)
        attributions = attributions / torch.norm(attributions)
        attributions = attributions.cpu().detach().numpy()
        unpad_attributions = attributions[unpad_index]
        range_limit = np.max(np.abs(unpad_attributions))
        unpad_attributions /= range_limit
        return unpad_attributions

    def __repr__(self):
        return "prediction model for CNN"

    def __str__(self):
        return "prediction model for CNN"
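# Hedged usage sketch for CNNPredictionModel above: assumes the serialized
# checkpoint at "static/models/CNN-29" and the DataIterator vocabulary are
# available locally.
predictor = CNNPredictionModel()
words, probs, plabel, per_class_attrs = predictor.predict(
    "the movie was surprisingly good")
# per_class_attrs[c][i] is the normalized attribution of words[i] to class c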
class LIGAttributions(Attributions):
    def __init__(
        self,
        custom_forward: Callable,
        embeddings: nn.Module,
        text: str,
        input_ids: torch.Tensor,
        ref_input_ids: torch.Tensor,
        sep_id: int,
    ):
        super().__init__(custom_forward, embeddings, text)
        self.input_ids = input_ids
        self.ref_input_ids = ref_input_ids
        self.lig = LayerIntegratedGradients(self.custom_forward,
                                            self.embeddings)

        self._attributions, self.delta = self.lig.attribute(
            inputs=self.input_ids,
            baselines=self.ref_input_ids,
            return_convergence_delta=True,
        )

    @property
    def word_attributions(self):
        wa = []
        if len(self.attributions_sum) >= 1:
            for i, (word, attribution) in enumerate(
                zip(self.text.split(), self.attributions_sum)
            ):
                wa.append((word, float(attribution.data.numpy())))
            return wa
        else:
            raise AttributionsNotCalculatedError(
                "Attributions are not yet calculated")

    def summarize(self):
        self.attributions_sum = self._attributions.sum(dim=-1).squeeze(0)
        self.attributions_sum = self.attributions_sum / torch.norm(
            self.attributions_sum
        )

    def visualize_attributions(
        self, pred_prob, pred_class, true_class, attr_class, text, all_tokens
    ):
        return viz.VisualizationDataRecord(
            self.attributions_sum,
            pred_prob,
            pred_class,
            true_class,
            attr_class,
            self.attributions_sum.sum(),
            all_tokens,
            self.delta,
        )
def attribution(self):
    # self.logits = self.model(self.input_ids,
    #                          token_type_ids=self.token_type_ids,
    #                          attention_mask=self.attention_mask)
    # self.prediction = torch.argmax(self.logits[0])
    # self.sentclass_pos_forward_func = self.logits[0].max(1).values
    lig = LayerIntegratedGradients(self.sentclass_pos_forward_func,
                                   self.model.bert.embeddings)
    attributions_start, self.delta_start = lig.attribute(
        inputs=self.input_ids,
        baselines=self.ref_input_ids,
        additional_forward_args=(self.token_type_ids, self.attention_mask),
        return_convergence_delta=True)
    attributions_start = attributions_start.sum(dim=-1).squeeze(0)
    self.attributions_start_summary = attributions_start / torch.norm(
        attributions_start)
    # self.attributions_start_summary = self.attributions_start_summary.detach().tolist()
    return self.attributions_start_summary
def main(cfg):
    # Initialize the dataset
    blastchar_dataset = BlastcharDataset(cfg.dataset.path)
    NUM_CATEGORICAL_COLS = blastchar_dataset.num_categorical_cols
    NUM_CONTINIOUS_COLS = blastchar_dataset.num_continious_cols
    EMBED_DIM = 32

    # initialize the model with its arguments
    mlp = nn.Sequential(
        nn.Linear(NUM_CATEGORICAL_COLS * EMBED_DIM + NUM_CONTINIOUS_COLS, 50),
        nn.ReLU(), nn.BatchNorm1d(50), nn.Dropout(cfg.params.dropout),
        nn.Linear(50, 20), nn.ReLU(), nn.BatchNorm1d(20),
        nn.Dropout(cfg.params.dropout),
        nn.Linear(20, blastchar_dataset.num_classes))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = TabTransformer(blastchar_dataset.num_categories,
                           mlp,
                           embed_dim=EMBED_DIM,
                           num_cont_cols=NUM_CONTINIOUS_COLS)
    model.load_state_dict(torch.load(cfg.params.weights), strict=False)
    model = model.to(device)
    model.eval()

    model = ModelInputWrapper(model)

    cat, cont, _ = blastchar_dataset[0]
    cat, cont = cat.unsqueeze(0).long(), cont.unsqueeze(0).float()
    cat = torch.cat((cat, cat), dim=0)
    cont = torch.cat((cont, cont), dim=0)
    inputs = (cat, cont)
    outs = model(*inputs)
    preds = outs.argmax(-1)

    attr = LayerIntegratedGradients(
        model, [model.module.embed, model.module.layer_norm])
    attributions, _ = attr.attribute(
        inputs=(cat, cont),
        baselines=(torch.zeros_like(cat, dtype=torch.long),
                   torch.zeros_like(cont, dtype=torch.float32)),
        target=preds.detach(),
        n_steps=30,
        return_convergence_delta=True)
    print(f'attributions: {attributions[0].shape, attributions[1].shape}')
    pprint(torch.cat((attributions[0].sum(dim=2), attributions[1]), dim=1))
def attribute_integrated_gradients(
        text_input_ids: torch.Tensor, ref_input_ids: torch.Tensor,
        target: int, model: BertForSequenceClassification,
        **kwargs) -> Tuple[np.ndarray, Dict[str, float]]:
    def forward(model_input):
        pred = model(model_input)
        return torch.softmax(pred[0], dim=1)

    lig = LayerIntegratedGradients(forward, model.bert.embeddings)
    attributions, delta = lig.attribute(inputs=text_input_ids,
                                        target=target,
                                        baselines=ref_input_ids,
                                        return_convergence_delta=True)
    scores = attributions.sum(dim=-1).squeeze(0)
    scores = scores.cpu().detach().numpy()
    return scores, {"delta": delta.item()}
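# Hedged usage sketch for attribute_integrated_gradients: the checkpoint name
# and the pad-token baseline below are assumptions, not fixed by the function.
import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased").eval()

text_input_ids = tokenizer.encode("a charming little film", return_tensors="pt")
ref_input_ids = torch.full_like(text_input_ids, tokenizer.pad_token_id)
ref_input_ids[0, 0] = text_input_ids[0, 0]    # keep [CLS] in the baseline
ref_input_ids[0, -1] = text_input_ids[0, -1]  # keep [SEP] in the baseline

scores, meta = attribute_integrated_gradients(text_input_ids, ref_input_ids,
                                              target=1, model=model)
print(scores.shape, meta["delta"])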
def lig_explain(inputs: Any, target: int, forward: Callable,
                embedding_layer: nn.Module) -> torch.Tensor:
    """Interpretability algorithm (Integrated Gradients) that assigns an
    importance score to each input token.

    Args:
        inputs: Input for the token embedding layer.
        target (int): Index of the label to interpret.
        forward (Callable): The forward function of the model or any
            modification of it.
        embedding_layer: Token embedding layer for which attributions are
            computed.

    Returns:
        Tensor of importance scores, one per input token.
    """
    lig = LayerIntegratedGradients(forward, embedding_layer)
    attributions = lig.attribute(inputs, target=target)
    attributions = reduce_embedding_attributions(attributions)
    return attributions
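# Hedged usage sketch for lig_explain with a toy classifier. ToyClassifier is
# hypothetical, and reduce_embedding_attributions (not shown above) is assumed
# to collapse the embedding dimension, e.g. a sum over dim=-1.
import torch
import torch.nn as nn


class ToyClassifier(nn.Module):
    def __init__(self, vocab_size=100, embed_dim=16, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, token_ids):
        # mean-pool token embeddings, then classify
        return self.fc(self.embedding(token_ids).mean(dim=1))


model = ToyClassifier().eval()
token_ids = torch.randint(0, 100, (1, 12))  # one sequence of 12 token ids
attrs = lig_explain(token_ids, target=1, forward=model,
                    embedding_layer=model.embedding)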
def get_scores_and_attributions(self, inputs, tok_e1_idx, tok_e2_idx,
                                str_label):
    input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
    input_ids_tensor, ref_input_ids_tensor = \
        self._construct_input_ref_pair(input_ids)
    # token_type_ids_tensor, ref_token_type_ids_tensor = \
    #     self._construct_input_ref_token_type_pair(token_type_ids)
    attention_mask_tensor = torch.tensor([attention_mask], device=self.device)
    e1_pos_tensor = torch.tensor([tok_e1_idx], device=self.device)
    e2_pos_tensor = torch.tensor([tok_e2_idx], device=self.device)
    labels_tensor = torch.tensor([CLASSES.index(str_label)],
                                 device=self.device)
    indices = input_ids_tensor[0].detach().tolist()
    all_tokens = self.tokenizer.convert_ids_to_tokens(indices)

    _, pred_scores, _ = self.predict(input_ids_tensor,
                                     # token_type_ids=token_type_ids_tensor,
                                     attention_mask=attention_mask_tensor,
                                     labels=labels_tensor,
                                     e1_pos=e1_pos_tensor,
                                     e2_pos=e2_pos_tensor)

    lig = LayerIntegratedGradients(self.trc_forward_func,
                                   self.model.roberta.embeddings)
    attributions, delta = lig.attribute(
        inputs=input_ids_tensor,
        baselines=ref_input_ids_tensor,
        additional_forward_args=(None,  # token_type_ids_tensor
                                 attention_mask_tensor, labels_tensor,
                                 e1_pos_tensor, e2_pos_tensor),
        return_convergence_delta=True)
    attributions_sum = summarize_attributions(attributions)
    return pred_scores, all_tokens, attributions_sum, delta
def run_models(model, model_name, num_trials, subset, tokenized_list, device):
    if model_name == "bert":
        layer_interm = LayerIntermediateGradients(bert_sequence_forward_func,
                                                  model.bert.embeddings)
        lig = LayerIntegratedGradients(bert_sequence_forward_func,
                                       model.bert.embeddings)
    elif model_name == "xlnet":
        layer_interm = LayerIntermediateGradients(
            xlnet_sequence_forward_func, model.transformer.batch_first)
        lig = LayerIntegratedGradients(xlnet_sequence_forward_func,
                                       model.transformer.batch_first)

    run_through_example = tokenized_list[-1]
    tokenized_list = tokenized_list[:subset]
    input_ids = run_through_example["input_ids"].to(device)
    token_type_ids = run_through_example["token_type_ids"].to(device)
    attention_mask = run_through_example["attention_mask"].to(device)
    baseline_ids = run_through_example["baseline_ids"].to(device)

    # warm-up pass on the run-through example before timing
    grads, step_sizes, intermediates = layer_interm.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        target=1,
        n_steps=50)  # maybe pass n_steps as a CLI argument
    integrated_grads = lig.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        target=1,
        n_steps=50)

    for repetition in tqdm(range(num_trials)):
        start_time = time.perf_counter()
        for feature in tokenized_list:
            input_ids = feature["input_ids"].to(device)
            token_type_ids = feature["token_type_ids"].to(device)
            attention_mask = feature["attention_mask"].to(device)
            baseline_ids = feature["baseline_ids"].to(device)
            grads, step_sizes, intermediates = layer_interm.attribute(
                inputs=input_ids,
                baselines=baseline_ids,
                additional_forward_args=(model, token_type_ids,
                                         attention_mask),
                target=1,
                n_steps=50)  # maybe pass n_steps as a CLI argument
            integrated_grads = lig.attribute(
                inputs=input_ids,
                baselines=baseline_ids,
                additional_forward_args=(model, token_type_ids,
                                         attention_mask),
                target=1,
                n_steps=50)
        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        print("Repetition %s Elapsed Time for %s examples: " %
              (repetition, subset), elapsed_time)
class LIGAttributions(Attributions):
    def __init__(
        self,
        custom_forward: Callable,
        embeddings: nn.Module,
        tokens: list,
        input_ids: torch.Tensor,
        ref_input_ids: torch.Tensor,
        sep_id: int,
        attention_mask: torch.Tensor,
        token_type_ids: torch.Tensor = None,
        position_ids: torch.Tensor = None,
        ref_token_type_ids: torch.Tensor = None,
        ref_position_ids: torch.Tensor = None,
    ):
        super().__init__(custom_forward, embeddings, tokens)
        self.input_ids = input_ids
        self.ref_input_ids = ref_input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.position_ids = position_ids
        self.ref_token_type_ids = ref_token_type_ids
        self.ref_position_ids = ref_position_ids

        self.lig = LayerIntegratedGradients(self.custom_forward,
                                            self.embeddings)

        if self.token_type_ids is not None and self.position_ids is not None:
            self._attributions, self.delta = self.lig.attribute(
                inputs=(self.input_ids, self.token_type_ids,
                        self.position_ids),
                baselines=(
                    self.ref_input_ids,
                    self.ref_token_type_ids,
                    self.ref_position_ids,
                ),
                return_convergence_delta=True,
                additional_forward_args=(self.attention_mask,),
            )
        elif self.position_ids is not None:
            self._attributions, self.delta = self.lig.attribute(
                inputs=(self.input_ids, self.position_ids),
                baselines=(
                    self.ref_input_ids,
                    self.ref_position_ids,
                ),
                return_convergence_delta=True,
                additional_forward_args=(self.attention_mask,),
            )
        elif self.token_type_ids is not None:
            self._attributions, self.delta = self.lig.attribute(
                inputs=(self.input_ids, self.token_type_ids),
                baselines=(
                    self.ref_input_ids,
                    self.ref_token_type_ids,
                ),
                return_convergence_delta=True,
                additional_forward_args=(self.attention_mask,),
            )
        else:
            self._attributions, self.delta = self.lig.attribute(
                inputs=self.input_ids,
                baselines=self.ref_input_ids,
                return_convergence_delta=True,
            )

    @property
    def word_attributions(self) -> list:
        wa = []
        if len(self.attributions_sum) >= 1:
            for i, (word, attribution) in enumerate(
                zip(self.tokens, self.attributions_sum)
            ):
                wa.append((word, float(attribution.cpu().data.numpy())))
            return wa
        else:
            raise AttributionsNotCalculatedError(
                "Attributions are not yet calculated")

    def summarize(self):
        self.attributions_sum = self._attributions.sum(dim=-1).squeeze(0)
        self.attributions_sum = self.attributions_sum / torch.norm(
            self.attributions_sum
        )

    def visualize_attributions(
        self, pred_prob, pred_class, true_class, attr_class, all_tokens
    ):
        return viz.VisualizationDataRecord(
            self.attributions_sum,
            pred_prob,
            pred_class,
            true_class,
            attr_class,
            self.attributions_sum.sum(),
            all_tokens,
            self.delta,
        )
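# Hedged usage note for LIGAttributions: summarize() populates
# attributions_sum and must run before word_attributions or
# visualize_attributions are touched. custom_forward, the embedding layer,
# and the id tensors below stand in for whatever the surrounding library builds.
lig_attrs = LIGAttributions(custom_forward, model.bert.embeddings, tokens,
                            input_ids, ref_input_ids, sep_id, attention_mask)
lig_attrs.summarize()
for word, score in lig_attrs.word_attributions:
    print(word, score)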
def captum_interactive(request):
    if request.method == 'POST':
        STORED_POSTS = request.session.get("TextAttackResult")
        form = CustomData(request.POST)
        if form.is_valid():
            input_text, model_name, recipe_name = (
                form.cleaned_data['input_text'],
                form.cleaned_data['model_name'],
                form.cleaned_data['recipe_name'])
            found = False
            if STORED_POSTS:
                JSON_STORED_POSTS = json.loads(STORED_POSTS)
                for idx, el in enumerate(JSON_STORED_POSTS):
                    if (el["type"] == "captum"
                            and el["input_string"] == input_text):
                        tmp = JSON_STORED_POSTS.pop(idx)
                        JSON_STORED_POSTS.insert(0, tmp)
                        found = True
                        break
                if found:
                    request.session["TextAttackResult"] = json.dumps(
                        JSON_STORED_POSTS[:10])
                    return HttpResponseRedirect(reverse('webdemo:index'))

            original_model = transformers.AutoModelForSequenceClassification.from_pretrained(
                "textattack/" + model_name)
            original_tokenizer = textattack.models.tokenizers.AutoTokenizer(
                "textattack/" + model_name)
            model = textattack.models.wrappers.HuggingFaceModelWrapper(
                original_model, original_tokenizer)
            device = torch.device(
                "cuda:2" if torch.cuda.is_available() else "cpu")
            clone = deepcopy(model)
            clone.model.to(device)

            def calculate(input_ids, token_type_ids, attention_mask):
                return clone.model(input_ids, token_type_ids,
                                   attention_mask)[0]

            attack = textattack.commands.attack.attack_args_helpers.parse_attack_from_args(
                Args(model_name, recipe_name))
            attacked_text = textattack.shared.attacked_text.AttackedText(
                input_text)
            attack.goal_function.init_attack_example(attacked_text, 1)
            goal_func_result, _ = attack.goal_function.get_result(
                attacked_text)

            result = next(
                attack.attack_dataset([(input_text,
                                        goal_func_result.output)]))
            result_parsed = result.str_lines()
            if len(result_parsed) < 3:
                return HttpResponseNotFound('Failed')
            output_text = result_parsed[2]

            attacked_text_out = textattack.shared.attacked_text.AttackedText(
                output_text)

            orig = result.original_text()
            pert = result.perturbed_text()

            encoded = model.tokenizer.batch_encode([orig])
            batch_encoded = captum_form(encoded, device)
            x = calculate(**batch_encoded)

            pert_encoded = model.tokenizer.batch_encode([pert])
            pert_batch_encoded = captum_form(pert_encoded, device)
            x_pert = calculate(**pert_batch_encoded)

            lig = LayerIntegratedGradients(calculate,
                                           clone.model.bert.embeddings)
            attributions, delta = lig.attribute(
                inputs=batch_encoded['input_ids'],
                additional_forward_args=(batch_encoded['token_type_ids'],
                                         batch_encoded['attention_mask']),
                n_steps=10,
                target=torch.argmax(calculate(**batch_encoded)).item(),
                return_convergence_delta=True)
            attributions_pert, delta_pert = lig.attribute(
                inputs=pert_batch_encoded['input_ids'],
                additional_forward_args=(
                    pert_batch_encoded['token_type_ids'],
                    pert_batch_encoded['attention_mask']),
                n_steps=10,
                target=torch.argmax(calculate(**pert_batch_encoded)).item(),
                return_convergence_delta=True)

            orig = original_tokenizer.tokenizer.tokenize(orig)
            pert = original_tokenizer.tokenizer.tokenize(pert)

            atts = attributions.sum(dim=-1).squeeze(0)
            atts = atts / torch.norm(atts)
            atts_pert = attributions_pert.sum(dim=-1).squeeze(0)
            # normalize by the perturbed attributions' own norm
            # (the original divided by torch.norm(atts) here)
            atts_pert = atts_pert / torch.norm(atts_pert)

            all_tokens = original_tokenizer.tokenizer.convert_ids_to_tokens(
                batch_encoded['input_ids'][0])
            all_tokens_pert = original_tokenizer.tokenizer.convert_ids_to_tokens(
                pert_batch_encoded['input_ids'][0])

            v = viz.VisualizationDataRecord(atts[:45].detach().cpu(),
                                            torch.max(x).item(),
                                            torch.argmax(x, dim=1).item(),
                                            goal_func_result.output, 2,
                                            atts.sum().detach(),
                                            all_tokens[:45], delta)
            v_pert = viz.VisualizationDataRecord(
                atts_pert[:45].detach().cpu(),
                torch.max(x_pert).item(),
                torch.argmax(x_pert, dim=1).item(),
                goal_func_result.output, 2,
                atts_pert.sum().detach(),
                all_tokens_pert[:45], delta_pert)

            formattedHTML = formatDisplay([v, v_pert])
            post = {
                "type": "captum",
                "input_string": input_text,
                "model_name": model_name,
                "recipe_name": recipe_name,
                "output_string": output_text,
                "html_input_string": formattedHTML[0],
                "html_output_string": formattedHTML[1],
            }

            if STORED_POSTS:
                JSON_STORED_POSTS = json.loads(STORED_POSTS)
                JSON_STORED_POSTS.insert(0, post)
                request.session["TextAttackResult"] = json.dumps(
                    JSON_STORED_POSTS[:10])
            else:
                request.session["TextAttackResult"] = json.dumps([post])

            return HttpResponseRedirect(reverse('webdemo:index'))
        else:
            return HttpResponseNotFound('Failed')
        return HttpResponse('Success')

    return HttpResponseNotFound('<h1>Not Found</h1>')
def run_models(model_name, model, tokenizer, sequence, device, baseline):
    """Run Integrated and Intermediate gradients on the model layer.

    Parameters
    ----------
    model_name: str
        Name of the model that is being run. Currently supported are
        "bert" and "xlnet".
    model: torch.nn.Module
        Module to run.
    tokenizer: transformers.tokenizer
        Tokenizer to process the sequence and produce the input ids.
    sequence: str
        Sequence to get the gradients from.
    device: torch.device
        Device that the models are stored on.
    baseline: str
        Baseline to run integrated gradients with. Currently supported are
        'zero', 'pad', 'unk', 'rand-norm', 'rand-unif', and 'period'.

    Returns
    -------
    grads_dict: dict
        Dictionary containing the gradient tensors with the following keys:
        "integrated_grads", "intermediate_grads", "step_sizes", and
        "intermediates".
    """
    features = prepare_input(sequence, tokenizer)
    input_ids = features["input_ids"].to(device)
    token_type_ids = features["token_type_ids"].to(device)
    attention_mask = features["attention_mask"].to(device)

    # set up gradients and the baseline ids
    if model_name == "bert":
        layer_interm = LayerIntermediateGradients(bert_sequence_forward_func,
                                                  model.bert.embeddings)
        lig = LayerIntegratedGradients(bert_sequence_forward_func,
                                       model.bert.embeddings)
        baseline_ids = generate_bert_baselines(baseline, input_ids,
                                               tokenizer).to(device)
    elif model_name == "xlnet":
        layer_interm = LayerIntermediateGradients(
            xlnet_sequence_forward_func, model.transformer.batch_first)
        lig = LayerIntegratedGradients(xlnet_sequence_forward_func,
                                       model.transformer.batch_first)
        baseline_ids = generate_xlnet_baselines(baseline, input_ids,
                                                tokenizer).to(device)

    grads, step_sizes, intermediates = layer_interm.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        target=1,
        n_steps=50)  # maybe pass n_steps as a CLI argument
    integrated_grads = lig.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        target=1,
        n_steps=50)
    grads_dict = {"intermediate_grads": grads.to("cpu"),
                  "step_sizes": step_sizes.to("cpu"),
                  "intermediates": intermediates.to("cpu"),
                  "integrated_grads": integrated_grads.to("cpu")}
    return grads_dict
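# Hedged usage sketch for run_models: assumes a fine-tuned BERT
# sequence-classification model and its tokenizer are already loaded, and
# uses "pad", one of the baseline strings listed in the docstring above.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).eval()
grads_dict = run_models("bert", model, tokenizer,
                        "A short example sequence.", device, baseline="pad")
print(grads_dict["integrated_grads"].shape)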
def main(model_path, n_steps=50):  # pylint: disable=missing-docstring, too-many-locals
    n_steps = int(n_steps)
    # load the model and tokenizer
    model = load_deprecated_model(str(model_path))
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

    # tokenize the sentence for classification
    sequence = """Some might not find it to totally be like Pokémon without
    Ash. But this is definitely a Pokémon movie and way better than most of
    their animated movies. The CGI nailed the looks of all the Pokémon
    creatures and their voices. The movie is charming, funny and fun as well.
    They did a great job introducing this world to the big screen. I
    definitely want more."""
    input_ids, token_type_ids, attention_mask = prepare_input(
        sequence, tokenizer)

    # create a baseline of zeros in the same shape as the inputs
    baseline_ids = torch.zeros(input_ids.shape, dtype=torch.int64)

    # create an instance of layer intermediate gradients based upon the
    # embedding layer
    lig = LayerIntermediateGradients(sequence_forward_func,
                                     model.bert.embeddings)
    grads, step_sizes = lig.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        n_steps=n_steps)
    print("Shape of the returned gradients: ")
    print(grads.shape)
    print("Shape of the step sizes: ")
    print(step_sizes.shape)

    # now calculate attributions from the intermediate gradients:
    # multiply by the step sizes
    scaled_grads = grads.view(n_steps, -1) * step_sizes
    # reshape and sum along the num_steps dimension
    scaled_grads = torch.sum(
        scaled_grads.reshape((n_steps, 1) + grads.shape[1:]), dim=0)
    # pass the input and baseline ids forward for reference
    forward_input_ids = model.bert.embeddings.forward(input_ids)
    forward_baseline_ids = model.bert.embeddings.forward(baseline_ids)
    # multiply the scaled gradients by the difference of inputs and baselines
    # to obtain attributions
    attributions = scaled_grads * (forward_input_ids - forward_baseline_ids)
    print("Attributions calculated from intermediate gradients: ")
    print(attributions.shape)
    print(attributions)

    # compare to layer integrated gradients
    layer_integrated = LayerIntegratedGradients(sequence_forward_func,
                                                model.bert.embeddings)
    attrs = layer_integrated.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        n_steps=n_steps,
        return_convergence_delta=False)
    print("Attributions from layer integrated gradients: ")
    print(attrs.shape)
    print(attrs)
class TransformersSeqClassifierHandler(BaseHandler, ABC):
    """Transformers handler class for sequence classification, token
    classification, and question answering.
    """

    def __init__(self):
        super(TransformersSeqClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """In this initialize function, the BERT model is loaded and the
        Layer Integrated Gradients algorithm for Captum explanations is
        initialized.

        Args:
            ctx (context): It is a JSON object containing information
                pertaining to the model artefacts parameters.
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else "cpu")
        # read configs for the mode, model_name, etc. from setup_config.json
        setup_config_path = os.path.join(model_dir, "setup_config.json")
        if os.path.isfile(setup_config_path):
            with open(setup_config_path) as setup_config_file:
                self.setup_config = json.load(setup_config_file)
        else:
            logger.warning("Missing the setup_config.json file.")

        # Loading the model and tokenizer from checkpoint and config files
        # based on the user's choice of mode; further setup config can be added.
        if self.setup_config["save_mode"] == "torchscript":
            self.model = torch.jit.load(model_pt_path)
        elif self.setup_config["save_mode"] == "pretrained":
            if self.setup_config["mode"] == "sequence_classification":
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    model_dir)
            elif self.setup_config["mode"] == "question_answering":
                self.model = AutoModelForQuestionAnswering.from_pretrained(
                    model_dir)
            elif self.setup_config["mode"] == "token_classification":
                self.model = AutoModelForTokenClassification.from_pretrained(
                    model_dir)
            else:
                logger.warning("Missing the operation mode.")
        else:
            logger.warning("Missing the checkpoint or state_dict.")

        if any(fname for fname in os.listdir(model_dir)
               if fname.startswith("vocab.") and os.path.isfile(fname)):
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_dir, do_lower_case=self.setup_config["do_lower_case"])
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.setup_config["model_name"],
                do_lower_case=self.setup_config["do_lower_case"],
            )

        self.model.to(self.device)
        self.model.eval()

        logger.info("Transformer model from path %s loaded successfully",
                    model_dir)

        # Read the mapping file, index to object name
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")
        # Question answering does not need the index_to_name.json file.
        if not self.setup_config["mode"] == "question_answering":
            if os.path.isfile(mapping_file_path):
                with open(mapping_file_path) as f:
                    self.mapping = json.load(f)
            else:
                logger.warning("Missing the index_to_name.json file.")

        # ------------------------- Captum initialization -------------------------#
        self.lig = LayerIntegratedGradients(captum_sequence_forward,
                                            self.model.bert.embeddings)
        self.initialized = True

    def preprocess(self, requests):
        """Basic text preprocessing, based on the user's choice of
        application mode.

        Args:
            requests (str): The input data, in the form of text, is passed
                on to the preprocess function.

        Returns:
            list: The preprocess function returns a list of tensors sized
                to the word tokens.
        """
        input_batch = None
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode('utf-8')
            max_length = self.setup_config["max_length"]
            logger.info("Received text: '%s'", input_text)
            # preprocessing text for sequence_classification and
            # token_classification.
            if (self.setup_config["mode"] == "sequence_classification"
                    or self.setup_config["mode"] == "token_classification"):
                inputs = self.tokenizer.encode_plus(
                    input_text,
                    max_length=int(max_length),
                    pad_to_max_length=True,
                    add_special_tokens=True,
                    return_tensors='pt')
            # preprocessing text for question_answering.
            elif self.setup_config["mode"] == "question_answering":
                # TODO: read the context from a pickled file or other formats
                # that fit the requirements of the task at hand. If this is
                # done, the following preprocessing needs to be modified
                # accordingly.

                # the sample text for question_answering in the current
                # version should be formatted as a dictionary with "question"
                # and "context" as keys and the related text as values.
                # we use this format here to separate question and text for
                # encoding.
                question_context = ast.literal_eval(input_text)
                question = question_context["question"]
                context = question_context["context"]
                inputs = self.tokenizer.encode_plus(
                    question,
                    context,
                    max_length=int(max_length),
                    pad_to_max_length=True,
                    add_special_tokens=True,
                    return_tensors="pt")

            input_ids = inputs["input_ids"].to(self.device)
            if input_ids.shape is not None:
                if input_batch is None:
                    input_batch = input_ids
                else:
                    input_batch = torch.cat((input_batch, input_ids), 0)
        return input_batch

    def inference(self, input_batch):
        """Predict the class (or classes) of the received text using the
        serialized transformers checkpoint.

        Args:
            input_batch (list): List of text tensors from the preprocess
                function is passed here.

        Returns:
            list: It returns a list of the predicted values for the input text.
        """
        inferences = []
        # Handling inference for sequence_classification.
        if self.setup_config["mode"] == "sequence_classification":
            predictions = self.model(input_batch)
            print("This is the output size from the Seq classification model",
                  predictions[0].size())
            print("This is the output from the Seq classification model",
                  predictions)
            num_rows, num_cols = predictions[0].shape
            for i in range(num_rows):
                out = predictions[0][i].unsqueeze(0)
                y_hat = out.argmax(1).item()
                predicted_idx = str(y_hat)
                inferences.append(self.mapping[predicted_idx])
        # Handling inference for question_answering.
        elif self.setup_config["mode"] == "question_answering":
            # the output should be only answer_start and answer_end;
            # we are outputting the words just for demonstration.
            answer_start_scores, answer_end_scores = self.model(input_batch)
            print("This is the output size for answer start scores from the "
                  "question answering model", answer_start_scores.size())
            print("This is the output for answer start scores from the "
                  "question answering model", answer_start_scores)
            print("This is the output size for answer end scores from the "
                  "question answering model", answer_end_scores.size())
            print("This is the output for answer end scores from the "
                  "question answering model", answer_end_scores)

            num_rows, num_cols = answer_start_scores.shape
            for i in range(num_rows):
                answer_start_scores_one_seq = answer_start_scores[i].unsqueeze(0)
                answer_start = torch.argmax(answer_start_scores_one_seq)
                answer_end_scores_one_seq = answer_end_scores[i].unsqueeze(0)
                answer_end = torch.argmax(answer_end_scores_one_seq) + 1
                prediction = self.tokenizer.convert_tokens_to_string(
                    self.tokenizer.convert_ids_to_tokens(
                        input_batch[i].tolist()[answer_start:answer_end]))
                inferences.append(prediction)
                logger.info("Model predicted: '%s'", prediction)
        # Handling inference for token_classification.
        elif self.setup_config["mode"] == "token_classification":
            outputs = self.model(input_batch)[0]
            print("This is the output size from the token classification model",
                  outputs.size())
            print("This is the output from the token classification model",
                  outputs)
            num_rows = outputs.shape[0]
            for i in range(num_rows):
                output = outputs[i].unsqueeze(0)
                predictions = torch.argmax(output, dim=2)
                tokens = self.tokenizer.tokenize(
                    self.tokenizer.decode(input_batch[i]))
                if self.mapping:
                    label_list = self.mapping["label_list"]
                    label_list = label_list.strip('][').split(', ')
                    prediction = [(token, label_list[prediction])
                                  for token, prediction in zip(
                                      tokens, predictions[0].tolist())]
                    inferences.append(prediction)
                    logger.info("Model predicted: '%s'", prediction)
        return inferences

    def postprocess(self, inference_output):
        """Post-process function that converts the predicted response into a
        TorchServe-readable format.

        Args:
            inference_output (list): It contains the predicted response for
                the input text.

        Returns:
            list: Returns a list of the predictions and explanations.
        """
        return inference_output

    def get_insights(self, input_batch, text, target):
        """This function calls the Layer Integrated Gradients algorithm to
        get the word importance of the input text.

        Args:
            input_batch (int): Batches of token IDs of the text
            text (str): The text specified in the input request
            target (int): The target can be set to any acceptable label
                under the user's discretion.

        Returns:
            list: Returns a list of importances and words.
        """
        if isinstance(text, (bytes, bytearray)):
            text = text.decode('utf-8')
        input_ids, ref_input_ids, attention_mask = construct_input_ref(
            text, self.tokenizer, self.device)
        all_tokens = get_word_token(input_ids, self.tokenizer)
        attributions, delta = self.lig.attribute(
            inputs=input_ids,
            baselines=ref_input_ids,
            target=target,  # the `target` argument (was `self.target`, which is never set)
            additional_forward_args=(attention_mask, 0, self.model),
            return_convergence_delta=True,
        )

        attributions_sum = summarize_attributions(attributions)
        response = {}
        response["importances"] = attributions_sum.tolist()
        response["words"] = all_tokens
        response["delta"] = delta[0].tolist()
        return [response]
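# Hedged sketch of querying the handler's explanations endpoint once the
# model is served with TorchServe; the model name "my_tc" and the default
# inference port are assumptions.
import requests

resp = requests.post("http://127.0.0.1:8080/explanations/my_tc",
                     data="Bloomberg has decided to publish a new report.")
print(resp.json())  # importances, words, and delta from get_insights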
def main(model_path, n_steps=50):  # pylint: disable=missing-docstring, too-many-locals
    # disable warning messages for the initial pretrained XLNet module.
    logging.basicConfig(level=logging.ERROR)
    n_steps = int(n_steps)
    # load the model and tokenizer
    model = load_model(str(model_path), device=torch.device("cpu"))
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

    # tokenize the sentence for classification
    sequence = """Some might not find it to totally be like Pokémon without
    Ash. But this is definitely a Pokémon movie and way better than most of
    their animated movies. The CGI nailed the looks of all the Pokémon
    creatures and their voices. The movie is charming, funny and fun as well.
    They did a great job introducing this world to the big screen. I
    definitely want more."""
    features = prepare_input(sequence, tokenizer)
    input_ids = features["input_ids"]
    token_type_ids = features["token_type_ids"]
    attention_mask = features["attention_mask"]

    # create a baseline of zeros in the same shape as the inputs
    baseline_ids = torch.zeros(input_ids.shape, dtype=torch.int64)

    # instance of layer intermediate gradients based upon the dummy layer
    # representing the embeddings
    lig = LayerIntermediateGradients(sequence_forward_func,
                                     model.transformer.batch_first)
    grads, step_sizes, intermediates = lig.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        target=1,
        n_steps=n_steps)
    print("Shape of the returned gradients: ")
    print(grads.shape)
    print("Shape of the step sizes: ")
    print(step_sizes.shape)

    # now calculate attributions from the intermediate gradients:
    # multiply by the step sizes
    scaled_grads = grads.view(n_steps, -1) * step_sizes
    # reshape and sum along the num_steps dimension
    scaled_grads = torch.sum(
        scaled_grads.reshape((n_steps, 1) + grads.shape[1:]), dim=0)
    # pass the input and baseline ids forward for reference
    forward_input_ids = model.transformer.word_embedding.forward(input_ids)
    forward_baseline_ids = model.transformer.word_embedding.forward(
        baseline_ids)
    # multiply the scaled gradients by the difference of inputs and baselines
    # to obtain attributions
    attributions = scaled_grads * (forward_input_ids - forward_baseline_ids)
    print("Attributions calculated from intermediate gradients: ")
    print(attributions.shape)
    print(attributions)

    # compare to layer integrated gradients
    layer_integrated = LayerIntegratedGradients(sequence_forward_func,
                                                model.transformer.batch_first)
    attrs = layer_integrated.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        additional_forward_args=(model, token_type_ids, attention_mask),
        n_steps=n_steps,
        target=1,
        return_convergence_delta=False)
    print("Attributions from layer integrated gradients: ")
    print(attrs.shape)
    print(attrs)

    print("Intermediate tensor shape: ", intermediates.shape)
    print("Intermediate tensor: ", intermediates)
for token_index in np.where(batch_labels_ndarray[0] != 0)[0]:
    if out_label_ids[example_index][token_index] != pad_token_label_id:
        label_id = example_preds[0][token_index]
        true_label = batch_labels[0][token_index].item()
        target = (token_index, label_id)
        logger.info(
            f'Calculating attribution for label {label_id} at index {token_index}'
        )
        attribution_start = time.time()
        # attributions, delta = explainer.attribute(
        #     input_embeddings, target=target,
        #     additional_forward_args=batch_labels,
        #     return_convergence_delta=True)
        attributions, delta = explainer.attribute(
            input_embeddings,
            target=target,
            additional_forward_args=(model, batch_labels),
            return_convergence_delta=True)
        attribution_end = time.time()
        attribution_duration = round(attribution_end - attribution_start, 2)
        logger.info(
            f'Attribution for label {label_id} took {attribution_duration} seconds'
        )
        attributions = attributions.sum(dim=2).squeeze(0)
        attributions_sum = attributions / torch.norm(attributions)
        example_attrs.append({
            'attributions': attributions_sum,
            'delta': delta,
            'token_index': token_index,
            'label_id': label_id,
        })
def captum_text_interpreter(text, model, bpetokenizer, idx2label, max_len=80,
                            tokenizer=None, multiclass=False):
    if type(text) == list:
        text = " ".join(text)

    d = data_utils.process_data_for_transformers(text, bpetokenizer,
                                                 tokenizer, 0)
    d = {
        "ids": torch.tensor([d['ids']], dtype=torch.long),
        "mask": torch.tensor([d['mask']], dtype=torch.long),
        "token_type_ids": torch.tensor([d['token_type_ids']],
                                       dtype=torch.long)
    }
    try:
        orig_tokens = [0] + bpetokenizer.encode(text).ids + [2]
        orig_tokens = [bpetokenizer.id_to_token(j) for j in orig_tokens]
    except Exception:
        orig_tokens = tokenizer.tokenize(text, add_special_tokens=True)

    model.eval()
    if multiclass:
        preds_proba = torch.sigmoid(
            model(d["ids"], d["mask"],
                  d["token_type_ids"])).detach().cpu().numpy()
        preds = preds_proba.argmax(-1)
        preds_proba = preds_proba[0][preds[0][0]]
        predicted_class = idx2label[preds[0][0]]
    else:
        preds_proba = torch.sigmoid(
            model(d["ids"], d["mask"],
                  d["token_type_ids"])).detach().cpu().numpy()
        preds = np.round(preds_proba)
        preds_proba = preds_proba[0][0]
        predicted_class = idx2label[preds[0][0]]

    lig = LayerIntegratedGradients(model, model.base_model.embeddings)

    reference_indices = [0] + [1] * (d["ids"].shape[1] - 2) + [2]
    reference_indices = torch.tensor([reference_indices], dtype=torch.long)

    attributions_ig, delta = lig.attribute(
        inputs=d["ids"],
        baselines=reference_indices,
        additional_forward_args=(d["mask"], d["token_type_ids"]),
        return_convergence_delta=True)
    attributions = attributions_ig.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.detach().cpu().numpy()

    visualization.visualize_text([
        visualization.VisualizationDataRecord(
            word_attributions=attributions,
            pred_prob=preds_proba,
            pred_class=predicted_class,
            true_class=predicted_class,
            attr_class=predicted_class,
            attr_score=attributions.sum(),
            raw_input=orig_tokens,
            convergence_score=delta)
    ])
def compute_and_output_attributions(outcome='top_level'):
    import pickle

    print('Loading data ...')
    if outcome == 'top_level':
        prepared_data_file = PREPARED_DATA_FILE_top_level
    elif outcome == 'mn_avg_eb':
        prepared_data_file = PREPARED_DATA_FILE_mn_avg_eb
    elif outcome == 'mn_avg_eb_adv':
        prepared_data_file = PREPARED_DATA_FILE_mn_avg_eb_adv
    elif outcome == 'perwht':
        prepared_data_file = PREPARED_DATA_FILE_perwht
    elif outcome == 'perfrl':
        prepared_data_file = PREPARED_DATA_FILE_perfrl
    else:
        prepared_data_file = PREPARED_DATA_FILE_mn_grd_eb

    df = pd.read_csv(RAW_DATA_FILE)
    with open(prepared_data_file, 'rb') as f:
        (all_input_ids, labels_target, attention_masks, sentences_per_school,
         url, perwht, perfrl, share_singleparent, totenrl, share_collegeplus,
         mail_returnrate) = pickle.load(f, encoding='latin1')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('Loading model ...')
    model, BEST_MODEL_DIR = get_best_model(outcome)
    model.to(device)
    model.zero_grad()

    # load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Define wrapper function for integrated gradients
    def bert_forward_wrapper(input_ids, num_sentences, attention_mask=None,
                             position=0):
        return model(input_ids, num_sentences, attention_mask=attention_mask)

    from captum.attr import TokenReferenceBase
    from captum.attr import IntegratedGradients, LayerIntegratedGradients
    from captum.attr import visualization as viz

    # We only want to compute IG over schools in our validation set
    data_splits = ['validation']

    all_summarized_attr = []
    input_ids_for_attr = []
    count = 0
    internal_batch_size = 12
    n_steps = 48

    OUTPUT_DIR = '{}interp/attributions/{}/'
    OUTPUT_FILE = OUTPUT_DIR + '{}_{}_loss_{}.json'
    if not os.path.exists(OUTPUT_DIR.format(BASE_DIR, BEST_MODEL_DIR)):
        os.makedirs(OUTPUT_DIR.format(BASE_DIR, BEST_MODEL_DIR))
    start_ind = len([
        int(f.split('_')[0])
        for f in os.listdir(OUTPUT_DIR.format(BASE_DIR, BEST_MODEL_DIR))
    ])

    for d in data_splits:
        # Standardize our outcome measure, as we did for training and validation
        outcome_key = outcome.split('_adv')[0]
        labels_target[d] = torch.FloatTensor(
            (labels_target[d] - np.mean(df[outcome_key])) /
            np.std(df[outcome_key]))
        n_schools = torch.LongTensor(all_input_ids[d]).size(0)
        print("num schools {} for {} split".format(n_schools, d))

        for i in range(start_ind, n_schools):
            print(d, i)
            count += 1

            # Prepare data
            input_ids = torch.LongTensor(
                [all_input_ids[d][i]]).squeeze(0).to(device)
            num_sentences = int(sentences_per_school[d][i])
            label_t = labels_target[d][i].unsqueeze(0).to(device)
            input_mask = torch.tensor(
                [attention_masks[d][i]]).squeeze(0).to(device)
            label_perfrl = torch.tensor([perfrl[d][i]]).to(device)
            label_perwht = torch.tensor([perwht[d][i]]).to(device)
            lable_share_singleparent = torch.tensor(
                [share_singleparent[d][i]]).to(device)
            label_totenrl = torch.tensor([totenrl[d][i]]).to(device)
            label_share_collegeplus = torch.tensor(
                [share_collegeplus[d][i]]).to(device)
            label_mail_returnrate = torch.tensor(
                [mail_returnrate[d][i]]).to(device)

            # Get the prediction for this example
            pred = model(input_ids, num_sentences, attention_mask=input_mask)
            mse = F.mse_loss(pred[0].unsqueeze_(0), label_t)

            # Generate reference sequence for integrated gradients
            ref_token_id = tokenizer.pad_token_id  # token used for generating the token reference
            token_reference = TokenReferenceBase(
                reference_token_idx=ref_token_id)
            ref_input_ids = token_reference.generate_reference(
                input_ids.size(0), device=device).unsqueeze(1).repeat(
                    1, input_ids.size(1)).long()

            # Compute integrated gradients
            lig = LayerIntegratedGradients(bert_forward_wrapper,
                                           model.bert.embeddings)
            attributions, conv_delta = lig.attribute(
                inputs=input_ids,
                baselines=ref_input_ids,
                additional_forward_args=(num_sentences, input_mask, 0),
                internal_batch_size=internal_batch_size,
                n_steps=n_steps,
                return_convergence_delta=True)

            # Sum attributions over each hidden dimension describing a token
            summarized_attr = attributions.sum(dim=-1).squeeze(0)
            n_sent = summarized_attr.size(0)
            attr_for_school_sents = defaultdict(dict)

            # Iterate over sentences and store the attributions per token in
            # each sentence
            for j in range(0, n_sent):
                indices = input_ids[j].detach().squeeze(0).tolist()
                all_tokens = tokenizer.convert_ids_to_tokens(indices)
                attr_for_school_sents[j]['tokens'] = all_tokens
                attr_for_school_sents[j]['attributions'] = \
                    summarized_attr[j].tolist()
                assert (len(attr_for_school_sents[j]['tokens']) == len(
                    attr_for_school_sents[j]['attributions']))
            f = open(OUTPUT_FILE.format(BASE_DIR, BEST_MODEL_DIR, i, d, mse),
                     'w')
            f.write(json.dumps(attr_for_school_sents, indent=4))
            f.close()
# The snippet opens mid-way through a DataLoader construction; the loader and
# dataset variable names are not shown, so `validloader` and `valid_dataset`
# here are assumptions.
validloader = DataLoader(valid_dataset,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=use_cpu,
                         pin_memory=True)

if use_gpu:
    model = model.cuda()

for _, val_batch_data in enumerate(validloader):
    cur_batch_size = val_batch_data[0].size(0)
    exe_input = val_batch_data[0].cuda() if use_gpu else val_batch_data[0]
    exe_input = Variable(exe_input.long(), requires_grad=False)
    baseline = torch.zeros(1, 2000000)

    # ig = IntegratedGradients(model)
    lig = LayerIntegratedGradients(model, model.embed)
    attributions_ig, delta = lig.attribute(exe_input,
                                           baseline,
                                           n_steps=500,
                                           return_convergence_delta=True)
    # attributions, delta = ig.attribute(input, baseline, target=0,
    #                                    return_convergence_delta=True)
    print('IG Attributions:', attributions_ig)
    print('Convergence Delta:', delta)

"""
heatmap
"""
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from matplotlib import cm
from numpy.random import randn


def heatmap(data, row_labels, col_labels, ax=None, cbar_kw={},
            cbarlabel="", **kwargs):
def interpret_main(text, label):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = load_model(
        '/Users/andrewmendez1/Documents/ai-ml-challenge-2020/data/Finetune BERT oversampling 8_16_2020/Model_1_4_0/model.pt',
        device)

    def predict(inputs):
        # print('model(inputs): ', model(inputs))
        return model.encoder(inputs)[0]

    def custom_forward(inputs):
        preds = predict(inputs)
        return torch.softmax(preds, dim=1)[:, 0]

    # load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ref_token_id = tokenizer.pad_token_id  # token used for generating the token reference
    sep_token_id = tokenizer.sep_token_id  # separator between question and text; also appended to the end of the text
    cls_token_id = tokenizer.cls_token_id  # token prepended to the concatenated question-text word sequence

    hook = model.encoder.bert.embeddings.register_forward_hook(save_act)
    hook.remove()

    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(
        text, tokenizer, device, ref_token_id, sep_token_id, cls_token_id)
    token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(
        input_ids, device, sep_id)
    position_ids, ref_position_ids = construct_input_ref_pos_id_pair(
        input_ids, device)
    attention_mask = construct_attention_mask(input_ids)

    # text = "the exclusion of implied warranties is not permitted by some the above exclusion may not apply to"  # label 0
    lig = LayerIntegratedGradients(custom_forward,
                                   model.encoder.bert.embeddings)
    # attributions_main, delta_main = lig.attribute(
    #     inputs=input_ids, baselines=ref_input_ids,
    #     return_convergence_delta=True, n_steps=30)
    t0 = time()
    attributions, delta = lig.attribute(
        inputs=input_ids,
        baselines=ref_input_ids,
        # n_steps=7000,
        # internal_batch_size=5,
        return_convergence_delta=True,
        n_steps=300)
    st.write("Time to complete interpretation: {} seconds".format(time() - t0))
    # print("Time in {} minutes".format((time() - t0) / 60))

    attributions_sum = summarize_attributions(attributions)
    all_tokens = tokenizer.convert_ids_to_tokens(
        input_ids[0].detach().tolist())
    top_tokens, values, indicies = get_topk_attributed_tokens(
        attributions_sum, all_tokens, k=7)

    st.subheader("Top tokens that drove the model's unacceptability decision")
    import numpy as np
    plt.figure(figsize=(12, 6))
    x_pos = np.arange(len(values))
    plt.bar(x_pos, values.detach().numpy(), align='center')
    plt.xticks(x_pos, top_tokens, wrap=True)
    plt.xlabel("Tokens")
    plt.title(
        "Top tokens that made the model classify the clause as unacceptable")
    st.pyplot()

    st.subheader(
        "Detailed table showing the attribution score for each word in the clause")
    st.write(" ")
    st.write(
        'Positive attributions mean that the words/tokens contributed '
        '"positively" to the model\'s prediction.'
    )
    st.write(
        'Negative attributions mean that the words/tokens contributed '
        '"negatively" to the model\'s prediction.'
    )
    # res = ['{}({}) {:.3f}'.format(token, str(i), attributions_sum[i])
    #        for i, token in enumerate(all_tokens)]
    df = pd.DataFrame({
        'Words': all_tokens,
        'Attributions': attributions_sum.detach().numpy()
    })
    st.table(df)

    score = predict(input_ids)
    score_vis = viz.VisualizationDataRecord(
        attributions_sum,
        torch.softmax(score, dim=1)[0][0],
        torch.argmax(torch.softmax(score, dim=1)[0]),
        label, text, attributions_sum.sum(), all_tokens, delta)
    print('\033[1m', 'Visualization For Score', '\033[0m')
    # from IPython.display import display, HTML, Image
    # viz.visualize_text([score_vis])
    # st.write(display(Image(viz.visualize_text([score_vis]))))
    # open('output.png', 'wb').write(im.data)
    # st.pyplot()


# sample clause for quick testing:
# text = "this license shall be effective until company in its sole and absolute at any time and for any or no disable the or suspend or terminate this license and the rights afforded to you with or without prior notice or other action by upon the termination of this you shall cease all use of the app and uninstall the company will not be liable to you or any third party for or damages of any sort as a result of terminating this license in accordance with its and termination of this license will be without prejudice to any other right or remedy company may now or in the these obligations survive termination of this"
# # label = 1
# label = "?"
# main(text, label)
for epoch in range(30):
    print(epoch)
    for sample in Data.train_iter:
        text = sample.text.permute(1, 0).to(device)
        label = sample.label.to(device)

        black_idxs = []
        for i, (t, l) in enumerate(zip(text, label)):
            black_idx = black_index_list[int(l)]
            for idx in black_idx:
                target_idx = torch.where(t == idx)[0]
                if list(target_idx.size())[0] == 1:
                    black_idxs.append((i, target_idx.item()))

        model.train()
        optimizer.zero_grad()
        output = model(text)
        attributions, _ = lig.attribute(text,
                                        reference_tokens,
                                        target=label,
                                        return_convergence_delta=True)
        attributions = attributions.sum(dim=2).squeeze(0)
        attributions = attributions / torch.norm(attributions)
        target_attr = attributions.detach().clone()
        for i, j in black_idxs:
            target_attr[i, j] = 0

        loss_mse = alpha * criterion_mse(attributions, target_attr)
        loss_ce = criterion_cls(output, label)
        print(loss_mse.item(), loss_ce.item())
        loss = loss_mse + loss_ce
        loss.backward()
        optimizer.step()

    print("wo toxic:")
    evaluate(model, Data.valid_iter)
    print("with toxic:")
results = []
Result = namedtuple("result", "words label attribution")
for sample in tqdm(train_data, ncols=100):
    words = tokenizer.preprocess(sample.text)
    if len(words) < seq_length:
        words += ['<pad>'] * (seq_length - len(words))
    elif len(words) > seq_length:
        words = words[:seq_length]
    tokens = [tokenizer.vocab.stoi[word] for word in words]
    tokens = torch.LongTensor(tokens).unsqueeze(0).to(device)
    reference_tokens = token_reference.generate_reference(
        seq_length, device=device).unsqueeze(0)

    pred = model(tokens)
    plabel = int(torch.argmax(pred, 1))
    pred = pred.tolist()[0][plabel]
    attributions, delta = lig.attribute(tokens,
                                        reference_tokens,
                                        target=sample.label,
                                        return_convergence_delta=True)
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    unpad_index = [idx for idx, word in enumerate(words) if word != '<pad>']
    unpad_words = [word for word in words if word != '<pad>']
    unpad_attributions = attributions[unpad_index]
    results.append(Result(words=unpad_words,
                          label=sample.label,
                          attribution=unpad_attributions))

with open("checkpoints/results-CNN-debias.jl", "wb") as f:
    joblib.dump([tuple(result) for result in results], f)
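# Hedged companion sketch: reading the dumped results back with joblib;
# assumes the file written above exists.
import joblib

with open("checkpoints/results-CNN-debias.jl", "rb") as f:
    loaded = joblib.load(f)
words, label, attribution = loaded[0]  # each entry mirrors the Result tuple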