def predict_and_fit_locality(args, batch, model, tokenizer, batch_features, batch_examples):
    """Predict the answer span for a single example, then fit perturbation-based token importances for it."""
    model.eval()
    batch = tuple(t.to(args.device) for t in batch)
    # only allow batch size 1
    assert batch[0].size(0) == 1

    # run predictions
    with torch.no_grad():
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }
        if args.model_type in ["roberta", "distilbert", "camembert", "bart"]:
            del inputs["token_type_ids"]
        feature_indices = batch[3]
        outputs = model.restricted_forward(**inputs)
        batch_start_logits, batch_end_logits = outputs

    # wrap the logits into SquadResult objects for answer decoding
    batch_results = []
    for i, feature_index in enumerate(feature_indices):
        eval_feature = batch_features[i]
        unique_id = int(eval_feature.unique_id)
        output = [to_list(output[i]) for output in outputs]
        start_logits, end_logits = output
        result = SquadResult(unique_id, start_logits, end_logits)
        batch_results.append(result)

    batch_prelim_results, batch_predictions = compute_predictions_index_and_logits(
        batch_examples,
        batch_features,
        batch_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        tokenizer,
        args.dataset,
    )

    # run attributions
    batch_start_indexes = torch.LongTensor([x.start_index for x in batch_prelim_results]).to(args.device)
    batch_end_indexes = torch.LongTensor([x.end_index for x in batch_prelim_results]).to(args.device)

    # for data parallel
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
        "start_indexes": batch_start_indexes,
        "end_indexes": batch_end_indexes,
        "final_start_logits": batch_start_logits,
        "final_end_logits": batch_end_logits,
    }
    if args.model_type in ["roberta", "distilbert", "camembert", "bart"]:
        del inputs["token_type_ids"]

    with torch.no_grad():
        importances = fit_locality(args, tokenizer, model, inputs, batch_features[0])
    return batch_predictions, batch_prelim_results, importances
def perturb_interp(args, model, tokenizer, prefix=""):
    """Run perturbation-based interpretation over the eval set and return evaluation metrics."""
    if not os.path.exists(args.interp_dir):
        os.makedirs(args.interp_dir)

    # fix (freeze) the model parameters
    model.requires_grad_(False)
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    # assume a one-to-one mapping between examples and features
    assert len(examples) == len(features)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = 1
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_predictions = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        feature_indices = to_list(batch[3])
        batch_features = [features[i] for i in feature_indices]
        batch_examples = [examples[i] for i in feature_indices]

        # get preliminary results, predictions, and token importances for this batch
        batch = remove_padding(batch, batch_features[0])
        batch_predictions, batch_prelim_results, batch_attributions = predict_and_fit_locality(
            args, batch, model, tokenizer, batch_features, batch_examples)

        # lots of info, dump to files immediately
        dump_token_interp_info(args, batch_examples, batch_features, tokenizer, batch_predictions,
                               batch_prelim_results, batch_attributions)
        all_predictions.append(batch_predictions)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    all_predictions = merge_predictions(all_predictions)
    results = hotpot_evaluate(examples[:len(all_predictions)], all_predictions)
    return results
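
# Usage sketch (an assumption, not part of the original script): `perturb_interp`
# expects an argparse-style namespace exposing the fields referenced above
# (device, model_type, interp_dir, output_dir, local_rank, n_best_size,
# max_answer_length, do_lower_case, dataset), and a model that implements
# `restricted_forward` and is compatible with `fit_locality`. A hypothetical driver:
#
#     model.to(args.device)
#     results = perturb_interp(args, model, tokenizer, prefix="perturb")
#     logger.info("perturbation interpretation metrics: %s", results)
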
def predict_and_layerwise_attribute(args, batch, model, tokenizer, batch_features, batch_examples):
    """Predict the answer span for a single example, then compute layer-wise attributions for it."""
    model.eval()
    batch = tuple(t.to(args.device) for t in batch)
    num_layers = model.num_hidden_layers

    # run predictions, also collecting per-layer attention maps
    with torch.no_grad():
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
            "output_attentions": True,
        }
        if args.model_type in ["roberta", "distilbert", "camembert", "bart"]:
            del inputs["token_type_ids"]
        feature_indices = batch[3]
        outputs = model.restricted_forward(**inputs)
        batch_start_logits, batch_end_logits, batch_attentions = outputs
        # keep only the logits for answer decoding
        outputs = outputs[:-1]

    batch_results = []
    for i, feature_index in enumerate(feature_indices):
        eval_feature = batch_features[i]
        unique_id = int(eval_feature.unique_id)
        output = [to_list(output[i]) for output in outputs]
        start_logits, end_logits = output
        result = SquadResult(unique_id, start_logits, end_logits)
        batch_results.append(result)

    batch_prelim_results, batch_predictions = compute_predictions_index_and_logits(
        batch_examples,
        batch_features,
        batch_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        tokenizer,
        args.dataset,
    )

    # run attributions
    batch_start_indexes = torch.LongTensor([x.start_index for x in batch_prelim_results]).to(args.device)
    batch_end_indexes = torch.LongTensor([x.end_index for x in batch_prelim_results]).to(args.device)
    # stack the per-layer attention tuple into a single tensor
    batch_attentions = torch.stack(batch_attentions)
    active_layers = [1 for _ in range(num_layers)]

    # for data parallel
    inputs = {
        "input_ids": batch[0],
        "attention_mask": batch[1],
        "token_type_ids": batch[2],
        "active_layers": active_layers,
        "input_attentions": batch_attentions,
        "start_indexes": batch_start_indexes,
        "end_indexes": batch_end_indexes,
        "final_start_logits": batch_start_logits,
        "final_end_logits": batch_end_logits,
        "num_steps": args.ig_steps,
    }
    if args.model_type in ["roberta", "distilbert", "camembert", "bart"]:
        del inputs["token_type_ids"]
    batch_attributions = model.layer_attribute(**inputs)
    # attributions are in logit space
    return batch_predictions, batch_prelim_results, batch_attentions, batch_attributions
def attention_interp(args, model, tokenizer, prefix=""):
    """Run layer-wise attention attribution over the eval set and return evaluation metrics."""
    if not os.path.exists(args.interp_dir):
        os.makedirs(args.interp_dir)

    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    # fix (freeze) the model parameters
    model.requires_grad_(False)
    # assume a one-to-one mapping between examples and features
    assert len(examples) == len(features)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # restrict the eval batch size: attribution only supports one example per step on a single GPU
    assert args.per_gpu_eval_batch_size == 1 and args.n_gpu <= 1
    args.eval_batch_size = 1

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_predictions = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        feature_indices = to_list(batch[3])
        batch_features = [features[i] for i in feature_indices]
        batch_examples = [examples[i] for i in feature_indices]

        # get preliminary results, predictions, attentions, and attributions for this batch
        batch = remove_padding(batch, batch_features[0])
        batch_predictions, batch_prelim_results, batch_attentions, batch_attributions = predict_and_layerwise_attribute(
            args, batch, model, tokenizer, batch_features, batch_examples
        )

        # lots of info, dump to files immediately
        dump_attention_interp_info(args, batch_examples, batch_features, tokenizer, batch_predictions,
                                   batch_prelim_results, batch_attentions, batch_attributions)
        all_predictions.append(batch_predictions)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute the F1 and exact scores.
    all_predictions = merge_predictions(all_predictions)
    results = hotpot_evaluate(examples[:len(all_predictions)], all_predictions)
    return results
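
# Usage sketch (an assumption, not part of the original script): `attention_interp`
# additionally reads `args.ig_steps` (the number of interpolation steps passed to
# `model.layer_attribute`, which suggests an integrated-gradients-style attribution)
# and requires `args.per_gpu_eval_batch_size == 1` with at most one GPU. The model
# must implement `restricted_forward(..., output_attentions=True)` and
# `layer_attribute`. A hypothetical driver:
#
#     model.to(args.device)
#     results = attention_interp(args, model, tokenizer, prefix="attention")
#     logger.info("attention attribution metrics: %s", results)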