ff.write("title\ttoken\tscore\n") for i, x in tqdm(f.iterrows()): sentence_a = str(x["text"]).replace("\n", "。").replace("〝", "").replace( "〞", "").replace("「", "").replace("」", "").strip() sentence_b = x["title"].replace("\n", "").replace("〝", "").replace( "〞", "").replace("「", "").replace("」", "").strip() if len(sentence_a) > 512 or len(sentence_a) > 512: too_long += 1 sentence_a = sentence_a[:512] sentence_b = sentence_b[:512] details_data = AttentionDetailsData(model, tokenizer) tokens_a, tokens_b, queries, keys, atts = details_data.get_data( sentence_a, sentence_b) attentions = _get_attention_details(tokens_a, tokens_b, queries, keys, atts) q_x_k_score = np.zeros((len(tokens_a), )) for j, k in enumerate(tokens_b): config = { "attention": attentions, "att_type": "ba", "vector_size": 64, "layer": 9, "att_head": 6, "query_index": j
# Imports assumed for this test module; the module paths follow the bertviz fork's
# layout (bertviz.pytorch_pretrained_bert, bertviz.attention_details) and may need
# adjusting to the local repository.
import unittest

import numpy as np

from bertviz.pytorch_pretrained_bert import BertConfig, BertModel, BertTokenizer
from bertviz.attention_details import AttentionDetailsData, _get_attention_details


class TestAttentionDetails(unittest.TestCase):

    def setUp(self):
        self.config = BertConfig.from_json_file('fixtures/config.json')
        model = BertModel(self.config)
        tokenizer = BertTokenizer('fixtures/vocab.txt')
        self.attention_details_data = AttentionDetailsData(model, tokenizer)

    def test_get_inputs(self):
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        tokens_ids1 = [2, 3, 4, 5, 6, 8, 9, 2, 14, 12]
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        token_ids2 = [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        tokens_tensor, token_type_tensor, tokens_a, tokens_b = self.attention_details_data._get_inputs(
            sentence1, sentence2)
        cls_id = 17
        sep_id = 16
        self.assertEqual(tokens_tensor.tolist()[0],
                         [cls_id] + tokens_ids1 + [sep_id] + token_ids2 + [sep_id])
        self.assertEqual(token_type_tensor.tolist()[0], ([0] * 12) + ([1] * 13))

    def test_get_data(self):
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        tokens1 = [
            'the', 'quick', '##est', 'brown', 'fox', 'jumped', 'over', 'the',
            'lazy', 'dog'
        ]
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = [
            'the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la',
            '##zie', '##st', 'lazy', '[UNK]'
        ]
        tokens_a, tokens_b, queries, keys, atts = self.attention_details_data.get_data(
            sentence1, sentence2)
        self.assertEqual(tokens_a, ['[CLS]'] + tokens1 + ['[SEP]'])
        self.assertEqual(tokens_b, tokens2 + ['[SEP]'])
        batch_size = 1
        query_key_size = self.config.hidden_size / self.config.num_attention_heads
        seq_len = len(tokens_a) + len(tokens_b)
        expected_shape = (self.config.num_hidden_layers, batch_size,
                          self.config.num_attention_heads, seq_len, query_key_size)
        self.assertEqual(queries.shape, expected_shape)
        self.assertEqual(keys.shape, expected_shape)
        expected_shape = (self.config.num_hidden_layers, batch_size,
                          self.config.num_attention_heads, seq_len, seq_len)
        self.assertEqual(atts.shape, expected_shape)

    def test_get_attention_details(self):
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens_a, tokens_b, queries, keys, atts = self.attention_details_data.get_data(
            sentence1, sentence2)
        attention_details = _get_attention_details(tokens_a, tokens_b, queries,
                                                   keys, atts)
        queries_squeezed = np.squeeze(queries)
        expected_all_queries = queries_squeezed.tolist()
        self.assertEqual(attention_details['all']['queries'], expected_all_queries)
        keys_squeezed = np.squeeze(keys)
        expected_all_keys = keys_squeezed.tolist()
        self.assertEqual(attention_details['all']['keys'], expected_all_keys)
        num_layers = self.config.num_hidden_layers
        num_heads = self.config.num_attention_heads
        vector_size = self.config.hidden_size / num_heads
        self.assertEqual(
            np.array(attention_details['aa']['queries']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        self.assertEqual(
            np.array(attention_details['aa']['keys']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        self.assertEqual(
            np.array(attention_details['bb']['queries']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['bb']['keys']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['ab']['queries']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        self.assertEqual(
            np.array(attention_details['ab']['keys']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['ba']['queries']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['ba']['keys']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        atts_squeezed = np.squeeze(atts)
        expected_all_attention = atts_squeezed.tolist()
        self.assertEqual(attention_details['all']['att'], expected_all_attention)
        # The four quadrants ('aa', 'ab', 'ba', 'bb') should tile the full attention matrix.
        attn_a = np.array(attention_details['aa']['att'])
        attn_b = np.array(attention_details['bb']['att'])
        attn_ab = np.array(attention_details['ab']['att'])
        attn_ba = np.array(attention_details['ba']['att'])
        expected_top_half = atts_squeezed[:, :, :len(tokens_a), :]
        top_half = np.concatenate((attn_a, attn_ab), axis=-1)
        self.assertEqual(top_half.shape, expected_top_half.shape)
        self.assertTrue(np.array_equal(top_half, expected_top_half))
        expected_bottom_half = atts_squeezed[:, :, len(tokens_a):, :]
        bottom_half = np.concatenate((attn_ba, attn_b), axis=-1)
        self.assertEqual(bottom_half.shape, expected_bottom_half.shape)
        full_attention = np.concatenate((top_half, bottom_half), axis=-2)
        self.assertEqual(full_attention.shape, atts_squeezed.shape)
        self.assertTrue(np.allclose(full_attention, atts_squeezed, atol=1e-06))
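# Outside the unit tests, the same plumbing can be pointed at a pretrained
# checkpoint instead of the fixture config and vocabulary. A minimal sketch,
# assuming the bertviz-style module paths used above; the checkpoint name and
# example sentences are illustrative, not from the source.
import numpy as np

from bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer
from bertviz.attention_details import AttentionDetailsData, _get_attention_details

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

details_data = AttentionDetailsData(model, tokenizer)
tokens_a, tokens_b, queries, keys, atts = details_data.get_data(
    'The cat sat on the mat', 'It was tired')
attentions = _get_attention_details(tokens_a, tokens_b, queries, keys, atts)

# Top-right quadrant: sentence-A queries attending to sentence-B keys.
print(np.array(attentions['ab']['att']).shape)
# -> (num_layers, num_heads, len(tokens_a), len(tokens_b))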
# Imports assumed by this function; numpy and fuzzywuzzy are external dependencies,
# while get_attention_details and contains_word are project-level helpers expected
# to be in scope (module paths follow the bertviz fork referenced in the docstring).
import numpy as np
from fuzzywuzzy import fuzz
from bertviz.attention_details import AttentionDetailsData


def analyzeAttentionSingleTupleDecoy(model, tokenizer, data, guid_dict=None,
                                     select_guid=None, num_layers=12,
                                     num_heads=12, do_debug=False):
    """
    Extracts the attention of target words, e.g. the ground-truth and decoy words.
    Note: if a target word is split into multiple word pieces, only the maximum
    attention over its pieces is kept.

    Parameters
    ----------
    model : bertviz.pytorch_pretrained_bert.modeling.BertModel
        BERT model from the BERT visualization package that provides access to attention
    tokenizer : bertviz.pytorch_pretrained_bert.tokenization.BertTokenizer
        BERT tokenizer
    data : list of InputItems
        List of InputItems containing the WNLI/PDP data
    guid_dict : dict
        Dictionary that maps unique ids to data indices. Default None
    select_guid : int
        GUID of the example for which the attentions are to be extracted
    num_layers : int
        Number of layers. Default 12
    num_heads : int
        Number of attention heads. Default 12
    do_debug : bool
        Toggle for printing debug information. Default False

    Returns
    -------
    activity : ndarray
        Count matrix (#layers, #heads, len(decoy) + 1), keeping track of which layer
        and head assign their maximum attention to which candidate
    attention_matrix : ndarray
        Attention matrix (#layers, #heads, len(decoy) + 1), containing for each layer
        and head the attention for the true word and each decoy, respectively
    """
    problem_list = set()
    activity = np.zeros((num_layers, num_heads))
    if select_guid is None:
        elements = range(0, len(data))
    else:
        assert guid_dict is not None
        assert select_guid is not None
        elements = [guid_dict[select_guid]]
    for idx in elements:
        sentence_a = data[idx].text_a
        sentence_b = data[idx].text_b
        groundtruth = data[idx].groundtruth
        guid = data[idx].guid
        decoy = data[idx].decoy
        if groundtruth is not None:
            details_data = AttentionDetailsData(model, tokenizer)
            tokens_a, tokens_b, queries, keys, atts = details_data.get_data(sentence_a, sentence_b)
            attentions = get_attention_details(tokens_a, tokens_b, queries, keys, atts)
            groundtruth_tokens = tokenizer.tokenize(data[idx].groundtruth)
            activity = np.zeros((num_layers, num_heads, len(decoy) + 1))
            attention_matrix = np.zeros((num_layers, num_heads, len(decoy) + 1))
            reference_idx = data[idx].reference_idx
            if tokenizer.tokenize(groundtruth)[0] not in sentence_a and tokenizer.tokenize(groundtruth)[0] not in sentence_b:
                print('Wrong annotation: ' + sentence_a + ' | ' + groundtruth + ' | ' + sentence_b)
                continue
            for layer_id in range(0, num_layers):
                for head_id in range(0, num_heads):
                    attention_pairwise = np.asarray(attentions['ab']['att'][layer_id][head_id])
                    correct_activity = 0
                    indices = []
                    # determine attention for the correct word;
                    # check whether the correct word is in sentence_a or sentence_b
                    if contains_word(sentence_a, groundtruth_tokens[0]):
                        # check if target is single or multi-token
                        if len(tokenizer.tokenize(groundtruth)) == 1:
                            # some answers are not perfect matches or are misspellings,
                            # e.g. plural piece(s), so fuzzy matching is necessary
                            ratios = [fuzz.ratio(groundtruth, token) for token in tokens_a]
                            best_match_idx = ratios.index(max(ratios))
                            correct_activity = attention_pairwise[best_match_idx, reference_idx]
                            indices.append(best_match_idx)
                        # target stretches over multiple tokens
                        else:
                            groundtruth_split = tokenizer.tokenize(groundtruth)
                            local_attention = []
                            for f in groundtruth_split:
                                if len(f) > 1:
                                    try:
                                        local_attention.append(attention_pairwise[tokens_a.index(f), reference_idx])
                                        indices.append(tokens_a.index(f))
                                    except ValueError:
                                        problem_list.add(guid)
                            # keep max attention
                            if len(local_attention) > 0:
                                correct_activity = np.max(local_attention)
                    else:
                        # check if target is single or multi-token
                        if len(tokenizer.tokenize(groundtruth)) == 1:
                            correct_activity = attention_pairwise[reference_idx, tokens_b.index(groundtruth)]
                            indices.append(tokens_b.index(groundtruth))
                        # target stretches over multiple tokens
                        else:
                            groundtruth_split = tokenizer.tokenize(groundtruth)
                            local_attention = []
                            for f in groundtruth_split:
                                if len(f) > 1:
                                    local_attention.append(attention_pairwise[reference_idx, tokens_b.index(f)])
                                    indices.append(tokens_b.index(f))
                            # keep max attention
                            correct_activity = np.max(local_attention)
                    # determine attention for the decoy words
                    decoy_attention = []
                    if contains_word(sentence_a, groundtruth_tokens[0]):
                        for k in decoy:
                            # check if target is single or multi-token
                            if len(tokenizer.tokenize(k)) == 1:
                                # some answers are not perfect matches or are misspellings,
                                # e.g. plural piece(s), so fuzzy matching is necessary
                                ratios = [fuzz.ratio(k, token) for token in tokens_a]
                                best_match_idx = ratios.index(max(ratios))
                                decoy_attention.append(attention_pairwise[best_match_idx, reference_idx])
                                indices.append(best_match_idx)
                            else:
                                decoy_split = tokenizer.tokenize(k)
                                local_attention = []
                                for f in decoy_split:
                                    if len(f) > 1:
                                        try:
                                            local_attention.append(attention_pairwise[tokens_a.index(f), reference_idx])
                                            indices.append(tokens_a.index(f))
                                        except ValueError:
                                            problem_list.add(guid)
                                if len(local_attention) > 0:
                                    decoy_attention.append(np.max(local_attention))
                                else:
                                    decoy_attention.append(0)
                    else:
                        for k in decoy:
                            # check if target is single or multi-token
                            if len(tokenizer.tokenize(k)) == 1:
                                decoy_attention.append(attention_pairwise[reference_idx, tokens_b.index(k)])
                            else:
                                decoy_split = tokenizer.tokenize(k)
                                local_attention = []
                                for f in decoy_split:
                                    if len(f) > 1:
                                        # some answers are not perfect matches or are misspellings,
                                        # e.g. plural piece(s), so fuzzy matching is necessary
                                        ratios = [fuzz.ratio(f, token) for token in tokens_b]
                                        best_match_idx = ratios.index(max(ratios))
                                        local_attention.append(attention_pairwise[reference_idx, best_match_idx])
                                        indices.append(best_match_idx)
                                if len(local_attention) > 0:
                                    decoy_attention.append(np.max(local_attention))
                                else:
                                    decoy_attention.append(0)
                    attn = [correct_activity] + decoy_attention
                    # index as (layer, head, candidate) to match the array shapes above
                    activity[layer_id, head_id, np.argmax(attn)] += 1
                    attention_matrix[layer_id, head_id, :] = np.asarray(attn)
    if do_debug and len(problem_list) > 0:
        print('Problems with following guids: ' + str(problem_list))
    return activity, attention_matrix
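# A minimal sketch of how analyzeAttentionSingleTupleDecoy might be driven end to
# end. SimpleNamespace stands in for the project's InputItem and exposes exactly
# the attributes the function reads (guid, text_a, text_b, groundtruth, decoy,
# reference_idx); the checkpoint name, example sentences, and reference_idx value
# are illustrative assumptions, not taken from the source.
import numpy as np
from types import SimpleNamespace

from bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# One Winograd-style example; reference_idx is used by the function as an index
# into the pairwise attention map (the value here is for illustration only).
item = SimpleNamespace(
    guid=0,
    text_a='The trophy does not fit into the suitcase because it is too big.',
    text_b='The trophy is too big.',
    groundtruth='trophy',
    decoy=['suitcase'],
    reference_idx=1,
)

activity, attention_matrix = analyzeAttentionSingleTupleDecoy(
    model, tokenizer, [item], guid_dict={0: 0}, select_guid=0)

# Column 0 holds the attention on the ground-truth word, the remaining columns
# the attention on each decoy.
layer, head = np.unravel_index(attention_matrix[:, :, 0].argmax(),
                               attention_matrix[:, :, 0].shape)
print('Strongest ground-truth attention at layer %d, head %d' % (layer, head))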