Code example #1
def setUp(self):
    self.config = BertConfig.from_json_file('fixtures/config.json')
    model = BertModel(self.config)
    tokenizer = BertTokenizer('fixtures/vocab.txt')
    self.attention_details_data = AttentionDetailsData(model, tokenizer)
Code example #2
ff.write("title\ttoken\tscore\n")

for i, x in tqdm(f.iterrows()):

    sentence_a = str(x["text"]).replace("\n", "。").replace("〝", "").replace(
        "〞", "").replace("「", "").replace("」", "").strip()
    sentence_b = x["title"].replace("\n", "").replace("〝", "").replace(
        "〞", "").replace("「", "").replace("」", "").strip()

    if len(sentence_a) > 512 or len(sentence_b) > 512:
        too_long += 1
        sentence_a = sentence_a[:512]
        sentence_b = sentence_b[:512]

    details_data = AttentionDetailsData(model, tokenizer)
    tokens_a, tokens_b, queries, keys, atts = details_data.get_data(
        sentence_a, sentence_b)
    attentions = _get_attention_details(tokens_a, tokens_b, queries, keys,
                                        atts)
    q_x_k_score = np.zeros((len(tokens_a), ))

    for j, k in enumerate(tokens_b):

        config = {
            "attention": attentions,
            "att_type": "ba",
            "vector_size": 64,
            "layer": 9,
            "att_head": 6,
            "query_index": j
Code example #3
# Imports assumed for this test module; the bertviz module paths may differ between versions.
import unittest

import numpy as np

from bertviz.attention_details import AttentionDetailsData, _get_attention_details
from bertviz.pytorch_pretrained_bert.modeling import BertConfig, BertModel
from bertviz.pytorch_pretrained_bert.tokenization import BertTokenizer


class TestAttentionDetails(unittest.TestCase):
    def setUp(self):
        self.config = BertConfig.from_json_file('fixtures/config.json')
        model = BertModel(self.config)
        tokenizer = BertTokenizer('fixtures/vocab.txt')
        self.attention_details_data = AttentionDetailsData(model, tokenizer)

    def test_get_inputs(self):
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        tokens_ids1 = [2, 3, 4, 5, 6, 8, 9, 2, 14, 12]
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        token_ids2 = [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        tokens_tensor, token_type_tensor, tokens_a, tokens_b = self.attention_details_data._get_inputs(
            sentence1, sentence2)
        cls_id = 17
        sep_id = 16
        self.assertEqual(tokens_tensor.tolist()[0], [cls_id] + tokens_ids1 +
                         [sep_id] + token_ids2 + [sep_id])
        self.assertEqual(token_type_tensor.tolist()[0],
                         ([0] * 12) + ([1] * 13))

    def test_get_data(self):
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        tokens1 = [
            'the', 'quick', '##est', 'brown', 'fox', 'jumped', 'over', 'the',
            'lazy', 'dog'
        ]
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = [
            'the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la',
            '##zie', '##st', 'lazy', '[UNK]'
        ]
        tokens_a, tokens_b, queries, keys, atts = self.attention_details_data.get_data(
            sentence1, sentence2)
        self.assertEqual(tokens_a, ['[CLS]'] + tokens1 + ['[SEP]'])
        self.assertEqual(tokens_b, tokens2 + ['[SEP]'])
        batch_size = 1
        query_key_size = self.config.hidden_size / self.config.num_attention_heads
        seq_len = len(tokens_a) + len(tokens_b)
        expected_shape = (self.config.num_hidden_layers, batch_size,
                          self.config.num_attention_heads, seq_len,
                          query_key_size)
        self.assertEqual(queries.shape, expected_shape)
        self.assertEqual(keys.shape, expected_shape)
        expected_shape = (self.config.num_hidden_layers, batch_size,
                          self.config.num_attention_heads, seq_len, seq_len)
        self.assertEqual(atts.shape, expected_shape)

    def test_get_attention_details(self):
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens_a, tokens_b, queries, keys, atts = self.attention_details_data.get_data(
            sentence1, sentence2)
        attention_details = _get_attention_details(tokens_a, tokens_b, queries,
                                                   keys, atts)
        queries_squeezed = np.squeeze(queries)
        expected_all_queries = queries_squeezed.tolist()
        self.assertEqual(attention_details['all']['queries'],
                         expected_all_queries)
        keys_squeezed = np.squeeze(keys)
        expected_all_keys = keys_squeezed.tolist()
        self.assertEqual(attention_details['all']['keys'], expected_all_keys)
        num_layers = self.config.num_hidden_layers
        num_heads = self.config.num_attention_heads
        vector_size = self.config.hidden_size / num_heads
        self.assertEqual(
            np.array(attention_details['aa']['queries']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        self.assertEqual(
            np.array(attention_details['aa']['keys']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        self.assertEqual(
            np.array(attention_details['bb']['queries']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['bb']['keys']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['ab']['queries']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))
        self.assertEqual(
            np.array(attention_details['ab']['keys']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['ba']['queries']).shape,
            (num_layers, num_heads, len(tokens_b), vector_size))
        self.assertEqual(
            np.array(attention_details['ba']['keys']).shape,
            (num_layers, num_heads, len(tokens_a), vector_size))

        atts_squeezed = np.squeeze(atts)
        expected_all_attention = atts_squeezed.tolist()
        self.assertEqual(attention_details['all']['att'],
                         expected_all_attention)
        attn_a = np.array(attention_details['aa']['att'])
        attn_b = np.array(attention_details['bb']['att'])
        attn_ab = np.array(attention_details['ab']['att'])
        attn_ba = np.array(attention_details['ba']['att'])
        expected_top_half = atts_squeezed[:, :, :len(tokens_a), :]
        top_half = np.concatenate((attn_a, attn_ab), axis=-1)
        self.assertEqual(top_half.shape, expected_top_half.shape)
        self.assertTrue(np.array_equal(top_half, expected_top_half))
        expected_bottom_half = atts_squeezed[:, :, len(tokens_a):, :]
        bottom_half = np.concatenate((attn_ba, attn_b), axis=-1)
        self.assertEqual(bottom_half.shape, expected_bottom_half.shape)
        all = np.concatenate((top_half, bottom_half), axis=-2)
        self.assertEqual(all.shape, atts_squeezed.shape)
        self.assertTrue(np.allclose(all, atts_squeezed, atol=1e-06))
Code example #4
def analyzeAttentionSingleTupleDecoy(model, tokenizer, data, guid_dict=None, select_guid=None, num_layers=12, num_heads=12, do_debug=False):

    """
    Extracts the attention of target words, e.g. groundtruth and decoy word.

    Note: If the target word is split into multiple word pieces, we only keep the maximum attention over its pieces.


    Parameters
    ----------
    model : bertviz.pytorch_pretrained_bert.modeling.BertModel
        BERT model from BERT visualization that provides access to attention
    tokenizer:  bertviz.pytorch_pretrained_bert.tokenization.BertTokenizer
        BERT tokenizer
    data: InputItems[]
        List of InputItems containing the WNLI/PDP data
    guid_dict: dictionary
        Dictionary that maps unique ids to data indices. Default None
    select_guid: int
        GUID of example data, for which the attentions are to be extracted
    num_layers: int
        Number of layers. Default 12
    num_heads: int
        Number of attention heads. Default 12
    do_debug: boolean
        Toggle for printing debug information. Default False

    Returns
    -------
    
    activity : ndarray
        Count matrix keeping track of which layer and head put their maximum attention on the true word or on one of the decoys
    attention : ndarray
        Attention matrix (#layers, #heads, 1 + #decoys), containing for each layer and head the attention on the true word and on each decoy, respectively

    """

    problem_list = set([])
    activity = np.zeros((num_layers, num_heads))
    # Initialise attention_matrix here as well so the return value is defined even if no
    # example carries a groundtruth; it is re-created per example once the number of decoys is known.
    attention_matrix = np.zeros((num_layers, num_heads, 1))

    if select_guid is None:
        elements = range(0,len(data))
    else:
        assert(guid_dict is not None)
        assert(select_guid is not None)
        elements = [guid_dict[select_guid]]

    for idx in elements:
        
        sentence_a = data[idx].text_a
        sentence_b = data[idx].text_b
        groundtruth = data[idx].groundtruth
        guid = data[idx].guid
        decoy = data[idx].decoy
        
        if groundtruth is not None:
            
            details_data = AttentionDetailsData(model, tokenizer)
            tokens_a, tokens_b, queries, keys, atts = details_data.get_data(sentence_a, sentence_b)
            attentions = get_attention_details(tokens_a, tokens_b, queries, keys, atts)
            
            
            groundtruth_tokens = tokenizer.tokenize(data[idx].groundtruth)
            activity = np.zeros((num_layers,num_heads,len(decoy)+1))
            attention_matrix = np.zeros((num_layers,num_heads,len(decoy)+1))
            reference_idx = data[idx].reference_idx
            if tokenizer.tokenize(groundtruth)[0] not in sentence_a and tokenizer.tokenize(groundtruth)[0] not in sentence_b:
                print('Wrong annotation: '+sentence_a+' | '+groundtruth+' | '+sentence_b)
                continue
                
            for layer_id in range(0,num_layers):
                for head_id in range(0,num_heads):
                    
                    attention_pairwise = np.asarray(attentions['ab']['att'][layer_id][head_id])

                    correct_activity = 0
                    indices = []
                    
                    # determine attention for the correct word
                    
                    # check if correct word is in sentence_a OR sentence_b
                    if contains_word(sentence_a, groundtruth_tokens[0]):


                        # check if target is single or multi-token
                        if len(tokenizer.tokenize(groundtruth)) == 1:
                            # some answers might not be perfect match or misspellings, e.g. plural piece(s), so fuzzy matching necessary
                            ratios = [fuzz.ratio(groundtruth, token) for token in tokens_a]
                            best_match_idx = ratios.index(max(ratios))
                                
                            correct_activity = attention_pairwise[best_match_idx,reference_idx]
                            indices.append(best_match_idx)    
                            
                        # target stretches over multiple tokens
                        else:
                            groundtruth_split = tokenizer.tokenize(groundtruth)
                            local_attention = []
                            for f in groundtruth_split:
                                if len(f)>1:
                                    try:                                      
                                        local_attention.append(attention_pairwise[tokens_a.index(f),reference_idx])
                                        indices.append(tokens_a.index(f))
                                    except ValueError:
                                        # token not found in tokens_a; remember the problematic example
                                        problem_list.add(guid)
                            # keep max attention
                            if len(local_attention) > 0:
                                correct_activity = (np.max(local_attention))


                    else:

                        # check if target is single or multi-token
                        if len(tokenizer.tokenize(groundtruth)) == 1:
                            correct_activity = attention_pairwise[reference_idx, tokens_b.index(groundtruth)]
                            indices.append(tokens_b.index(groundtruth))
                            
                        # target stretches over multiple tokens
                        else:
                            groundtruth_split = tokenizer.tokenize(groundtruth)
                            local_attention = []
                            for f in groundtruth_split:
                                if len(f)>1:
                                    try:
                                        local_attention.append(attention_pairwise[reference_idx, tokens_b.index(f)])
                                        indices.append(tokens_b.index(f))
                                    except ValueError:
                                        # token not found in tokens_b; remember the problematic example
                                        problem_list.add(guid)
                            # keep max attention (guard against an empty list, as in the sentence_a branch)
                            if len(local_attention) > 0:
                                correct_activity = np.max(local_attention)

                    
                    # determine attention for the decoy word
                    
                    decoy_attention = []
                    
                    if contains_word(sentence_a, groundtruth_tokens[0]):
                        
                        for k in decoy:
                            # check if target is single or multi-token
                            if len(tokenizer.tokenize(k)) == 1:
                            
                                # some answers might not be perfect match or misspellings, e.g. plural piece(s), so fuzzy matching necessary
                                ratios = [fuzz.ratio(k, token) for token in tokens_a]
                                best_match_idx = ratios.index(max(ratios))
                                decoy_attention.append(attention_pairwise[best_match_idx,reference_idx])
                                

                                indices.append(best_match_idx)
                            else:
                                decoy_split = tokenizer.tokenize(k)
                                local_attention = []
                                
                                for f in decoy_split:
                                    if len(f)>1:
                                        try:
                                            local_attention.append(attention_pairwise[tokens_a.index(f),reference_idx])
                                            indices.append(tokens_a.index(f))
                                        except ValueError:
                                            # token not found in tokens_a; remember the problematic example
                                            problem_list.add(guid)
                                            
                                            
                                    
                                if len(local_attention) > 0:    
                                    decoy_attention.append(np.max(local_attention))
                                else:
                                    decoy_attention.append(0)
        
                    else:
                        for k in decoy:
                            # check if target is single or multi-token
                            if len(tokenizer.tokenize(k)) == 1:
                                decoy_attention.append(attention_pairwise[reference_idx, tokens_b.index(k)])
                            else:
                                
                                decoy_split = tokenizer.tokenize(k)
                                local_attention = []
                                for f in decoy_split:
                                    if len(f)>1:

                                         # some answers might not be perfect match or misspellings, e.g. plural piece(s), so fuzzy matching necessary
                                        ratios = [fuzz.ratio(f, token) for token in tokens_b]
                                        best_match_idx = ratios.index(max(ratios))

                                        local_attention.append(attention_pairwise[reference_idx, best_match_idx])
                                        indices.append(best_match_idx)
                                    
                                if len(local_attention) > 0:    
                                    decoy_attention.append(np.max(local_attention))
                                else:
                                    decoy_attention.append(0)
                                    

                    attn = [correct_activity] + decoy_attention
                    
                    # index as [layer, head, ...] to match the (num_layers, num_heads, 1 + #decoys) shape
                    activity[layer_id, head_id, np.argmax(attn)] += 1
                    attention_matrix[layer_id, head_id, :] = np.asarray(attn)

    if do_debug and len(problem_list) > 0:
        print('Problems with the following guids: ' + str(problem_list))
    return activity, attention_matrix
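
A hedged usage sketch of the function above: the checkpoint name, the data loader and the selected guid are placeholders, and the bertviz import paths are assumptions; the function itself additionally expects AttentionDetailsData, get_attention_details, contains_word and fuzz (fuzzywuzzy) to be importable in its module.

# Hypothetical usage sketch -- checkpoint name, loader and guid are placeholders,
# and the bertviz module paths are assumptions that may differ between versions.
from bertviz.pytorch_pretrained_bert.modeling import BertModel
from bertviz.pytorch_pretrained_bert.tokenization import BertTokenizer

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
data = load_wnli_input_items('wnli_dev.tsv')               # hypothetical helper returning InputItems
guid_dict = {item.guid: i for i, item in enumerate(data)}  # map guids to positions in `data`

# Analyse a single example; activity[l, h, 0] is 1 when layer l / head h puts its maximum
# attention on the true word rather than on a decoy, and attention[l, h, :] holds the raw values.
activity, attention = analyzeAttentionSingleTupleDecoy(
    model, tokenizer, data, guid_dict=guid_dict, select_guid=1, do_debug=True)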