Example no. 1
def save_attn_weights(model, bert_version, output_path, prefix):
    tokenizer = BertTokenizer.from_pretrained(bert_version)

    filename = '/scratch/sb6416/Ling3340/extract_tree/UD_English-PUD/en_pud-ud-test.conllu'
    with open(filename, 'r') as f_wsj:
        data = json.load(f_wsj)

    attention = {}
    model.eval()
    with torch.no_grad():
        for key, datum in tqdm.tqdm(data.items(), total=len(data)):
            sent = datum['sentence']
            tokens = tokenizer.tokenize(sent)
            tokens_a_delim = ['[CLS]'] + tokens + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(tokens_a_delim)
            tokens_tensor = torch.tensor([token_ids])
            token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
            _, _, attn_data_list = model(tokens_tensor,
                                         token_type_ids=token_type_tensor)
            attn_tensor = torch.stack(
                [attn_data['attn_probs'] for attn_data in attn_data_list])
            attention[prefix + key] = attn_tensor.data.numpy()

    print("writing weights to the file!!")

    with h5py.File(output_path, 'w') as f:
        for idx in attention:
            f.create_dataset(idx, data=attention[idx], dtype='float64')
    print("done")
Example no. 2
    def test_bert_attn(self):
        self.config = BertConfig.from_json_file('fixtures/config.json')
        self.model = BertModel(self.config)
        self.tokenizer = BertTokenizer('fixtures/vocab.txt')
        sentence1 = 'The quickest brown fox jumped over the lazy dog'
        sentence2 = "the quick brown fox jumped over the laziest elmo"
        attn_data = get_attention_bert(self.model,
                                       self.tokenizer,
                                       sentence1,
                                       sentence2,
                                       include_queries_and_keys=False)

        tokens_1 = [
            '[CLS]', 'the', 'quick', '##est', 'brown', 'fox', 'jumped', 'over',
            'the', 'lazy', 'dog', '[SEP]'
        ]
        tokens_2 = [
            'the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la',
            '##zie', '##st', '[UNK]', '[SEP]'
        ]
        self.assertEqual(attn_data['all']['left_text'], tokens_1 + tokens_2)
        self.assertEqual(attn_data['all']['right_text'], tokens_1 + tokens_2)
        self.assertEqual(attn_data['aa']['left_text'], tokens_1)
        self.assertEqual(attn_data['aa']['right_text'], tokens_1)
        self.assertEqual(attn_data['ab']['left_text'], tokens_1)
        self.assertEqual(attn_data['ab']['right_text'], tokens_2)
        self.assertEqual(attn_data['ba']['left_text'], tokens_2)
        self.assertEqual(attn_data['ba']['right_text'], tokens_1)
        self.assertEqual(attn_data['bb']['left_text'], tokens_2)
        self.assertEqual(attn_data['bb']['right_text'], tokens_2)

        attn_all = attn_data['all']['attn']
        attn_aa = attn_data['aa']['attn']
        attn_ab = attn_data['ab']['attn']
        attn_ba = attn_data['ba']['attn']
        attn_bb = attn_data['bb']['attn']
        num_layers = len(attn_all)
        for layer in range(num_layers):
            attn_all_layer = torch.tensor(attn_all[layer])
            num_heads, seq_len, _ = attn_all_layer.size()
            # Check that probabilities sum to one
            sum_probs = attn_all_layer.sum(dim=-1)
            expected = torch.ones(num_heads, seq_len, dtype=torch.float32)
            self.assertTrue(torch.allclose(sum_probs, expected))
            # Reassemble the attention matrix from its components and verify it matches
            attn_aa_layer = torch.tensor(attn_aa[layer])
            attn_ab_layer = torch.tensor(attn_ab[layer])
            attn_ba_layer = torch.tensor(attn_ba[layer])
            attn_bb_layer = torch.tensor(attn_bb[layer])
            top_half = torch.cat((attn_aa_layer, attn_ab_layer), dim=-1)
            bottom_half = torch.cat((attn_ba_layer, attn_bb_layer), dim=-1)
            whole = torch.cat((top_half, bottom_half), dim=-2)
            self.assertTrue(torch.allclose(whole, attn_all_layer))
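The reassembly check above relies on the quadrant structure of attention over a concatenated sentence pair; a minimal illustrative sketch of the same relationship via slicing (attn_layer is a hypothetical (num_heads, seq_len, seq_len) tensor for one layer, with seq_len = len(tokens_1) + len(tokens_2)):

# Slicing out the four quadrants that the test above concatenates back together.
len_a, len_b = len(tokens_1), len(tokens_2)
attn_aa = attn_layer[:, :len_a, :len_a]  # sentence A attending to sentence A
attn_ab = attn_layer[:, :len_a, len_a:]  # sentence A attending to sentence B
attn_ba = attn_layer[:, len_a:, :len_a]  # sentence B attending to sentence A
attn_bb = attn_layer[:, len_a:, len_a:]  # sentence B attending to sentence B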
Example no. 3
def main():
    bert_version = 'bert-large-uncased'
    model = BertModel.from_pretrained(bert_version)
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    output_path = '/misc/vlgscratch4/BowmanGroup/datasets/ptb_trees/trees_' + bert_version + '_wsj.hdf5'

    filename = '/misc/vlgscratch4/BowmanGroup/datasets/ptb_trees/ptb3-wsj-test.json'
    attn = create_dict(model, tokenizer, filename, 'test_')

    length = len(attn)
    print("len attention: ", length)
    print("writing weights to the file!!")
    with h5py.File(output_path, 'w') as f:
        for idx in attn:
            f.create_dataset(idx, data=attn[idx], dtype='float64')
    print("done")
Example no. 4
def save_attn_weights(model, bert_version, output_path):
    tokenizer = BertTokenizer.from_pretrained(bert_version)

    filename = '/scratch/sb6416/Ling3340/extract_tree/UD_English-PUD/en_pud-ud-test.conllu'
    with open(filename, 'r') as f:
        data = f.readlines()
    sentences = {}

    # Collect sentences keyed by sent_id from the CoNLL-U comment lines,
    # which have the form "# sent_id = ..." and "# text = ...".
    for i in range(1, len(data)):
        line = data[i]
        if line[0] == '#':
            if line[2:9] == "sent_id":
                sentence_id = line[12:]  # everything after "# sent_id = "

            if line[2:6] == "text":
                sentence = line[9:]  # everything after "# text = "
                sentences[sentence_id[:-1]] = sentence[:-1]  # strip trailing newlines

    # print(sentences)
    attention = {}
    model.eval()
    with torch.no_grad():
        for sent in tqdm.tqdm(sentences):
            tokens_sent = sentences[sent]
            tokens = tokenizer.tokenize(tokens_sent)
            # print(tokens_sent, tokens)
            tokens_a_delim = ['[CLS]'] + tokens + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(tokens_a_delim)
            # print(token_ids, len(token_ids))
            tokens_tensor = torch.tensor([token_ids])
            token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
            # print(token_type_tensor)
            _, _, attn_data_list = model(tokens_tensor,
                                         token_type_ids=token_type_tensor)
            attn_tensor = torch.stack(
                [attn_data['attn_probs'] for attn_data in attn_data_list])
            attention[sent] = attn_tensor.data.numpy()

    print("writing weights to the file!!")

    with h5py.File(output_path, 'w') as f:
        for idx in attention:
            f.create_dataset(idx, data=attention[idx], dtype='float64')
    print("done")
Example no. 5
    def setUp(self):
        self.config = BertConfig.from_json_file('fixtures/config.json')
        model = BertModel(self.config)
        tokenizer = BertTokenizer('fixtures/vocab.txt')
        self.attention_details_data = AttentionDetailsData(model, tokenizer)
Example no. 6
def compute_dot_products(query_vector, keys, seq_len, config):
    # Reconstructed header: the original snippet starts mid-function. Computes
    # the dot product of query_vector with each of the seq_len key vectors.
    dotProducts = []
    for i in range(seq_len):
        key_vector = keys[i]
        dotProduct = 0

        for j in range(config["vector_size"]):
            product = query_vector[j] * key_vector[j]
            dotProduct += product
        dotProducts.append(dotProduct)

    return dotProducts
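The same scores can be computed in one step with a matrix-vector product; a minimal sketch, assuming keys behaves like a (seq_len, vector_size) array:

import numpy as np

# Vectorized equivalent of the loop above: one dot product per key vector.
dot_products = np.asarray(keys) @ np.asarray(query_vector)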


bert_version = '/Users/bayartsogtyadamsuren/Downloads/bert-japanese-files/bert-wiki-ja'

model = BertModel.from_pretrained(bert_version)
tokenizer = BertTokenizer.from_pretrained(bert_version)

print("Head of the csv\n", f.head())
print("Number of lines", len(f))

q_x_k_scores = []
para_tokens = []
too_long = 0
errors = 0

# ff is presumably an output file opened earlier (not shown in this snippet).
ff.write("title\ttoken\tscore\n")

for i, x in tqdm(f.iterrows()):

    sentence_a = str(x["text"]).replace("\n", "。").replace("〝", "").replace(
        "〞", "").replace("「", "").replace("」", "").strip()
Example no. 7
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Shou5ld contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", choices=['PDP', 'MNLI', 'pdp', 'mnli'],
                        required=True,
                        help="The name of the task to train, pdp or WNLI.")
    parser.add_argument("--do_lower_case",
                    action='store_true',
                    help="Set this flag if you are using an uncased model.")
    parser.add_argument("--debug", action='store_true',
                    help="Set this flag if you are want to print debug infos.")

    args = parser.parse_args()

    if args.task_name.lower() == 'pdp':
      processor = processors.XMLPDPProcessor()
      tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
      all_ambigious, guid_dict = processor.get_train_items(args.data_dir, select_type='ambigious')
      all_guids = set([])
      for i in range(0,len(all_ambigious)):
          all_guids.add(all_ambigious[i].guid)

    elif args.task_name.lower() == 'mnli':
      processor = processors.XMLMnliProcessor()
      tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
      all_ambigious, guid_dict = processor.get_train_items(args.data_dir, select_type='ambigious')
      all_guids = set([])
      for i in range(0,len(all_ambigious)):
          all_guids.add(all_ambigious[i].guid)

    model = BertModel.from_pretrained(args.bert_model)
    
    MAS_list = []
    counter = np.zeros((2))

    for i in trange(0,len(all_guids)):
        _, attention = analyzeAttentionSingleTupleDecoy(model, tokenizer, all_ambigious, guid_dict, select_guid=i, do_debug=args.debug)
        ref_word = tokenizer.tokenize(all_ambigious[i].text_b)[all_ambigious[i].reference_idx]
        MAS = computeMaximumAttentionScore(attention)
        
        MAS_list.append(MAS[0])
        # now count how many times the maximum MAS is assigned to either the true word or the decoy
        if np.argmax(MAS)==0:
            counter[0] += 1
        else:
            counter[1] += 1

        if args.debug:
            print(str(all_ambigious[i].guid) + ' | ' + str(MAS)+ ' | '+all_ambigious[i].text_a + ' '+all_ambigious[i].text_b + ' || >'+ref_word+'<, '+ str(all_ambigious[i].groundtruth)+', '+str(all_ambigious[i].decoy))

    print(args.task_name.upper()+" Accuracy: "+str(counter[0]/np.sum(counter)))
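A hypothetical command line for running the script above (the script name and data directory are illustrative; the flags are the ones defined in the parser):

python compute_mas.py --data_dir ./pdp_data --bert_model bert-base-uncased --task_name pdp --do_lower_case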
Example no. 8
    def setUp(self):
        self.config = BertConfig.from_json_file('fixtures/config.json')
        model = BertModel(self.config)
        tokenizer = BertTokenizer('fixtures/vocab.txt')
        self.attention_visualizer = AttentionVisualizer(model, tokenizer)