# Imports assumed by this snippet; BertModel/BertTokenizer are expected to come from a
# BERT implementation whose forward pass also returns per-layer attention data
# (accessed here via attn_data['attn_probs']).
import json

import h5py
import torch
import tqdm


def save_attn_weights(model, bert_version, output_path, prefix):
    """Extract BERT attention weights for every sentence in the dataset and write them to HDF5."""
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    # NOTE: the path keeps a .conllu name, but as written the loader expects a JSON file
    # mapping sentence ids to entries with a 'sentence' field.
    filename = '/scratch/sb6416/Ling3340/extract_tree/UD_English-PUD/en_pud-ud-test.conllu'
    with open(filename, 'r') as f_wsj:
        data = json.load(f_wsj)
    attention = {}
    model.eval()
    with torch.no_grad():
        for key, datum in tqdm.tqdm(data.items(), total=len(data)):
            sent = datum['sentence']
            tokens = tokenizer.tokenize(sent)
            tokens_a_delim = ['[CLS]'] + tokens + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(tokens_a_delim)
            tokens_tensor = torch.tensor([token_ids])
            token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
            _, _, attn_data_list = model(tokens_tensor, token_type_ids=token_type_tensor)
            # Stack the per-layer attention probabilities into a single tensor per sentence
            attn_tensor = torch.stack([attn_data['attn_probs'] for attn_data in attn_data_list])
            attention[prefix + key] = attn_tensor.data.numpy()
    print("writing weights to the file!!")
    with h5py.File(output_path, 'w') as f:
        for idx in attention:
            f.create_dataset(idx, data=attention[idx], dtype='float64')
    print("done")
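# Hypothetical usage sketch (not from the original code): the model name, output file,
# and prefix below are placeholders, and `model` is assumed to be a BertModel variant
# that returns per-layer attention as its third output.
if __name__ == '__main__':
    bert_version = 'bert-base-uncased'
    model = BertModel.from_pretrained(bert_version)
    save_attn_weights(model, bert_version, output_path='attn_weights.hdf5', prefix='test_')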
def test_bert_attn(self):
    self.config = BertConfig.from_json_file('fixtures/config.json')
    self.model = BertModel(self.config)
    self.tokenizer = BertTokenizer('fixtures/vocab.txt')
    sentence1 = 'The quickest brown fox jumped over the lazy dog'
    sentence2 = 'the quick brown fox jumped over the laziest elmo'
    attn_data = get_attention_bert(self.model, self.tokenizer, sentence1, sentence2,
                                   include_queries_and_keys=False)
    tokens_1 = ['[CLS]', 'the', 'quick', '##est', 'brown', 'fox', 'jumped', 'over', 'the',
                'lazy', 'dog', '[SEP]']
    tokens_2 = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie',
                '##st', '[UNK]', '[SEP]']
    self.assertEqual(attn_data['all']['left_text'], tokens_1 + tokens_2)
    self.assertEqual(attn_data['all']['right_text'], tokens_1 + tokens_2)
    self.assertEqual(attn_data['aa']['left_text'], tokens_1)
    self.assertEqual(attn_data['aa']['right_text'], tokens_1)
    self.assertEqual(attn_data['ab']['left_text'], tokens_1)
    self.assertEqual(attn_data['ab']['right_text'], tokens_2)
    self.assertEqual(attn_data['ba']['left_text'], tokens_2)
    self.assertEqual(attn_data['ba']['right_text'], tokens_1)
    self.assertEqual(attn_data['bb']['left_text'], tokens_2)
    self.assertEqual(attn_data['bb']['right_text'], tokens_2)
    attn_all = attn_data['all']['attn']
    attn_aa = attn_data['aa']['attn']
    attn_ab = attn_data['ab']['attn']
    attn_ba = attn_data['ba']['attn']
    attn_bb = attn_data['bb']['attn']
    num_layers = len(attn_all)
    for layer in range(num_layers):
        attn_all_layer = torch.tensor(attn_all[layer])
        num_heads, seq_len, _ = attn_all_layer.size()
        # Check that the attention probabilities sum to one over the key dimension
        sum_probs = attn_all_layer.sum(dim=-1)
        expected = torch.ones(num_heads, seq_len, dtype=torch.float32)
        self.assertTrue(torch.allclose(sum_probs, expected))
        # Reassemble the full attention matrix from the sentence-pair quadrants
        # (aa, ab, ba, bb) and verify that it matches the 'all' view
        attn_aa_layer = torch.tensor(attn_aa[layer])
        attn_ab_layer = torch.tensor(attn_ab[layer])
        attn_ba_layer = torch.tensor(attn_ba[layer])
        attn_bb_layer = torch.tensor(attn_bb[layer])
        top_half = torch.cat((attn_aa_layer, attn_ab_layer), dim=-1)
        bottom_half = torch.cat((attn_ba_layer, attn_bb_layer), dim=-1)
        whole = torch.cat((top_half, bottom_half), dim=-2)
        self.assertTrue(torch.allclose(whole, attn_all_layer))
def main():
    bert_version = 'bert-large-uncased'
    model = BertModel.from_pretrained(bert_version)
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    output_path = ('/misc/vlgscratch4/BowmanGroup/datasets/ptb_trees/trees_'
                   + bert_version + '_wsj.hdf5')
    filename = '/misc/vlgscratch4/BowmanGroup/datasets/ptb_trees/ptb3-wsj-test.json'
    # Build a dict mapping prefixed sentence ids to their attention arrays
    attn = create_dict(model, tokenizer, filename, 'test_')
    print("len attention: ", len(attn))
    print("writing weights to the file!!")
    with h5py.File(output_path, 'w') as f:
        for idx in attn:
            f.create_dataset(idx, data=attn[idx], dtype='float64')
    print("done")
def save_attn_weights(model, bert_version, output_path):
    """Extract BERT attention weights for every sentence in a CoNLL-U file and write them to HDF5."""
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    filename = '/scratch/sb6416/Ling3340/extract_tree/UD_English-PUD/en_pud-ud-test.conllu'
    with open(filename, 'r') as f:
        data = f.readlines()
    # Collect sentence ids and raw sentence text from the CoNLL-U comment lines
    # ("# sent_id = ..." and "# text = ...")
    sentences = {}
    for i in range(1, len(data)):
        line = data[i]
        if line[0] == '#':
            if line[2:9] == "sent_id":
                sentence_id = line[12:]
            if line[2:6] == "text":
                sentence = line[9:]
                sentences[sentence_id[:-1]] = sentence[:-1]  # strip trailing newlines
    attention = {}
    model.eval()
    with torch.no_grad():
        for sent in tqdm.tqdm(sentences):
            tokens = tokenizer.tokenize(sentences[sent])
            tokens_a_delim = ['[CLS]'] + tokens + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(tokens_a_delim)
            tokens_tensor = torch.tensor([token_ids])
            token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
            _, _, attn_data_list = model(tokens_tensor, token_type_ids=token_type_tensor)
            # Stack the per-layer attention probabilities into a single array per sentence
            attn_tensor = torch.stack([attn_data['attn_probs'] for attn_data in attn_data_list])
            attention[sent] = attn_tensor.data.numpy()
    print("writing weights to the file!!")
    with h5py.File(output_path, 'w') as f:
        for idx in attention:
            f.create_dataset(idx, data=attention[idx], dtype='float64')
    print("done")
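# Minimal read-back sketch (an assumption, not part of the original code): each HDF5
# dataset written above is keyed by sentence id and holds the stacked per-layer
# attention probabilities for that sentence. The file name here is a placeholder.
import h5py

with h5py.File('attn_weights.hdf5', 'r') as f:
    for sent_id in f.keys():
        attn = f[sent_id][()]  # numpy array of attention probabilities
        print(sent_id, attn.shape)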
def setUp(self):
    self.config = BertConfig.from_json_file('fixtures/config.json')
    model = BertModel(self.config)
    tokenizer = BertTokenizer('fixtures/vocab.txt')
    self.attention_details_data = AttentionDetailsData(model, tokenizer)
    # Fragment of a helper that computes the raw query-key dot products for one query
    # vector against every key vector in the sequence (the enclosing `def` is not shown
    # in this excerpt).
    for i in range(seq_len):
        key_vector = keys[i]
        dotProduct = 0
        for j in range(config["vector_size"]):
            dotProduct += query_vector[j] * key_vector[j]
        dotProducts.append(dotProduct)
    return dotProducts


bert_version = '/Users/bayartsogtyadamsuren/Downloads/bert-japanese-files/bert-wiki-ja'
model = BertModel.from_pretrained(bert_version)
tokenizer = BertTokenizer.from_pretrained(bert_version)

# `f` (a pandas DataFrame of the CSV) and `ff` (an output file handle) are defined
# earlier in the original script, which is not shown in this excerpt.
print("Head of the csv\n", f.head())
print("Number of lines", len(f))

q_x_k_scores = []
para_tokens = []
too_long = 0
errors = 0

ff.write("title\ttoken\tscore\n")
for i, x in tqdm(f.iterrows()):
    # Normalize the Japanese text: join lines with a full stop and strip quotation marks
    sentence_a = (str(x["text"])
                  .replace("\n", "。")
                  .replace("〝", "")
                  .replace("〞", "")
                  .replace("「", "")
                  .replace("」", "")
                  .strip())
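# For reference, a vectorized equivalent of the nested dot-product loop above;
# this is a sketch assuming `keys` is a (seq_len, vector_size) array and
# `query_vector` a (vector_size,) array.
import numpy as np


def query_key_dot_products(query_vector, keys):
    """Return the raw query-key dot products for one query against all keys."""
    return list(np.asarray(keys) @ np.asarray(query_vector))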
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", choices=['PDP', 'MNLI', 'pdp', 'mnli'], required=True,
                        help="The name of the task to run, pdp or mnli.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--debug", action='store_true',
                        help="Set this flag if you want to print debug info.")
    args = parser.parse_args()

    if args.task_name.lower() == 'pdp':
        processor = processors.XMLPDPProcessor()
    elif args.task_name.lower() == 'mnli':
        processor = processors.XMLMnliProcessor()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    all_ambigious, guid_dict = processor.get_train_items(args.data_dir, select_type='ambigious')
    all_guids = set()
    for item in all_ambigious:
        all_guids.add(item.guid)

    model = BertModel.from_pretrained(args.bert_model)

    MAS_list = []
    counter = np.zeros(2)
    for i in trange(0, len(all_guids)):
        _, attention = analyzeAttentionSingleTupleDecoy(model, tokenizer, all_ambigious, guid_dict,
                                                        select_guid=i, do_debug=args.debug)
        ref_word = tokenizer.tokenize(all_ambigious[i].text_b)[all_ambigious[i].reference_idx]
        MAS = computeMaximumAttentionScore(attention)
        MAS_list.append(MAS[0])
        # Count how often the maximum attention score is assigned to the true word vs. the decoy
        if np.argmax(MAS) == 0:
            counter[0] += 1
        else:
            counter[1] += 1
        if args.debug:
            print(str(all_ambigious[i].guid) + ' | ' + str(MAS) + ' | ' + all_ambigious[i].text_a + ' '
                  + all_ambigious[i].text_b + ' || >' + ref_word + '<, '
                  + str(all_ambigious[i].groundtruth) + ', ' + str(all_ambigious[i].decoy))

    print(args.task_name.upper() + " Accuracy: " + str(counter[0] / np.sum(counter)))
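# Example invocation (illustrative only; the script name and data path are placeholders,
# not taken from the original repository):
#
#   python run_mas.py --data_dir ./data/pdp --bert_model bert-base-uncased \
#       --task_name pdp --do_lower_case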
def setUp(self):
    self.config = BertConfig.from_json_file('fixtures/config.json')
    model = BertModel(self.config)
    tokenizer = BertTokenizer('fixtures/vocab.txt')
    self.attention_visualizer = AttentionVisualizer(model, tokenizer)