import math
import os
import pickle
import random
from typing import Dict, List

import numpy as np
import scipy.special
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Project-local helpers (HtmlVisualizer, Cell, tokenizer_wo_tf, get_tokenizer,
# load_record_v1/v2, EstimatorPredictionViewerGosford, average, output_path,
# data_path, session utilities, etc.) are assumed to come from this
# repository's own modules; their exact import paths are not shown here.

num_head = 12  # number of attention heads; the table loops below assume 12


def load_bert_like():
    """Visualize how attention probability depends on the relative offset
    between source and target positions, per layer and per head."""
    disable_eager_execution()
    model = BertLike()
    sess = init_session()
    # sess.run(tf.compat.v1.global_variables_initializer())
    load_v2_to_v2(sess, get_bert_full_path(), False)
    attention_prob_list, = sess.run([model.attention_probs_list])

    html = HtmlVisualizer("position.html")
    for layer_no, attention_prob in enumerate(attention_prob_list):
        html.write_headline("Layer {}".format(layer_no))
        acc_dict = {}  # (offset, head_idx) -> list of attention probabilities
        zero_scores = [list() for _ in range(num_head)]
        for loc in range(2, 40, 2):
            print("Source : ", loc)
            for target_loc in range(20):
                offset = target_loc - loc
                print(offset, end=" ")
                for head_idx in range(num_head):
                    key = offset, head_idx
                    if key not in acc_dict:
                        acc_dict[key] = []
                    e = attention_prob[0, head_idx, loc, target_loc]
                    if target_loc != 0:
                        acc_dict[key].append(e)
                    else:
                        # Attention to the first token is tracked separately.
                        zero_scores[head_idx].append(e)
                    print("{0:.2f}".format(e * 100), end=" ")
                print()

        rows = [[Cell("Loc")] + [Cell("Head{}".format(i)) for i in range(num_head)]]
        for offset in range(-7, +7):
            print(offset, end=" ")
            scores = []
            for head_idx in range(num_head):
                key = offset, head_idx
                try:
                    elems = acc_dict[key]
                    if len(elems) < 3:
                        raise KeyError  # too few samples to average
                    avg = average(elems)
                    scores.append(avg)
                    print("{0:.2f}".format(avg * 100), end=" ")
                except KeyError:
                    print("SKIP")
            print()
            rows.append([Cell(offset)] + [Cell(float(v * 100), v * 1000) for v in scores])
        html.write_table(rows)

        html.write_paragraph("Attention to first token")
        zero_scores = [average(l) for l in zero_scores]
        rows = [[Cell(" ")] + [Cell("Head{}".format(i)) for i in range(num_head)],
                [Cell(" ")] + [Cell(float(v * 100), v * 1000) for v in zero_scores]]
        html.write_table(rows)
def load_and_visualize():
    """Visualize per-example masked LM loss next to each selected word and
    its dictionary definition."""
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"
    n_list = open(os.path.join(output_path, "lookup_n", data_id), "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = next(feature_itr)
            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)
            emph_word = "<b>" + word + "</b>"

            if j == 0:
                # Build the answer lookup and shared token list once per group.
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, term in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = term
                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            for k in range(len(tokens)):
                if tokens[k] == "[MASK]":
                    tokens[k] = "[MASK_{}: {}]".format(k, mask_ans[k])
                if k in d_location_ids and k != 0:
                    if tokens[k - 1] != emph_word:
                        tokens[k] = emph_word
                    else:
                        tokens[k] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = [Cell(word), Cell(data[feature_idx]), Cell(def_str)]
            rows.append(row)
            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)
        html_writer.write_table(rows)

    html_writer.close()
def diff_view():
    """Compare per-token masked LM losses from two runs and highlight where
    the second model's probability differs from the first."""
    tokenizer = get_tokenizer()
    filename = "bert_815.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    filename = "bfn_3_200_815.pickle"
    p = os.path.join(output_path, filename)
    data2 = pickle.load(open(p, "rb"))

    run_name = "diff"
    batch_size, seq_length = data[0]['masked_input_ids'].shape

    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []
    masked_lm_positions = []
    masked_lm_ids = []
    for e in data[:-1]:  # skip the last (possibly partial) batch
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])
        masked_lm_ids.append(e["masked_lm_ids"])

    masked_lm_example_loss2 = []
    for e in data2[:-1]:
        masked_lm_example_loss2.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))

    masked_lm_example_loss2 = np.concatenate(masked_lm_example_loss2)
    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)
    masked_lm_ids = np.concatenate(masked_lm_ids)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)

    n_instance = len(input_ids)
    for inst_idx in range(n_instance):
        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_keys = dict(zip(masked_lm_positions[inst_idx],
                            tokenizer.convert_ids_to_tokens(masked_lm_ids[inst_idx])))

        loss_at_loc = {p: l for l, p in zip(masked_lm_example_loss[inst_idx],
                                            masked_lm_positions[inst_idx])}
        loss_at_loc2 = {p: l for l, p in zip(masked_lm_example_loss2[inst_idx],
                                             masked_lm_positions[inst_idx])}
        # Convert per-token loss to the probability of the gold token.
        score_at_loc = {k: math.exp(-v) for k, v in loss_at_loc.items()}
        score_at_loc2 = {k: math.exp(-v) for k, v in loss_at_loc2.items()}

        def is_dependent(token):
            return len(token) == 1 and not token[0].isalnum()

        cells = []
        for i in range(len(tokens)):
            f_inverse = False
            score = 0
            if tokens[i] == "[MASK]" or i in loss_at_loc:
                tokens[i] = "[{}-{}]".format(i, ans_keys[i])
                score = (score_at_loc[i] - score_at_loc2[i]) * 180
                if score < 0:
                    f_inverse = True
                    score = abs(score)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
            if tokens[i] != "[PAD]":
                term = tokens[i]
                cont_left = term[:2] == "##"
                cont_right = i + 1 < len(tokens) and tokens[i + 1][:2] == "##"
                if i + 1 < len(tokens):
                    dependent_right = is_dependent(tokens[i + 1])
                else:
                    dependent_right = False
                dependent_left = is_dependent(tokens[i])
                if cont_left:
                    term = term[2:]
                space_left = " " if not (cont_left or dependent_left) else ""
                space_right = " " if not (cont_right or dependent_right) else ""
                if not f_inverse:
                    cells.append(Cell(term, score, space_left, space_right))
                else:
                    cells.append(
                        Cell(term, score, space_left, space_right, target_color="R"))

        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush cells left after the last full row of 20
            html_writer.write_table([row])

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))
        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            prob1 = score_at_loc[pos]
            prob2 = score_at_loc2[pos]
            rows.append((Cell(pos), Cell(prob1), Cell(prob2), Cell(prob1 - prob2)))
        html_writer.write_table(rows)

    html_writer.close()
def pred_loss_view():
    """Visualize predicted vs. observed loss drops per masked position; tokens
    where the prediction and the gold label agree are highlighted."""
    tokenizer = get_tokenizer()
    filename = "tlm_loss_pred_on_dev.pickle"  # alternative: "tlm_loss_pred.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    batch_size, seq_length = data[0]['input_ids'].shape

    # Concatenate each per-batch array across all batches.
    keys = list(data[0].keys())
    vectors = {}
    for e in data:
        for key in keys:
            if key not in vectors:
                vectors[key] = []
            vectors[key].append(e[key])
    for key in keys:
        vectors[key] = np.concatenate(vectors[key], axis=0)

    html_writer = HtmlVisualizer("pred_make_sense_dev.html", dark_mode=False)

    n_instance = len(vectors['input_ids'])
    n_instance = min(n_instance, 100)
    for inst_idx in range(n_instance):
        tokens = tokenizer.convert_ids_to_tokens(vectors['input_ids'][inst_idx])
        locations = list(vectors['masked_lm_positions'][inst_idx])

        def is_dependent(token):
            return len(token) == 1 and not token[0].isalnum()

        cells = []
        for i in range(len(tokens)):
            f_same_pred = False
            score = 0
            if i in locations and i != 0:
                i_idx = locations.index(i)
                tokens[i] = "[{}:{}]".format(i_idx, tokens[i])
                pred_diff = vectors['pred_diff'][inst_idx][i_idx]
                gold_diff = vectors['gold_diff'][inst_idx][i_idx]
                pred_label = pred_diff > 0.3
                gold_label = gold_diff > 0.3
                if pred_label:
                    score = 100
                    if gold_label:
                        f_same_pred = True
                else:
                    if gold_label:
                        score = 30
                    f_same_pred = False

            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
            if tokens[i] != "[PAD]":
                term = tokens[i]
                cont_left = term[:2] == "##"
                cont_right = i + 1 < len(tokens) and tokens[i + 1][:2] == "##"
                if i + 1 < len(tokens):
                    dependent_right = is_dependent(tokens[i + 1])
                else:
                    dependent_right = False
                dependent_left = is_dependent(tokens[i])
                if cont_left:
                    term = term[2:]
                space_left = " " if not (cont_left or dependent_left) else ""
                space_right = " " if not (cont_right or dependent_right) else ""
                if f_same_pred:
                    cells.append(Cell(term, score, space_left, space_right))
                else:
                    cells.append(
                        Cell(term, score, space_left, space_right, target_color="R"))

        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush cells left after the last full row of 20
            html_writer.write_table([row])

        row_head = [Cell("Index"),
                    Cell("P]Prob1"),
                    Cell("P]Prob2"),
                    Cell("G]Prob1"),
                    Cell("G]Prob2"),
                    Cell("P]Diff"),
                    Cell("G]Diff")]

        def f_cell(obj):
            return Cell("{:04.2f}".format(obj))

        rows = [row_head]
        pred_diff_list = []
        gold_diff_list = []
        for idx, pos in enumerate(locations):
            if pos == 0:  # padding in masked_lm_positions
                break
            pred_diff = vectors['pred_diff'][inst_idx][idx]
            gold_diff = vectors['gold_diff'][inst_idx][idx]
            pred_diff_list.append(pred_diff)
            gold_diff_list.append(gold_diff)
            row = [Cell(idx),
                   f_cell(vectors['prob1'][inst_idx][idx]),
                   f_cell(vectors['prob2'][inst_idx][idx]),
                   f_cell(math.exp(-vectors['loss_base'][inst_idx][idx])),
                   f_cell(math.exp(-vectors['loss_target'][inst_idx][idx])),
                   f_cell(pred_diff),
                   f_cell(gold_diff)]
            rows.append(row)
        html_writer.write_table(rows)

        pred_diff = np.average(pred_diff_list)
        gold_diff = np.average(gold_diff_list)
        html_writer.write_paragraph(
            "Average Pred diff={:04.2f} Observed diff={:04.2f}".format(
                pred_diff, gold_diff))
        if pred_diff > 0.3:
            html_writer.write_headline("High Drop")
        elif pred_diff < 0.1:
            html_writer.write_headline("Low Drop")

    html_writer.close()
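# The subword-spacing logic is duplicated verbatim in diff_view() and
# pred_loss_view() above. A shared helper could look like the sketch below
# (hypothetical; the name token_spacing is not part of the original code):
def token_spacing(tokens, i):
    """Return (term, space_left, space_right) for tokens[i], stripping the
    '##' continuation prefix and suppressing spaces next to subword
    continuations and single-character punctuation."""
    def is_dependent(token):
        return len(token) == 1 and not token[0].isalnum()

    term = tokens[i]
    cont_left = term.startswith("##")
    cont_right = i + 1 < len(tokens) and tokens[i + 1].startswith("##")
    dependent_left = is_dependent(term)
    dependent_right = i + 1 < len(tokens) and is_dependent(tokens[i + 1])
    if cont_left:
        term = term[2:]
    space_left = "" if (cont_left or dependent_left) else " "
    space_right = "" if (cont_right or dependent_right) else " "
    return term, space_left, space_right

# Usage inside the cell-building loops above would be:
#     term, space_left, space_right = token_spacing(tokens, i)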
def work():
    """Render masked tokens with their answers, shaded by per-token loss."""
    tokenizer = get_tokenizer()
    filename = "bfn_3_200_815.pickle"  # alternative: "bert_815.pickle"
    run_name = filename[:-len(".pickle")]
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    batch_size, seq_length = data[0]['masked_input_ids'].shape

    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []
    masked_lm_positions = []
    for e in data[:-1]:  # skip the last (possibly partial) batch
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])

    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)

    n_instance = len(input_ids)
    for inst_idx in range(min(200, n_instance)):
        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])

        loss_at_loc = {p: l for l, p in zip(masked_lm_example_loss[inst_idx],
                                            masked_lm_positions[inst_idx])}

        cells = []
        for i in range(len(tokens)):
            score = 0
            if tokens[i] == "[MASK]":
                tokens[i] = "[{}]".format(ans_tokens[i])
                score = loss_at_loc[i] * 255 / 25  # scale loss into a 0-255 highlight
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
            if tokens[i] != "[PAD]":
                cells.append(Cell(tokens[i], score))

        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:  # flush cells left after the last full row of 20
            html_writer.write_table([row])

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))
        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            rows.append((Cell(pos), Cell(loss)))
        html_writer.write_table(rows)

    html_writer.close()
def do():
    """Visualize p1/p2 probabilities per token for two prediction runs, then
    fit and plot a polynomial regression of p2 on p1."""
    # (pred_file, record_file, output_html) triples to process.
    todo = [
        ("RLPP_0.pickle",
         "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0",
         "RLPP_wiki.html"),
        ("ukp_rel.pickle",
         "C:\\work\\Code\\Chair\\output\\tf_enc",
         "RLPP_ukp.html"),
    ]
    x = []
    y = []
    for pred_file_name, record_file_name, out_name in todo:
        viewer = EstimatorPredictionViewerGosford(pred_file_name)
        html = HtmlVisualizer(out_name)
        itr1 = load_record_v2(record_file_name)
        itr2 = iter(viewer)
        cnt = 0
        for features, entry in zip(itr1, itr2):
            cnt += 1
            if cnt > 200:
                break
            input_ids1 = entry.get_tokens("input_ids")
            prob1 = entry.get_vector("prob1")
            prob2 = entry.get_vector("prob2")
            cells = viewer.cells_from_tokens(input_ids1)

            p1_l = []
            p2_l = []
            useful_l = []
            row1 = []
            row2 = []
            row3 = []
            row4 = []
            for j, cell in enumerate(cells):
                p1 = float(prob1[j])
                p2 = float(prob2[j])
                x.append([p1])
                y.append(p2)
                u = useful(p1, p2)
                score = (1 - u) * 100
                cell.highlight_score = score
                row1.append(cell)
                row2.append(Cell(p1, score))
                row3.append(Cell(p2, score))
                row4.append(Cell(u, score))
                p1_l.append(p1)
                p2_l.append(p2)
                useful_l.append(u)
                if len(row1) > 20:
                    html.write_table([row1, row2, row3, row4])
                    row1 = []
                    row2 = []
                    row3 = []
                    row4 = []
            if row1:  # flush rows left after the last full block
                html.write_table([row1, row2, row3, row4])

            html.write_paragraph("p1: {}".format(average(p1_l)))
            html.write_paragraph("p2: {}".format(average(p2_l)))
            html.write_paragraph("useful: {}".format(average(useful_l)))
            if average(useful_l) < 0.4:
                html.write_headline("Low Score")

    # Fit a degree-4 polynomial regression of p2 on p1 over a 1000-point sample.
    l = list(zip(x, y))
    random.shuffle(l)
    l = l[:1000]
    x, y = zip(*l)

    poly = PolynomialFeatures(degree=4)
    X_poly = poly.fit_transform(x)
    lin2 = LinearRegression()
    lin2.fit(X_poly, y)

    plt.scatter(x, y, color='blue')
    # Sort by p1 so the fitted curve is drawn as a single line.
    x_sorted = sorted(x)
    plt.plot(x_sorted, lin2.predict(poly.transform(x_sorted)), color='red')
    plt.title('Polynomial Regression')
    plt.show()
def write_deletion_score_to_html(out_file_name,
                                 summarized_table: List[Entry],
                                 info: Dict[str, Dict]):
    """Write per-token deletion scores to HTML, highlighting the tokens whose
    removal most changes the prediction."""
    text_to_info = claim_text_to_info()
    html = HtmlVisualizer(out_file_name)
    tokenizer = get_biobert_tokenizer()
    num_print = 0
    for entry in summarized_table:
        tokens = tokenizer.convert_ids_to_tokens(entry.input_ids)
        idx_sep1, idx_sep2 = get_sep_loc(entry.input_ids)
        max_change = 0
        max_drop = 0
        cells = cells_from_tokens(tokens)

        # Collect (index, contribution) pairs; after sorting, the largest
        # (most negative) drop comes first.
        drops = []
        for idx in range(len(tokens)):
            if tokens[idx] == "[PAD]":
                break
            if tokens[idx] == '[SEP]':
                continue
            if idx in entry.contribution:
                raw_score = entry.contribution[idx]
                drops.append((idx, raw_score))
        drops.sort(key=get_second)
        _, largest_drop = drops[0]

        max_drop_idx = -1
        max_drop_case_logit = None
        for idx in range(len(tokens)):
            if tokens[idx] == "[PAD]":
                break
            if tokens[idx] == '[SEP]':
                continue
            if idx in entry.contribution:
                raw_score = entry.contribution[idx]
                max_change = max(abs(raw_score), max_change)
                if max_drop > raw_score:
                    max_drop = raw_score
                    max_drop_idx = idx
                    max_drop_case_logit = entry.case_logits_d[idx]
                if raw_score < 0:
                    score = abs(raw_score / largest_drop) * 200
                    color = "B"
                else:
                    score = 0
                    color = "B"
            else:
                score = 150
                color = "Gray"
            cells[idx].highlight_score = score
            cells[idx].target_color = color

        if max_change < 0.05 and False:  # low-change filter, currently disabled
            pass
        else:
            base_probs = scipy.special.softmax(entry.base_logits)
            info_entry = info[str(entry.data_id[0])]
            claim1_info: Dict = text_to_info[info_entry['text1']]
            claim2_info: Dict = text_to_info[info_entry['text2']]
            question = claim1_info['question']
            assertion1 = claim1_info['assertion']
            assertion2 = claim2_info['assertion']
            original_prediction_summary = make_prediction_summary_str(base_probs)

            html.write_bar()
            html.write_paragraph("Question: {}".format(question))
            html.write_paragraph("Original prediction: " + original_prediction_summary)
            html.write_paragraph("Max drop")
            rows = []
            for idx, score in drops[:5]:
                rows.append([Cell(str(idx)), Cell(tokens[idx]), Cell(score)])
            html.write_table(rows)

            min_token = tokens[max_drop_idx]
            html.write_paragraph("> \"{}\": {}".format(min_token, max_drop))
            max_drop_case_prob = scipy.special.softmax(max_drop_case_logit)
            max_drop_prediction_summary = make_prediction_summary_str(max_drop_case_prob)
            html.write_paragraph("> " + max_drop_prediction_summary)

            p = [Cell("Claim1 ({}):".format(assertion1))] + cells[1:idx_sep1]
            h = [Cell("Claim2 ({}):".format(assertion2))] + cells[idx_sep1 + 1:idx_sep2]
            html.write_table([p])
            html.write_table([h])
            num_print += 1

    print("printed {} of {}".format(num_print, len(summarized_table)))
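# Hypothetical entry point (a sketch; this excerpt does not show how these
# viewers are invoked). work() takes no arguments, so it serves as a smoke test.
if __name__ == "__main__":
    work()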