def show_tfrecord(file_path):
    """Render premise/hypothesis token tables from a TFRecord file as HTML.

    Tokens whose alt-embedding mask is set are highlighted (score 100).
    """
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(file_path)
    html = HtmlVisualizer(name + ".html")
    for features in itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        def to_cells(seg_tokens, seg_mask):
            # One cell per token; masked positions get full highlight.
            return [Cell(tok, 100 if flag else 0)
                    for tok, flag in zip(seg_tokens, seg_mask)]

        label = take(features["label_ids"])[0]
        html.write_paragraph("Label : {}".format(label))
        html.write_table([to_cells(p_tokens, p_mask)])
        html.write_table([to_cells(h_tokens, h_mask)])
def show(out_file_name, summarized_table: List[Entry]):
    """Write per-token contribution visualizations to an HTML file.

    Each row renders tokens colored by contribution sign (red positive,
    blue negative); rows whose maximum absolute contribution is below
    0.05 are skipped. Prints how many rows were written.
    """
    html = HtmlVisualizer(out_file_name)
    tokenizer = get_tokenizer()
    num_print = 0
    for input_ids, prob, contributions in summarized_table:
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        html.write_paragraph("Score : {}".format(prob))
        cells = []
        max_change = 0
        for idx in range(len(input_ids)):
            token = tokens[idx]
            if token == "[PAD]":
                break
            if idx in contributions:
                raw_score = contributions[idx]
                max_change = max(abs(raw_score), max_change)
                score = abs(raw_score) * 100
                color = "R" if raw_score > 0 else "B"
                c = Cell(token, highlight_score=score, target_color=color)
            else:
                # Tokens without a contribution entry are greyed out.
                c = Cell(token, highlight_score=150, target_color="Gray")
            cells.append(c)
        # Fixed: replaced the `if max_change < 0.05: pass / else:`
        # anti-pattern with a direct threshold check (behavior unchanged).
        if max_change >= 0.05:
            html.multirow_print(cells, 30)
            num_print += 1
    print("printed {} of {}".format(num_print, len(summarized_table)))
def print_param():
    """Visualize one dense-layer weight matrix from the base BERT checkpoint
    vs. the NLI fine-tuned checkpoint, as paired HTML rows with a
    checkerboard background shading."""
    p_base = load_param(get_bert_full_path())
    # Fixed: use a raw string so the backslashes in this Windows path are
    # not treated as (deprecated/invalid) escape sequences. The resulting
    # path value is identical to the original.
    nli_path = r"C:\work\Code\Chair\output\model\runs\nli_model.ckpt-75000_NLI\model-0"
    p_ft = load_param(nli_path)
    # Fixed: removed the unused `keys` local.
    key = "bert/encoder/layer_0/output/dense/kernel"
    param1 = p_base[key]
    param2 = p_ft[key]
    html = HtmlVisualizer("bert_dense_param.html")
    l, c = param1.shape
    s_score = 100
    for i in range(l):
        rows = []
        row1 = []
        row2 = []
        # Alternate the starting shade per row, then toggle per column,
        # producing a checkerboard pattern across both parameter rows.
        s_score = 100 - s_score
        score = s_score
        for j in range(c):
            score = 100 - score
            row1.append(Cell("{0:.4f}".format(param1[i, j]), score))
            row2.append(Cell("{0:.4f}".format(param2[i, j]), score))
        rows.append(row1)
        rows.append(row2)
        html.write_table(rows)
def draw2(in_file, out_file):
    """Dump masked-LM predictions to HTML: the token row (masked positions
    tagged in-place) followed by per-position prob/loss rows."""
    data = EstimatorPredictionViewerGosford(os.path.join(output_path, in_file))
    html_writer = HtmlVisualizer(out_file, dark_mode=False)
    tokenizer = get_tokenizer()
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break
        tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")
        real_loss1 = entry.get_vector("per_example_loss1")
        real_loss2 = entry.get_vector("per_example_loss2")
        masked_lm_positions = entry.get_vector("masked_lm_positions")
        # Tag each masked position so it stands out in the token row.
        for mask_no, loc in enumerate(masked_lm_positions):
            tokens[loc] = "[{}:{}]".format(mask_no, tokens[loc])
        html_writer.multirow_print(data.cells_from_tokens(tokens))
        labeled_vectors = [
            ("prob1:", prob1),
            ("prob2:", prob2),
            ("real_loss1:", real_loss1),
            ("real_loss2:", real_loss2),
        ]
        value_rows = [[Cell(label)] + data.cells_from_anything(vec)
                      for label, vec in labeled_vectors]
        html_writer.multirow_print_from_cells_list(value_rows)
def run():
    """Visualize streamed contradiction predictions: per-example logit plus
    highlighted premise/hypothesis token rows."""
    tokenizer = get_tokenizer()
    spr = StreamPickleReader("contradiction_prediction")
    html = HtmlVisualizer("contradiction_prediction.html")
    cnt = 0
    while spr.has_next():
        example, prediction = spr.get_item()
        input_ids, _, _ = example
        logit, explain = prediction
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        prem, hypo = split_p_h_with_input_ids(tokens, input_ids)
        p_score, h_score = split_p_h_with_input_ids(explain, input_ids)
        p_cells = [Cell("P:")] + cells_from_tokens(prem, normalize(p_score))
        h_cells = [Cell("H:")] + cells_from_tokens(hypo, normalize(h_score))
        html.write_paragraph(str(logit))
        html.multirow_print(p_cells)
        html.multirow_print(h_cells)
        # Stop after roughly 100 items.
        if cnt > 100:
            break
        cnt += 1
def view_grad_overlap_per_mask():
    """Visualize, per masked position, the gradient-overlap score and the
    model's top-5 predicted terms; writes one HTML file named after the
    pickle."""
    filename = "ukp_lm_probs.pickle"
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    tokenizer = data.tokenizer
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        highlight = lmap(is_mask, tokens)
        scores = entry.get_vector("overlap_score")
        pos_list = entry.get_vector("masked_lm_positions")
        probs = entry.get_vector("masked_lm_log_probs")
        # NOTE(review): assumes exactly 20 masked positions per instance
        # (reshape to [20, vocab_size]) — confirm against the data pipeline.
        probs = np.reshape(probs, [20, -1])
        rows = []
        for score, position, prob in zip(scores, pos_list, probs):
            # Prefix each masked token with its position for readability.
            tokens[position] = "{}-".format(position) + tokens[position]
            row = [Cell(position), Cell(score)]
            # Top-5 vocabulary entries by log-probability, with their
            # probabilities recovered via exp().
            for idx in np.argsort(prob)[::-1][:5]:
                term = tokenizer.inv_vocab[idx]
                p = math.exp(prob[idx])
                row.append(Cell(term))
                row.append(Cell(p))
            rows.append(row)
        cells = data.cells_from_tokens(tokens, highlight)
        for score, position in zip(scores, pos_list):
            # Scale the raw overlap score into a 0-255 highlight intensity.
            cells[position].highlight_score = score / 10000 * 255
        html_writer.multirow_print(cells, 20)
        html_writer.write_table(rows)
def main(config):
    """Write an HTML table of claim texts followed by links to their top
    ranked (de-duplicated) documents, for the first 10 queries."""
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = \
        load_galago_ranked_list(q_res_path)
    claim_d = claims_to_dict(get_all_claims())
    keys = sorted(ranked_list.keys())
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        # Over-fetch 3x before de-duplication so that enough unique
        # documents remain.
        candidates = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = remove_duplicate([e.doc_id for e in candidates])
        claim = claim_d[int(query_id)]
        rows.append([Cell("{} : {}".format(query_id, claim))])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            rows.append([Cell("<a href=\"{}\">{}</a>".format(url, doc_id))])
    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)
def main(config):
    """Write an HTML table of query texts followed by links to their top
    ranked documents, for the first 100 queries."""
    q_res_path = config['q_res_path']
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = \
        load_galago_ranked_list(q_res_path)
    query_text_d = json.load(open(config['query_text_d']))
    save_name = config['save_path']
    keys = sorted(ranked_list.keys())
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:100]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        # Take a 3x candidate pool, then emit only the first
        # num_doc_per_query links.
        candidates = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = [e.doc_id for e in candidates]
        query_text = query_text_d[query_id]
        rows.append([Cell("{} : {}".format(query_id, query_text))])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            rows.append([Cell("<a href=\"{}\">{}</a>".format(url, doc_id))])
    html = HtmlVisualizer(save_name)
    html.write_table(rows)
def get_cell(score):
    """Map a numeric score (or the placeholder "-") to a colored, text-free
    Cell: blue for positive, red for negative, intensity by magnitude."""
    if score == "-":
        return Cell("-")
    # Scale so that |score| = 0.01 maps to the maximum highlight of 100.
    magnitude = min(abs(score) * 10000, 100)
    hue = "B" if score > 0 else "R"
    return Cell("", highlight_score=magnitude, target_color=hue)
def show_simple(run_name, data_id, tex_visulizer):
    """Collect up to 20 NLI explanation examples per predicted class and
    write them through the TeX visualizer.

    Returns the selected instances grouped by prediction index
    (0=entailment, 1=neutral, 2=contradiction).
    """
    num_select = 20
    pickle_name = "save_view_{}_{}".format(run_name, data_id)
    tokenizer = get_tokenizer()
    data_loader = get_modified_data_loader2(HPSENLI3(), NLIExTrainConfig())
    explain_entries = load_from_pickle(pickle_name)
    # Fixed: removed the no-op `explain_entries = explain_entries` and the
    # unused `num_tags`/`idx` bindings; behavior is unchanged.
    selected_instances = [[], [], []]
    for entry in explain_entries:
        x0, logits, scores = entry
        pred = np.argmax(logits)
        input_ids = x0
        # NOTE(review): input_ids is passed as both arguments here — confirm
        # this matches data_loader.split_p_h_with_input_ids's contract.
        p, h = data_loader.split_p_h_with_input_ids(input_ids, input_ids)
        p_tokens = restore_capital_letter(tokenizer.convert_ids_to_tokens(p))
        h_tokens = restore_capital_letter(tokenizer.convert_ids_to_tokens(h))
        target_tag = ["match", "mismatch", "conflict"][pred]
        tag_idx = data_generator.NLI.nli_info.tags.index(target_tag)
        tag_name = data_generator.NLI.nli_info.tags[tag_idx]
        p_score, h_score = data_loader.split_p_h_with_input_ids(
            scores[tag_idx], input_ids)
        p_score = normalize(p_score)
        h_score = normalize(h_score)
        p_row = [Cell("\\textbf{P:}")] + cells_from_tokens(p_tokens, p_score)
        h_row = [Cell("\\textbf{H:}")] + cells_from_tokens(h_tokens, h_score)
        pred_str = ["entailment", "neutral", "contradiction"][pred]
        apply_color(p_row, tag_name)
        apply_color(h_row, tag_name)
        # Keep collecting until each class has num_select examples.
        if len(selected_instances[pred]) < num_select:
            selected_instances[pred].append((pred_str, [p_row, h_row]))
        if all([len(s) == num_select for s in selected_instances]):
            break
    for insts in selected_instances:
        for pred_str, rows in insts:
            tex_visulizer.write_instance(pred_str, rows)
    return selected_instances
def get_cell_from_token(token, log_odd):
    """Build a Cell colored by the sign of log_odd (blue positive, red
    negative, plain for zero); stopwords are forced to neutral."""
    if token in stopwords:
        log_odd = 0
    if log_odd == 0:
        return Cell(token)
    # Intensity grows with |log_odd|, capped at 150.
    intensity = min(150, abs(log_odd) * 50)
    hue = "B" if log_odd > 0 else "R"
    return Cell(token, intensity, target_color=hue)
def visualize_prediction_data(data_id):
    """Render masked-LM losses with vs. without dictionary definitions as an
    HTML table for one data shard.

    For each input entry, the first row is the loss without a definition,
    followed by one row per definition sample (highlighted when the
    definition reduces the loss by more than 10%).
    """
    tokenizer = get_tokenizer()
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id),
        "r").readlines()
    p = os.path.join(working_path, "entry_loss",
                     "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    instance_idx = 0
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    # Fixed: cap at the actual number of entries instead of unconditionally
    # overwriting n with 100 (which raised IndexError on shards with fewer
    # than 100 lines).
    n = min(100, len(num_samples_list))
    html = HtmlVisualizer("entry_prediction.html")
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = feature_itr.__next__()
        feature = Feature2Text(first_inst, tokenizer)
        html.write_headline("Input:")
        html.write_paragraph(feature.get_input_as_text(True, True))
        html.write_headline("Word:" + feature.get_selected_word_text())
        # Stop when the loss file has fewer outputs than the sample counts
        # imply.
        if instance_idx + n_sample >= len(loss_outputs):
            break
        if n_sample == 1:
            continue
        rows = []
        no_dict_loss = loss_outputs[instance_idx]
        rows.append([Cell(no_dict_loss, 0), Cell("")])
        instance_idx += 1
        for j in range(1, n_sample):
            feature = Feature2Text(feature_itr.__next__(), tokenizer)
            def_cell = Cell(feature.get_def_as_text())
            loss = loss_outputs[instance_idx]
            # Highlight definitions that reduce loss by more than 10%.
            hl_score = 100 if loss < no_dict_loss * 0.9 else 0
            rows.append([Cell(loss, hl_score), def_cell])
            instance_idx += 1
        html.write_table(rows)
def get_cell_from_token2(token, probs):
    """Build a Cell whose highlight scales with the probability value;
    stopwords (case-insensitive) are zeroed out."""
    if token.lower() in stopwords:
        probs = 0
    # Probabilities are tiny; scale up by 1e5 and clamp at 100.
    scaled = min(100, probs * 1e5)
    return Cell(token, scaled)
def cells_from_tokens(tokens, scores=None, stop_at_pad=True):
    """Convert wordpiece tokens into Cells, joining "##" continuations.

    Spacing flags are chosen so continuation pieces and dependent tokens
    attach to their neighbors without a gap. Stops at the first "[PAD]"
    unless stop_at_pad is False. Scores default to 0 when not given.
    """
    cells = []
    for i, token in enumerate(tokens):
        if token == "[PAD]" and stop_at_pad:
            break
        has_next = i + 1 < len(tokens)
        cont_left = token[:2] == "##"
        cont_right = has_next and tokens[i + 1][:2] == "##"
        dependent_left = is_dependent(token)
        dependent_right = has_next and is_dependent(tokens[i + 1])
        # Strip the continuation marker before rendering.
        term = token[2:] if cont_left else token
        space_left = "" if (cont_left or dependent_left) else " "
        space_right = "" if (cont_right or dependent_right) else " "
        score = scores[i] if scores is not None else 0
        cells.append(Cell(term, score, space_left, space_right))
    return cells
def get_table(get_pair_score_at):
    """Build a score-matrix table: premise tokens as column headers,
    hypothesis tokens as row headers, each cell carrying a tooltip that
    shows the one-deletion -> two-deletion score transition."""
    rows = [[Cell("")] + [Cell(t) for t in p_tokens]]
    for h_idx in range(h_idx_max + 1):
        row = [Cell(h_tokens[h_idx])]
        for p_idx in range(p_idx_max + 1):
            pair_score = get_pair_score_at((p_idx, h_idx))
            one_del_score = group_summary.score_d[(p_idx, -1)]
            two_del_score = group_summary.score_d[(p_idx, h_idx)]
            tooltip = "{} -> {}".format(
                float_arr_to_str_arr(one_del_score),
                float_arr_to_str_arr(two_del_score))
            row.append(get_tooltip_cell(two_digit_float(pair_score), tooltip))
        rows.append(row)
    return rows
def main():
    """Visualize per-claim confidence lists: one row per claim id, one
    shaded cell per confidence value."""
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, "cppnc_triple_all_dev_info")
    pred_file_path = os.path.join(out_dir, save_name + ".score")
    cid_and_confidences = get_confidence_list_per_cid(info_file_path,
                                                      pred_file_path)
    rows = []
    for cid, confidence_list in cid_and_confidences.items():
        # First cell is the claim id, then one highlight-only cell per score.
        row = [Cell(str(cid))]
        row.extend(Cell("", highlight_score=c * 100)
                   for c in confidence_list)
        rows.append(row)
    html = HtmlVisualizer("confidence.html")
    html.write_table(rows)
def main():
    """Visualize token-level mismatch explanation scores plus per-layer
    sigmoid probabilities for the alamri mismatch predictions."""
    save_name = "alamri_mismatch_all"
    output_d = load_from_pickle(save_name)
    html = HtmlVisualizer("alamri_mismatch.html")
    tokenizer = get_tokenizer()
    logits_grouped_by_layer = output_d["per_layer_logits"]
    num_layers = 12

    def float_arr_to_cell(head, float_arr):
        # Label cell followed by two-digit-formatted value cells.
        return [Cell(head)] + lmap(Cell, map(two_digit_float, float_arr))

    # Fixed: removed the unused float_arr_to_cell2 helper and replaced
    # `range(num_data)[:100]` with a direct min() bound.
    num_data = len(output_d['input_ids'])
    for data_idx in range(min(num_data, 100)):
        def get(name):
            return output_d[name][data_idx]

        tokens = tokenizer.convert_ids_to_tokens(get("input_ids"))
        ex_scores = get('ex_scores')
        probs = scipy.special.softmax(get('logits'))
        pred_str = make_prediction_summary_str(probs)
        html.write_paragraph("Prediction: {}".format(pred_str))
        html.write_paragraph("gold label={}".format(get("label")))
        row1 = [Cell("")] + [Cell(t, int(s * 100))
                             for t, s in zip(tokens, ex_scores)]
        row2 = float_arr_to_cell("ex_prob", ex_scores)
        # Emphasize tokens whose explanation probability exceeds 0.5
        # (offset by 1 to skip the label cell).
        for i, s in enumerate(ex_scores):
            if s > 0.5:
                row2[i + 1].highlight_score = 100
        rows = [row1, row2]
        for layer_no in range(num_layers):
            layer_logit = logits_grouped_by_layer[layer_no][data_idx]
            probs = sigmoid(layer_logit)
            # Column 1 holds the positive-class probability.
            rows.append(float_arr_to_cell("layer_{}".format(layer_no),
                                          probs[:, 1]))
        html.write_table(rows)
def get_cells(tokens, scores):
    """Cells for tokens with scores rescaled so the maximum maps to 100.

    The cap is floored at 1 to avoid division issues when all scores
    are zero or negative.
    """
    cap = max(max(scores), 1)
    factor = 100 / cap
    return [Cell(tok, min(s * factor, 100))
            for tok, s in zip(tokens, scores)]
def main():
    """Write (data_id, claim_id, claim text, doc link) rows for up to 10
    documents per query."""
    html = HtmlVisualizer("doc_relevance_and_value.html")
    rows = []
    data_id = 0
    for query, k_list in load_qk():
        claim_id = query.query_id
        claim_text = query.text
        # De-duplicate the candidate doc ids for this query.
        doc_ids = set(k.doc_id for k in k_list)
        for doc_id in list(doc_ids)[:10]:
            url = os.path.join(output_path, "pc_docs_html", doc_id + ".html")
            anchor = "<a href=\"{}\">url</a>".format(url)
            rows.append([Cell(data_id), Cell(claim_id),
                         Cell(claim_text), Cell(anchor)])
            data_id += 1
    html.write_table(rows)
def format_scores(raw_scores):
    """Format raw scores as 2-decimal cells shaded by their normalized
    value; the whole row is colored green for the "mismatch" tag and red
    for "conflict"."""
    norm_scores = normalize_fn(raw_scores)
    cells = [Cell("{0:.2f}".format(raw), norm, False, False)
             for raw, norm in zip(raw_scores, norm_scores)]
    if tag_name == "mismatch":
        set_cells_color(cells, "G")
    elif tag_name == "conflict":
        set_cells_color(cells, "R")
    return cells
def show_prediction(filename, file_path, correctness_1, correctness_2):
    """Render P/H token tables (alt-embedding mask highlighted) for the
    instances that at least one of the two evaluated models got wrong."""
    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        # Predictions and raw features are iterated in lockstep; the
        # input_ids equality assert below guards this alignment.
        features = itr.__next__()
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        # Highlight (score 100) tokens whose alt-embedding mask is set.
        p_cells = [
            Cell(p_tokens[i], 100 if p_mask[i] else 0)
            for i in range(len(p_tokens))
        ]
        h_cells = [
            Cell(h_tokens[i], 100 if h_mask[i] else 0)
            for i in range(len(h_tokens))
        ]
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        # NOTE(review): `pred` is computed but never used below.
        pred = np.argmax(logits)
        # Only render instances where at least one model was wrong.
        if not correctness_1[idx] or not correctness_2[idx]:
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])
        idx += 1
def write_feature_to_html(feature, html, tokenizer):
    """Write one feature as a headline (its label) plus a token row where
    focus-masked positions are highlighted."""
    input_ids = take(feature['input_ids'])
    focus_mask = take(feature['focus_mask'])
    label_ids = take(feature['label_ids'])
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    row = [Cell(tok, 100 if flag else 0)
           for tok, flag in zip(tokens, focus_mask)]
    html.write_headline("{}".format(label_ids[0]))
    html.multirow_print(row)
def main():
    """Visualize gold token scores for sampled entries: blue/red by score
    sign, green at half intensity for tokens from the first segment,
    zeroed for stopwords."""
    file_path = sys.argv[1]
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("toke_score_gold.html")
    stopwords = load_stopwords_for_query()
    skip = 10
    # Fixed: removed unused locals (`name`, `log_label_ids`, `logits`);
    # behavior is otherwise unchanged.
    for entry_idx, entry in enumerate(viewer):
        # Sample one entry out of every `skip`.
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = entry.get_vector("label_ids")
        label_ids = np.reshape(label_ids, [-1, 2])
        seg1, seg2 = split_p_h_with_input_ids(tokens, input_ids)
        pad_idx = tokens.index("[PAD]")
        assert pad_idx > 0
        cells = []
        # The second (empty) row is kept for layout parity in the output.
        cells2 = []
        for idx in range(pad_idx):
            probs = label_ids[idx]
            token = tokens[idx]
            score = probs[0]
            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                highlight_score = 50
                color = "G"
            cells.append(Cell(token, highlight_score=highlight_score,
                              target_color=color))
        html.multirow_print_from_cells_list([cells, cells2])
        if entry_idx > 10000:
            break
def analyze_hv(hv_tt, hv_lm, tt_grad, tokenizer):
    """Compare hidden vectors from the task-trained vs. LM models per token,
    printing per-position diff counts and writing a highlighted token row
    per instance to "Preserved.html"."""
    # NOTE(review): batch_size is unused below.
    batch_size = 16
    seq_len = 200
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(tt_grad, seq_len, hidden_dim, False)
    # Both reshape calls return the same x_list; the second assignment
    # overwrites the first.
    hv_tt, x_list = reshape(hv_tt)
    hv_lm, x_list = reshape(hv_lm)
    assert len(hv_lm) == len(hv_tt)
    html = HtmlVisualizer("Preserved.html")
    for inst_i in range(len(hv_lm)):
        print("\t", end="")
        tokens = tokenizer.convert_ids_to_tokens(x_list[inst_i])
        # Print the token row as a tab-separated header line.
        for seq_i in range(seq_len):
            token = tokenizer.convert_ids_to_tokens([x_list[inst_i, seq_i]])[0]
            print("{}".format(token), end="\t")
        print()
        scores = []
        for layer_i in range(13):
            # Only layer 1 is analyzed; the loop skeleton is kept for the
            # other layers (embedding layer would be layer 0).
            if layer_i != 1:
                continue
            layer_no = layer_i
            if layer_no >= 1:
                print("Layer {} :".format(layer_no), end="\t")
            else:
                print("Embedding:", end="\t")
            for seq_i in range(seq_len):
                # n_diff_1: count of differing hidden dims; n_diff_2 factors
                # in the gradient (see diff_and_grad).
                n_diff_1, n_diff_2 = diff_and_grad(
                    hv_lm[inst_i, layer_i, seq_i],
                    hv_tt[inst_i, layer_i, seq_i],
                    reshaped_grad[inst_i, layer_i, seq_i])
                scores.append(n_diff_1)
                print("{}({})".format(n_diff_1, n_diff_2), end="\t")
            print("\n")
        row = []
        for t, s in zip(tokens, scores):
            # Fraction of hidden dimensions that differ, as a 0-100 score.
            score = s / hidden_dim * 100
            row.append(Cell(t, score))
        html.write_table([row])
        print("-----------------")
def per_doc_score():
    """Visualize per-token layer-preservation counts for the first ~100
    entries, printed 20 tokens per table row (token row + count row)."""
    filename = "fetch_hidden_dim.pickle"
    html_writer = HtmlVisualizer("preserved.html", dark_mode=False)
    # Fixed: removed an unused pickle load of the same file that also
    # leaked an open file handle, plus the unused `valid_parst` slice.
    n_skip = 0
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break
        count_preserved = entry.get_vector("layer_count")
        tokens = entry.get_tokens("input_ids")
        cells = data.cells_from_tokens(tokens)
        avg = np.average(count_preserved)
        row = []
        row2 = []
        # Previously gated on `avg > 20`; currently every entry is printed.
        f_print = True
        print(avg)
        if f_print:
            html_writer.write_paragraph("Skipped {} articles".format(n_skip))
            n_skip = 0
            for idx, cell in enumerate(cells):
                # Scale the preservation count (out of 728) to a 0-100 score.
                score = count_preserved[idx] / 728 * 100
                cell.highlight_score = score
                row.append(cell)
                row2.append(Cell(count_preserved[idx], score))
                # Flush every 20 tokens as a two-row table.
                if len(row) == 20:
                    html_writer.write_table([row, row2])
                    row = []
                    row2 = []
            html_writer.write_paragraph(str(avg))
        else:
            n_skip += 1
def show(filename):
    """Render token-scoring predictions vs. gold labels: for each entry,
    a token row, a prediction row, and a gold row (20 cells per line)."""
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer("token_scoring.html", dark_mode=False)
    correctness = []
    for entry in data:
        tokens = entry.get_tokens("input_ids")
        logits = entry.get_vector("logits")
        masks = entry.get_vector("label_masks")
        ids = entry.get_vector("labels")
        token_row = []
        pred_row = []
        gold_row = []
        for idx, token in enumerate(tokens):
            if token == "[PAD]":
                break
            token_cell = Cell(token)
            model_score = logits[idx][0]
            if masks[idx]:
                # Blue when the model score agrees in sign with the gold
                # label, red otherwise; green for agreeing positives.
                same_sign = ((model_score > 0 and ids[idx] > 0)
                             or (model_score < 0 and ids[idx] < 0))
                color = "B" if same_sign else "R"
                if same_sign and model_score > 0 and ids[idx] > 0:
                    color = "G"
                pred_cell = Cell("{0:.2f}".format(model_score), 100,
                                 target_color=color)
                gold_cell = Cell("{0:.2f}".format(ids[idx]), 100,
                                 target_color=color)
            else:
                # Unmasked positions render as blank cells.
                pred_cell = Cell("")
                gold_cell = Cell("")
            token_row.append(token_cell)
            pred_row.append(pred_cell)
            gold_row.append(gold_cell)
        html_writer.multirow_print_from_cells_list(
            [token_row, pred_row, gold_row], 20)
def get_cell(token) -> Cell:
    """Return a Cell highlighted iff the token occurs in pc_tokens_set
    (case-insensitive membership check)."""
    is_known = token.lower() in pc_tokens_set
    return Cell(token, 100 if is_known else 0)
def main():
    """Compare windowed vs. windowed-small relevance scores for robust04
    documents, then render per-token score differences to "windowed.html"."""
    n_factor = 16
    step_size = 16
    max_seq_length = 128
    max_seq_length2 = 128 - 16
    batch_size = 8
    info_file_path = at_output_dir("robust", "seg_info")
    queries = load_robust_04_query("desc")
    qid_list = get_robust_qid_list()
    f_handler = get_format_handler("qc")
    info: Dict = load_combine_info_jsons(info_file_path,
                                         f_handler.get_mapping(),
                                         f_handler.drop_kdp())
    print(len(info))
    tokenizer = get_tokenizer()
    # Only job index 1 is processed here.
    for job_idx in [1]:
        qid = qid_list[job_idx]
        query = queries[str(qid)]
        q_term_length = len(tokenizer.tokenize(query))
        data_path1 = os.path.join(output_path, "robust",
                                  "windowed_{}.score".format(job_idx))
        data_path2 = os.path.join(output_path, "robust",
                                  "windowed_small_{}.score".format(job_idx))
        data1 = OutputViewer(data_path1, n_factor, batch_size)
        data2 = OutputViewer(data_path2, n_factor, batch_size)
        # 3 accounts for [CLS]/[SEP]/[SEP] special tokens.
        segment_len = max_seq_length - 3 - q_term_length
        segment_len2 = max_seq_length2 - 3 - q_term_length
        outputs = []
        for d1, d2 in zip(data1, data2):
            # for each query, doc pairs
            cur_info1 = info[d1['data_id']]
            cur_info2 = info[d2['data_id']]
            query_doc_id1 = f_handler.get_pair_id(cur_info1)
            query_doc_id2 = f_handler.get_pair_id(cur_info2)
            assert query_doc_id1 == query_doc_id2
            doc = d1['doc']
            probs = get_probs(d1['logits'])
            probs2 = get_probs(d2['logits'])
            n_pred_true = np.count_nonzero(np.less(0.5, probs))
            print(n_pred_true, len(probs))
            seg_scores: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs, segment_len, step_size)
            seg_scores2: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs2, segment_len2, step_size)
            ss_list = []
            for st, ed, score in seg_scores:
                try:
                    # Match the segment from the smaller window that ends at
                    # the same position; find_where raises StopIteration if
                    # no such segment exists, which we treat as "skip".
                    st2, ed2, score2 = find_where(lambda x: x[1] == ed,
                                                  seg_scores2)
                    assert ed == ed2
                    assert st < st2
                    tokens = tokenizer.convert_ids_to_tokens(doc[st:st2])
                    # Score difference attributable to the extra left context.
                    diff = score - score2
                    ss = ScoredPiece(st, st2, diff, tokens)
                    ss_list.append(ss)
                except StopIteration:
                    pass
            outputs.append((probs, probs2, query_doc_id1, ss_list))
        html = HtmlVisualizer("windowed.html")
        for probs, probs2, query_doc_id, ss_list in outputs:
            html.write_paragraph(str(query_doc_id))
            html.write_paragraph("Query: " + query)
            ss_list.sort(key=lambda ss: ss.st)
            prev_end = None
            cells = []
            prob_str1 = lmap(two_digit_float, probs)
            # NOTE(review): "8.88" looks like a visual alignment placeholder
            # prepended to the first prob row — confirm intent.
            prob_str1 = ["8.88"] + prob_str1
            prob_str2 = lmap(two_digit_float, probs2)
            html.write_paragraph(" ".join(prob_str1))
            html.write_paragraph(" ".join(prob_str2))
            for ss in ss_list:
                # Pieces are expected to be contiguous after sorting.
                if prev_end is not None:
                    assert prev_end == ss.st
                else:
                    print(ss.st)
                score = abs(int(100 * ss.score))
                # NOTE(review): `score` is an absolute value, so "R" is only
                # reachable when score == 0; if the color was meant to encode
                # the sign of ss.score, this should test ss.score instead.
                color = "B" if score > 0 else "R"
                cells.extend(
                    [Cell(t, score, target_color=color) for t in ss.tokens])
                prev_end = ss.ed
            html.multirow_print(cells)
def make_cell(subword: Subword): if subword in highlight_terms: return Cell(subword, highlight_score=100) else: return Cell(subword)
def draw(in_file, out_file): filename = os.path.join(output_path, in_file) data = EstimatorPredictionViewerGosford(filename) amp = 10 html_writer = HtmlVisualizer(out_file, dark_mode=False) tokenizer = get_tokenizer() for inst_i, entry in enumerate(data): if inst_i > 100: break input_ids = entry.get_vector("input_ids") tokens = tokenizer.convert_ids_to_tokens(input_ids) #tokens = entry.get_tokens("input_ids") prob1 = entry.get_vector("prob1") prob2 = entry.get_vector("prob2") proximity = (1 - (prob1 - prob2)) difficulty = np.power(1 - prob1, 0.3) scores = proximity * difficulty prob_scores = probabilty(scores, amp) prob_strs = ["{:06.6f}".format(v * 1000) for v in prob_scores] def normalize(prob): # 0-> Good # -1 -> Bad return prob * 1000 * 25 norm_scores = lmap(normalize, prob_scores) cells = data.cells_from_tokens(tokens, norm_scores, stop_at_pad=False) cells2 = data.cells_from_anything(prob1, norm_scores) cells3 = data.cells_from_anything(prob2, norm_scores) cells31 = data.cells_from_anything(difficulty, norm_scores) cells4 = data.cells_from_anything(scores, norm_scores) cells5 = data.cells_from_anything(prob_strs, norm_scores) row1 = [Cell("TEXT:")] row2 = [Cell("prob1:")] row3 = [Cell("prob2:")] row31 = [Cell("difficulty:")] row4 = [Cell("priority:")] row5 = [Cell("Mask Prob")] for idx, cell in enumerate(cells): row1.append(cell) row2.append(cells2[idx]) row3.append(cells3[idx]) row31.append(cells31[idx]) row4.append(cells4[idx]) row5.append(cells5[idx]) if len(row1) == 20: html_writer.write_table([row1, row2, row3, row31, row4, row5]) row1 = [Cell("TEXT:")] row2 = [Cell("prob1:")] row3 = [Cell("prob2:")] row31 = [Cell("difficulty:")] row4 = [Cell("priority:")] row5 = [Cell("Mask Prob")]