def view_grad_overlap_hidden():
    """Render per-layer 'h_overlap' sums and stds for each instance as HTML rows."""
    filename = "ukp_feature_overlap.pickle"
    obj = pickle.load(open(os.path.join(output_path, filename), "rb"))  # NOTE(review): loaded but never used
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        # h_overlap presumably has shape [num_layer, seq_length, hidden_dim] -- TODO confirm
        h_overlap = entry.get_vector('h_overlap')
        std = np.std(h_overlap, axis=2)
        mean = np.mean(h_overlap, axis=2)  # NOTE(review): computed but never used
        h_overlap = np.sum(h_overlap, axis=2)
        highlight = lmap(is_mask, tokens)
        cells = data.cells_from_tokens(tokens, highlight)
        rows = [cells]
        # For each of the 12 layers, add one row of summed overlap and one of std.
        for layer_i in range(12):
            e = h_overlap[layer_i, :]
            e = [v * 1e6 for v in e]  # scale so small values show up as highlight scores
            cells = data.cells_from_scores(e)
            rows.append(cells)
            e = [v * 1e8 for v in std[layer_i, :]]
            cells2 = data.cells_from_scores(e)
            rows.append(cells2)
        print(entry.get_vector("masked_lm_example_loss"))
        html_writer.multirow_print_from_cells_list(rows, 40)
def draw2(in_file, out_file):
    """Render per-token prob1/prob2 and per-example losses for each instance.

    Args:
        in_file: prediction pickle name under output_path.
        out_file: destination HTML file name.
    """
    filename = os.path.join(output_path, in_file)
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer(out_file, dark_mode=False)
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break  # cap output size
        tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")
        real_loss1 = entry.get_vector("per_example_loss1")
        real_loss2 = entry.get_vector("per_example_loss2")
        masked_lm_positions = entry.get_vector("masked_lm_positions")
        # Tag each masked position in-line so rows below can be matched to tokens.
        for i, loc in enumerate(masked_lm_positions):
            tokens[loc] = "[{}:{}]".format(i, tokens[loc])
        html_writer.multirow_print(data.cells_from_tokens(tokens))
        row2 = [Cell("prob1:")] + data.cells_from_anything(prob1)
        row3 = [Cell("prob2:")] + data.cells_from_anything(prob2)
        row4 = [Cell("real_loss1:")] + data.cells_from_anything(real_loss1)
        row5 = [Cell("real_loss2:")] + data.cells_from_anything(real_loss2)
        html_writer.multirow_print_from_cells_list([row2, row3, row4, row5])
def draw():
    """Visualize per-token gradients for one prediction run.

    Tokens are colored blue for positive gradients, red for negative, with
    highlight intensity proportional to |gradient|.
    """
    #name = "pc_para_D_grad"
    name = "pc_para_I_grad"
    #name = "pc_para_H_grad"
    data = EstimatorPredictionViewerGosford(name)
    html_writer = HtmlVisualizer(name + ".html", dark_mode=False)
    for inst_i, entry in enumerate(data):
        tokens = entry.get_tokens("input_ids")
        grad = entry.get_vector("gradient")
        cells = data.cells_from_tokens(tokens)
        for i, cell in enumerate(cells):
            # Scale |grad| into the 0-255 highlight range, clipping at 255.
            cells[i].highlight_score = min(abs(grad[i]) * 1e4, 255)
            cells[i].target_color = "B" if grad[i] > 0 else "R"
        print(grad)
        prob = softmax(entry.get_vector("logits"))
        pred = np.argmax(prob)
        label = entry.get_vector("labels")
        html_writer.write_paragraph("Label={} / Pred={}".format(str(label), pred))
        html_writer.multirow_print(cells)
def view_grad_overlap_per_mask():
    """For each masked position, show its overlap score and the model's top-5 predicted terms."""
    filename = "ukp_lm_probs.pickle"
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    tokenizer = data.tokenizer
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        highlight = lmap(is_mask, tokens)
        scores = entry.get_vector("overlap_score")
        pos_list = entry.get_vector("masked_lm_positions")
        probs = entry.get_vector("masked_lm_log_probs")
        # Assumes 20 masked slots per instance; remaining axis is vocab size -- TODO confirm
        probs = np.reshape(probs, [20, -1])
        rows = []
        for score, position, prob in zip(scores, pos_list, probs):
            # Prefix the token with its position so table rows can be matched to tokens.
            tokens[position] = "{}-".format(position) + tokens[position]
            row = [Cell(position), Cell(score)]
            # Top-5 vocabulary entries by log-prob for this masked slot.
            for idx in np.argsort(prob)[::-1][:5]:
                term = tokenizer.inv_vocab[idx]
                p = math.exp(prob[idx])
                row.append(Cell(term))
                row.append(Cell(p))
            rows.append(row)
        cells = data.cells_from_tokens(tokens, highlight)
        for score, position in zip(scores, pos_list):
            # Map raw score into 0-255 highlight range; assumes scores ~0..10000 -- TODO confirm
            cells[position].highlight_score = score / 10000 * 255
        html_writer.multirow_print(cells, 20)
        html_writer.write_table(rows)
def per_doc_score():
    """Render documents whose average priority score is outside (-0.30, -0.11)."""
    filename = "tlm_view.pickle"
    html_writer = HtmlVisualizer("per_doc_score.html", dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    amp = 20  # NOTE(review): unused
    small_threshold = 40
    for inst_i, entry in enumerate(data):
        if inst_i > 1000:
            break  # cap output size
        scores = entry.get_vector("priority_score")
        tokens = entry.get_mask_resolved_input_mask_with_input()
        cells = data.cells_from_tokens(tokens)
        if len(cells) < small_threshold:
            continue  # skip very short documents
        avg_score = average(scores)
        # Skip mid-range averages; only extremes get rendered.
        if -0.11 > avg_score > -0.30:
            continue
        print(average(scores))
        html_writer.write_headline(avg_score)
        rows = []  # NOTE(review): unused
        row = []
        # Emit full 20-cell rows; a trailing partial row is dropped -- confirm intended.
        for idx, cell in enumerate(cells):
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
def view_grad_overlap():
    """Analyze 'overlap_score' against masked-LM loss and render each instance with stats."""
    filename = "gradient_overlap_4K.pickle"
    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    iba = IntBinAverage()
    scores = []
    # Pass 1: collect global stats and per-integer-loss-bin averages of score/loss.
    for inst_i, entry in enumerate(data):
        masked_lm_example_loss = entry.get_vector("masked_lm_example_loss")
        score = entry.get_vector("overlap_score")
        if masked_lm_example_loss > 1:
            norm_score = score / masked_lm_example_loss
            iba.add(masked_lm_example_loss, norm_score)
        scores.append(score)
    score_avg = average(scores)
    score_std = np.std(scores)
    avg = iba.all_average()
    std_dict = {}
    for key, values in iba.list_dict.items():
        std_dict[key] = np.std(values)
        if len(values) == 1:
            std_dict[key] = 999  # std undefined for a single sample; use sentinel
    def unlikeliness(value, mean, std):
        # Distance from the mean measured in standard deviations.
        return abs(value - mean) / std
    # Pass 2: re-open the viewer (it was consumed) and render every instance.
    data = EstimatorPredictionViewerGosford(filename)
    print("num record : ", data.data_len)
    cnt = 0
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        masked_lm_example_loss = entry.get_vector("masked_lm_example_loss")
        highlight = lmap(is_mask, tokens)
        score = entry.get_vector("overlap_score")
        print(score)
        cells = data.cells_from_tokens(tokens, highlight)
        if masked_lm_example_loss > 1:
            bin_key = int(masked_lm_example_loss)
            norm_score = score / masked_lm_example_loss
            if norm_score > 5000:
                cnt += 1
            expectation = avg[bin_key]
            # NOTE(review): 'or True' disables the unlikeliness filter; everything renders
            if unlikeliness(score, score_avg, score_std) > 2 or True:
                html_writer.multirow_print(cells, 20)
                if norm_score > expectation:
                    html_writer.write_paragraph("High")
                else:
                    html_writer.write_paragraph("Low")
                html_writer.write_paragraph("Norm score: " + str(norm_score))
                html_writer.write_paragraph("score: " + str(score))
                html_writer.write_paragraph("masked_lm_example_loss: " + str(masked_lm_example_loss))
                html_writer.write_paragraph("expectation: " + str(expectation))
    print("number over 5000: ", cnt)
def bert_baseline_repeat():
    """Write ranked lists for the three frozen-BERT eHealth runs (indices 3-5)."""
    info = load_from_pickle("eHealth_test_info")
    for run_idx in (3, 4, 5):
        out_path = pjoin(subdir_root, "bert_baseline_{}.txt".format(run_idx))
        pred_data = EstimatorPredictionViewerGosford(
            "eHealth_bert_freeze_{}".format(run_idx))
        prediction_to_ranked_list(pred_data, info, out_path)
def statistics_tlm():
    """Bin per-token prob1 into 0.05-wide intervals and compute prob2 stats per bin.

    Returns:
        (bin_fn, mean_dict, std_dict): bin_fn maps a probability to its
        (start, end) interval key; the dicts map that key to the mean and
        std of prob2 values whose prob1 fell in the bin.
    """
    filename = "blc_cold_scores.pickle"
    data = EstimatorPredictionViewerGosford(filename)
    bins = {}
    bin_fn = get_bin_fn_from_interval(0, 1.05, 0.05)
    for inst_i, entry in enumerate(data):
        loss1 = entry.get_vector("lm_loss1")
        loss2 = entry.get_vector("lm_loss2")
        prob1 = loss_to_prob(loss1)
        prob2 = loss_to_prob(loss2)
        tokens = entry.get_mask_resolved_input_mask_with_input()
        # Group each token's prob2 by the prob1 interval it falls into.
        for i, _ in enumerate(tokens):
            key = bin_fn(prob1[i])
            if key not in bins:
                bins[key] = []
            bins[key].append(prob2[i])
    # bin_fn presumably returns "Unidentifed" (sic) for out-of-range values -- TODO confirm
    keys = list([k for k in bins.keys() if not k == "Unidentifed"])
    keys.sort(key=lambda x:x[0])  # order intervals by their start value
    mean_dict = {}
    std_dict = {}
    for key in keys:
        l = average(bins[key])
        std = np.std(bins[key])
        mean_dict[key] = l
        std_dict[key] = std
        st, ed = key
        #print("{0:.2f} {1:.2f}".format(st, ed), l)
    return bin_fn, mean_dict, std_dict
def compare_grad_overlap():
    """Print the shape of one run's overlap scores and mean/std summaries of both runs."""
    viewer_a = EstimatorPredictionViewerGosford("gradient_overlap_4K.pickle")
    viewer_b = EstimatorPredictionViewerGosford("ukp_lm_overlap.pickle")
    scores_a = viewer_a.vectors["overlap_score"]
    scores_b = viewer_b.vectors["overlap_score"]
    print(scores_a.shape)

    def summary(values):
        # One-line mean / standard deviation report.
        print(average(values), np.std(values))

    summary(scores_a)
    summary(scores_b)
def view():
    """Print the count and mean of per-instance NLI dev losses."""
    path = os.path.join(output_path, "nli_dev_loss.pickle")
    viewer = EstimatorPredictionViewerGosford(path)
    loss_arr = [float(entry.get_vector("loss")) for entry in viewer]
    print(len(loss_arr))
    print("avg:", average(loss_arr))
def per_doc_score():
    """Visualize how many layers preserved each token ('layer_count') per document.

    NOTE: this shadows the earlier per_doc_score() definition in this module.
    """
    filename = "fetch_hidden_dim.pickle"
    html_writer = HtmlVisualizer("preserved.html", dark_mode=False)
    p = os.path.join(output_path, filename)
    raw_data = pickle.load(open(p, "rb"))  # NOTE(review): loaded but unused; viewer re-reads the file
    n_skip = 0
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break  # cap output size
        count_preserved = entry.get_vector("layer_count")
        tokens = entry.get_tokens("input_ids")
        cells = data.cells_from_tokens(tokens)
        avg = np.average(count_preserved)
        row = []
        row2 = []
        #f_print = avg > 20
        f_print = True
        print(avg)
        if f_print:
            html_writer.write_paragraph("Skipped {} articles".format(n_skip))
            n_skip = 0
            for idx, cell in enumerate(cells):
                # Map a 0..728 layer count onto a 0..100 highlight score -- confirm 728 max.
                score = count_preserved[idx] / 728 * 100
                cell.highlight_score = score
                row.append(cell)
                row2.append(Cell(count_preserved[idx], score))
                if len(row) == 20:
                    html_writer.write_table([row, row2])
                    row = []
                    row2 = []
            html_writer.write_paragraph(str(avg))
        else:
            n_skip += 1
def loss_view():
    """Display each instance's masked tokens followed by its per-example LM losses."""
    filename = "sero_pred.pickle"
    p = os.path.join(output_path, filename)
    raw = pickle.load(open(p, "rb"))
    print(raw[0]["masked_lm_example_loss"].shape)
    print(raw[0]["masked_input_ids"].shape)
    html_writer = HtmlVisualizer("sero_pred.html", dark_mode=False)
    viewer = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(viewer):
        losses = entry.get_vector("masked_lm_example_loss")
        print(losses)
        tokens = entry.get_tokens("masked_input_ids")
        cells = viewer.cells_from_tokens(tokens)
        # Emit full 20-cell rows; any trailing partial row is skipped (as before).
        n_full = len(cells) // 20
        for k in range(n_full):
            html_writer.write_table([cells[k * 20:(k + 1) * 20]])
        html_writer.multirow_print(viewer.cells_from_anything(losses), 20)
def lexical_tendency():
    """Average per-layer 'h_overlap' scores by token bigram, then print extremes per layer."""
    filename = "nli_lm_feature_overlap\\11.pickle"
    print("Loading file")
    data = EstimatorPredictionViewerGosford(filename)
    print("Done")
    # One running average per layer, keyed by bigram.
    dva_list = list([DictValueAverage() for _ in range(12)])
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        h_overlap = entry.get_vector('h_overlap')  #[num_layer, seq_length, hidden_dim]
        for layer_i in range(12):
            scores = h_overlap[layer_i, :]
            # Assumes fixed sequence length of 512 -- TODO confirm
            for loc in range(1, 512):
                bigram = combine(tokens[loc-1], tokens[loc])
                # Skip NaN/Inf so they do not poison the running averages.
                if math.isnan(scores[loc]) or math.isinf(scores[loc]):
                    pass
                else:
                    dva_list[layer_i].add(bigram, scores[loc])
    print("Total data:", data.data_len)
    for layer_i in range(12):
        all_avg = dva_list[layer_i].all_average()
        l = list(all_avg.items())
        l.sort(key=lambda x:x[1], reverse=True)
        print("Layer : ", layer_i)
        print_cnt = 0
        print("Top-k")
        # Highest-averaged bigrams with more than 10 observations.
        for k, v in l:
            if dva_list[layer_i].cnt_dict[k] > 10:
                print(k, v)
                print_cnt += 1
                if print_cnt > 10:
                    break
        print("Low-k")
        print_cnt = 0
        # Lowest-averaged bigrams; stricter support threshold (>100 observations).
        for k, v in l[::-1]:
            if dva_list[layer_i].cnt_dict[k] > 100:
                print(k, v)
                print_cnt += 1
                if print_cnt > 10:
                    break
def get_correctness(filename, file_path):
    """Return a 0/1 list marking whether each prediction matches its gold label.

    The tfrecord at file_path must be aligned one-to-one with the predictions.
    """
    record_iter = load_record_v2(file_path)
    viewer = EstimatorPredictionViewerGosford(filename)
    results = []
    for entry in viewer:
        features = record_iter.__next__()
        # Sanity check: prediction stream and record stream must stay aligned.
        assert np.all(entry.get_vector("input_ids") == take(features["input_ids"]))
        gold = take(features["label_ids"])[0]
        predicted = np.argmax(entry.get_vector("logits"))
        results.append(1 if predicted == gold else 0)
    return results
def show(filename):
    """Render per-token scoring: tokens, model scores, and gold labels, colored by agreement.

    Blue = correct negative, green = correct positive, red = wrong sign.
    """
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer("token_scoring.html", dark_mode=False)
    for entry in data:
        tokens = entry.get_tokens("input_ids")
        logits = entry.get_vector("logits")
        masks = entry.get_vector("label_masks")
        ids = entry.get_vector("labels")
        token_row = []
        pred_row = []
        gold_row = []
        rows = [token_row, pred_row, gold_row]
        for idx, token in enumerate(tokens):
            token_cell = Cell(token)
            if token == "[PAD]":
                break  # padding marks the end of real tokens
            model_score = logits[idx][0]
            if masks[idx]:
                # Correct when the prediction and the gold label share a sign.
                correct = (model_score > 0 and ids[idx] > 0) or (model_score < 0 and ids[idx] < 0)
                color = "B" if correct else "R"
                if correct and (model_score > 0 and ids[idx] > 0):
                    color = "G"
                pred_cell = Cell("{0:.2f}".format(model_score), 100, target_color=color)
                gold_cell = Cell("{0:.2f}".format(ids[idx]), 100, target_color=color)
            else:
                token_cell = Cell(token)
                pred_cell = Cell("")
                gold_cell = Cell("")
            token_row.append(token_cell)
            pred_row.append(pred_cell)
            gold_row.append(gold_cell)
        html_writer.multirow_print_from_cells_list(rows, 20)
def show_prediction(filename, file_path, correctness_1, correctness_2):
    """Render premise/hypothesis pairs for instances that either model got wrong.

    Args:
        filename: prediction pickle name; also used to name the output HTML.
        file_path: tfrecord path aligned 1:1 with the predictions.
        correctness_1, correctness_2: per-instance 0/1 lists (see get_correctness).
    """
    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        features = itr.__next__()
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        # Record stream and prediction stream must stay aligned.
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        # Highlight tokens covered by alt_emb_mask.
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0) for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0) for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)  # NOTE(review): computed but never rendered
        # Only show instances that at least one of the two models got wrong.
        if not correctness_1[idx] or not correctness_2[idx]:
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])
        idx += 1
def clueweb12_13A():
    """Write the ranked list for the clueweb 12-13A eHealth prediction run."""
    pred_data = EstimatorPredictionViewerGosford("eHealth_pred.clueweb_12_13A")
    info = load_from_pickle("eHealth_test_info")
    destination = pjoin(subdir_root, "clef1_C.txt")
    prediction_to_ranked_list(pred_data, info, destination)
def draw(in_file, out_file):
    """Render per-token masking priority: prob1/prob2, difficulty, and mask probability.

    Args:
        in_file: prediction pickle name under output_path.
        out_file: destination HTML file name.
    """
    filename = os.path.join(output_path, in_file)
    data = EstimatorPredictionViewerGosford(filename)
    amp = 10  # amplification factor passed to probabilty()
    html_writer = HtmlVisualizer(out_file, dark_mode=False)
    tokenizer = get_tokenizer()
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break  # cap output size
        input_ids = entry.get_vector("input_ids")
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        #tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")
        # High when prob2 is close to (or above) prob1.
        proximity = (1 - (prob1 - prob2))
        # High when the token is hard for the first model.
        difficulty = np.power(1 - prob1, 0.3)
        scores = proximity * difficulty
        prob_scores = probabilty(scores, amp)
        prob_strs = ["{:06.6f}".format(v * 1000) for v in prob_scores]
        def normalize(prob):
            # 0-> Good
            # -1 -> Bad
            return prob * 1000 * 25
        norm_scores = lmap(normalize, prob_scores)
        cells = data.cells_from_tokens(tokens, norm_scores, stop_at_pad=False)
        cells2 = data.cells_from_anything(prob1, norm_scores)
        cells3 = data.cells_from_anything(prob2, norm_scores)
        cells31 = data.cells_from_anything(difficulty, norm_scores)
        cells4 = data.cells_from_anything(scores, norm_scores)
        cells5 = data.cells_from_anything(prob_strs, norm_scores)
        row1 = [Cell("TEXT:")]
        row2 = [Cell("prob1:")]
        row3 = [Cell("prob2:")]
        row31 = [Cell("difficulty:")]
        row4 = [Cell("priority:")]
        row5 = [Cell("Mask Prob")]
        # Emit 20-cell-wide table segments; label cells restart each segment.
        for idx, cell in enumerate(cells):
            row1.append(cell)
            row2.append(cells2[idx])
            row3.append(cells3[idx])
            row31.append(cells31[idx])
            row4.append(cells4[idx])
            row5.append(cells5[idx])
            if len(row1) == 20:
                html_writer.write_table([row1, row2, row3, row31, row4, row5])
                row1 = [Cell("TEXT:")]
                row2 = [Cell("prob1:")]
                row3 = [Cell("prob2:")]
                row31 = [Cell("difficulty:")]
                row4 = [Cell("priority:")]
                row5 = [Cell("Mask Prob")]
def main():
    """Produce the ranked list for the main eHealth prediction run."""
    pred_data = EstimatorPredictionViewerGosford("eHealth_pred")
    info = load_from_pickle("eHealth_test_info")
    destination = pjoin(subdir_root, "eHealth_list.txt")
    prediction_to_ranked_list(pred_data, info, destination)
def bert_baseline():
    """Produce the ranked list for the frozen-BERT eHealth baseline run."""
    pred_data = EstimatorPredictionViewerGosford("eHealth_bert_freeze")
    info = load_from_pickle("eHealth_test_info")
    destination = pjoin(subdir_root, "bert_baseline.txt")
    prediction_to_ranked_list(pred_data, info, destination)
def do():
    """Compare prob1/prob2 'usefulness' across two runs, render HTML reports, then
    fit and plot a degree-4 polynomial regression of p2 on p1.
    """
    # (pred pickle, tfrecord path, output HTML) for each run.
    todo = [
        ("RLPP_0.pickle", "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0", "RLPP_wiki.html"),
        ("ukp_rel.pickle", "C:\\work\\Code\\Chair\\output\\tf_enc", "RLPP_ukp.html")
    ]
    x = []
    y = []
    for pred_file_name, record_file_name, out_name in todo:
        viewer = EstimatorPredictionViewerGosford(pred_file_name)
        html = HtmlVisualizer(out_name)
        itr1 = load_record_v2(record_file_name)
        itr2 = viewer.__iter__()
        cnt = 0
        for features, entry in zip(itr1, itr2):
            cnt += 1
            if cnt > 200:
                break  # cap instances per run
            input_ids1 = entry.get_tokens("input_ids")
            prob1 = entry.get_vector("prob1")
            prob2 = entry.get_vector("prob2")
            cells = viewer.cells_from_tokens(input_ids1)
            p1_l = []
            p2_l = []
            useful_l = []
            row1 = []
            row2 = []
            row3 = []
            row4 = []
            for j, cell in enumerate(cells):
                p1 = float(prob1[j])
                p2 = float(prob2[j])
                x.append([p1])
                y.append(p2)
                u = useful(p1, p2)
                score = (1 - u) * 100  # less useful -> stronger highlight
                cell.highlight_score = score
                row1.append(cell)
                row2.append(Cell(p1, score))
                row3.append(Cell(p2, score))
                row4.append(Cell(u, score))
                p1_l.append(p1)
                p2_l.append(p2)
                useful_l.append(u)
                if len(row1) > 20:
                    rows = [row1, row2, row3, row4]
                    row1 = []
                    row2 = []
                    row3 = []
                    row4 = []
                    html.write_table(rows)
            html.write_paragraph("p1: {}".format(average(p1_l)))
            html.write_paragraph("p2: {}".format(average(p2_l)))
            html.write_paragraph("useful: {}".format(average(useful_l)))
            if average(useful_l) < 0.4:
                html.write_headline("Low Score")
    # Subsample to at most 1000 (p1, p2) points before fitting.
    l = list(zip(x, y))
    random.shuffle(l)
    l = l[:1000]
    x, y = zip(*l)
    lin = LinearRegression()
    lin.fit(x, y)
    poly = PolynomialFeatures(degree=4)
    X_poly = poly.fit_transform(x)
    poly.fit(X_poly, y)
    lin2 = LinearRegression()
    lin2.fit(X_poly, y)
    plt.scatter(x, y, color='blue')
    plt.plot(x, lin2.predict(poly.fit_transform(x)), color='red')
    plt.title('Polynomial Regression')
    plt.show()
def doit(filename):
    """Score tokens by how anomalous prob2 is given prob1's bin and render the result.

    Uses statistics_tlm() to obtain per-bin mean/std of prob2 conditioned on prob1.
    """
    name = filename.split(".")[0]
    bin_fn, mean_d, std_d = statistics_tlm()
    def get_score(p1, p2):
        # z-score of min(p1, p2) within p1's probability bin (scalar reference impl).
        key = bin_fn(p1)
        v = min(p2, p1)
        return (v - mean_d[key]) / std_d[key]
    st_list = []
    ed_list = []
    std_list = []
    mean_list = []
    for key in mean_d:
        st, ed = key
        st_list.append(st)
        ed_list.append(ed)
        std_list.append(std_d[key])
        mean_list.append(mean_d[key])
    # Shape [1, num_bins] so they broadcast against [num_tokens, 1] below.
    mean_list = np.expand_dims(np.array(mean_list), 0)
    std_list = np.expand_dims(np.array(std_list), 0)
    st_list = np.expand_dims(np.array(st_list), 0)
    ed_list = np.expand_dims(np.array(ed_list), 0)
    def get_scores_lin(prob1_list, prob2_list):
        # Vectorized equivalent of get_score over all tokens at once.
        v2 = np.min(np.stack([prob1_list, prob2_list], axis=1), axis=1)
        v2 = np.expand_dims(v2, 1)
        all_scores = (v2 - mean_list) / std_list
        prob1_list = np.expand_dims(prob1_list, 1)
        # Mask selecting, per token, the single bin with st <= prob1 < ed.
        f1 = np.less_equal(st_list, prob1_list)
        f2 = np.less(prob1_list, ed_list)
        f = np.logical_and(f1, f2)
        all_scores = all_scores * f
        scores = np.sum(all_scores, axis=1)
        return scores
    data = EstimatorPredictionViewerGosford(filename)
    amp = 0.5
    html_writer = HtmlVisualizer("{}_{}.html".format(name, amp), dark_mode=False)
    for inst_i, entry in enumerate(data):
        if inst_i > 10:
            break  # cap output size
        tokens = entry.get_mask_resolved_input_mask_with_input()
        scores = entry.get_vector("priority_score")
        loss1 = entry.get_vector("lm_loss1")
        loss2 = entry.get_vector("lm_loss2")
        #scores1 = get_scores_lin(loss_to_prob(loss1), loss_to_prob(loss2))
        #scores = [get_score(v1, v2) for v1,v2 in zip(loss_to_prob(loss1), loss_to_prob(loss2))]
        #assert np.all(np.less(np.abs(scores - scores1), 0.01))
        prob_scores = probabilty(scores, amp)
        prob_strs = ["{:06.6f}".format(v*1000) for v in prob_scores]
        def normalize(prob):
            # 0-> Good
            # -1 -> Bad
            return min(prob * 10000, 100)
        norm_scores = lmap(normalize, prob_scores)
        cells = data.cells_from_tokens(tokens, norm_scores)
        cells2 = data.cells_from_anything(scores, norm_scores)
        cells3 = data.cells_from_anything(prob_strs, norm_scores)
        cells4 = data.cells_from_anything(loss_to_prob(loss1), norm_scores)
        cells5 = data.cells_from_anything(loss_to_prob(loss2), norm_scores)
        html_writer.multirow_print_from_cells_list([cells, cells2, cells3, cells4, cells5])
        html_writer.write_headline("")