Exemple #1
0
def show_tfrecord(file_path):
    """Render every example of a TFRecord file as HTML tables.

    Premise and hypothesis tokens are shown as separate tables, with tokens
    flagged by alt_emb_mask highlighted at full strength.
    """
    record_itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    out_name = os.path.basename(file_path) + ".html"
    html = HtmlVisualizer(out_name)
    for features in record_itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        # Highlight score 100 where the mask is set, 0 elsewhere.
        p_cells = [Cell(tok, 100 if flag else 0)
                   for tok, flag in zip(p_tokens, p_mask)]
        h_cells = [Cell(tok, 100 if flag else 0)
                   for tok, flag in zip(h_tokens, h_mask)]

        label = take(features["label_ids"])[0]

        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])
Exemple #2
0
def show(out_file_name, summarized_table: List[Entry]):
    """Write token-level contribution visualizations to an HTML file.

    Each entry renders its score and a token row colored by per-token
    contribution (red positive, blue negative, gray when absent). Entries
    whose largest absolute contribution is under 0.05 are skipped.
    """
    html = HtmlVisualizer(out_file_name)
    tokenizer = get_tokenizer()
    num_print = 0
    for input_ids, prob, contributions in summarized_table:
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        html.write_paragraph("Score : {}".format(prob))
        cells = []
        max_change = 0
        for idx, token in enumerate(tokens):
            if token == "[PAD]":
                break
            if idx not in contributions:
                cells.append(Cell(token, highlight_score=150, target_color="Gray"))
                continue
            raw_score = contributions[idx]
            max_change = max(abs(raw_score), max_change)
            cells.append(Cell(token,
                              highlight_score=abs(raw_score) * 100,
                              target_color="R" if raw_score > 0 else "B"))

        # Only print entries with a meaningful maximum contribution.
        if not max_change < 0.05:
            html.multirow_print(cells, 30)
            num_print += 1

    print("printed {} of {}".format(num_print, len(summarized_table)))
Exemple #3
0
def print_param():
    """Compare one dense-kernel weight matrix between base BERT and an NLI model.

    Loads the same layer-0 output dense kernel from the base checkpoint and the
    fine-tuned NLI checkpoint, then writes both matrices row-interleaved into
    bert_dense_param.html with a checkerboard shading so that corresponding
    entries line up visually.
    """
    p_base = load_param(get_bert_full_path())
    # Raw string: the original mixed escaped ("\\runs") and invalid escape
    # sequences ("\w", "\C"); this spells the exact same path unambiguously.
    nli_path = r"C:\work\Code\Chair\output\model\runs\nli_model.ckpt-75000_NLI\model-0"
    p_ft = load_param(nli_path)

    key = "bert/encoder/layer_0/output/dense/kernel"
    param1 = p_base[key]
    param2 = p_ft[key]
    html = HtmlVisualizer("bert_dense_param.html")

    n_rows, n_cols = param1.shape

    # s_score alternates 0/100 per row; score alternates per column, giving a
    # checkerboard background.
    s_score = 100
    for i in range(n_rows):
        s_score = 100 - s_score
        score = s_score
        row1 = []
        row2 = []
        for j in range(n_cols):
            score = 100 - score
            row1.append(Cell("{0:.4f}".format(param1[i, j]), score))
            row2.append(Cell("{0:.4f}".format(param2[i, j]), score))
        # Base-model row directly above the fine-tuned row for comparison.
        html.write_table([row1, row2])
def draw2(in_file, out_file):
    """Render tokens, probabilities, and per-example losses for up to 101 entries."""
    data = EstimatorPredictionViewerGosford(os.path.join(output_path, in_file))
    html_writer = HtmlVisualizer(out_file, dark_mode=False)

    tokenizer = get_tokenizer()
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break

        tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")
        real_loss1 = entry.get_vector("per_example_loss1")
        real_loss2 = entry.get_vector("per_example_loss2")

        # Tag masked positions in place so they stand out in the token row.
        for i, loc in enumerate(entry.get_vector("masked_lm_positions")):
            tokens[loc] = "[{}:{}]".format(i, tokens[loc])

        html_writer.multirow_print(data.cells_from_tokens(tokens))

        labeled_rows = [
            [Cell("prob1:")] + data.cells_from_anything(prob1),
            [Cell("prob2:")] + data.cells_from_anything(prob2),
            [Cell("real_loss1:")] + data.cells_from_anything(real_loss1),
            [Cell("real_loss2:")] + data.cells_from_anything(real_loss2),
        ]
        html_writer.multirow_print_from_cells_list(labeled_rows)
def run():
    """Stream contradiction predictions and render highlighted P/H token rows."""
    tokenizer = get_tokenizer()
    reader = StreamPickleReader("contradiction_prediction")

    html = HtmlVisualizer("contradiction_prediction.html")
    n_shown = 0
    while reader.has_next():
        example, prediction = reader.get_item()
        input_ids, _, _ = example
        logit, explain = prediction
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        prem, hypo = split_p_h_with_input_ids(tokens, input_ids)
        prem_score, hypo_score = split_p_h_with_input_ids(explain, input_ids)

        prem_cells = [Cell("P:")] + cells_from_tokens(prem, normalize(prem_score))
        hypo_cells = [Cell("H:")] + cells_from_tokens(hypo, normalize(hypo_score))

        html.write_paragraph(str(logit))
        html.multirow_print(prem_cells)
        html.multirow_print(hypo_cells)

        # Break check happens before the increment, so one item past 100 is
        # still processed — same stopping rule as before.
        if n_shown > 100:
            break
        n_shown += 1
Exemple #6
0
def view_grad_overlap_per_mask():
    """Visualize per-mask gradient-overlap scores with the top-5 predicted terms."""
    filename = "ukp_lm_probs.pickle"

    out_name = filename.split(".")[0] + ".html"
    html_writer = HtmlVisualizer(out_name, dark_mode=False)
    data = EstimatorPredictionViewerGosford(filename)
    tokenizer = data.tokenizer
    for inst_i, entry in enumerate(data):
        tokens = entry.get_mask_resolved_input_mask_with_input()
        highlight = lmap(is_mask, tokens)
        scores = entry.get_vector("overlap_score")
        pos_list = entry.get_vector("masked_lm_positions")
        # One row of log-probs per masked position (20 per instance).
        probs = np.reshape(entry.get_vector("masked_lm_log_probs"), [20, -1])
        rows = []
        for score, position, prob in zip(scores, pos_list, probs):
            # Prefix each masked token with its position for readability.
            tokens[position] = "{}-".format(position) + tokens[position]

            row = [Cell(position), Cell(score)]

            # Top five vocabulary entries by log-probability.
            for idx in np.argsort(prob)[::-1][:5]:
                row.append(Cell(tokenizer.inv_vocab[idx]))
                row.append(Cell(math.exp(prob[idx])))
            rows.append(row)

        cells = data.cells_from_tokens(tokens, highlight)
        for score, position in zip(scores, pos_list):
            cells[position].highlight_score = score / 10000 * 255

        html_writer.multirow_print(cells, 20)
        html_writer.write_table(rows)
Exemple #7
0
def main(config):
    """Write an HTML table linking the first 10 claims to their top ranked docs."""
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = \
        load_galago_ranked_list(config['q_res_path'])
    claim_d = claims_to_dict(get_all_claims())

    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in sorted(ranked_list.keys())[:10]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        # Keep 3x candidates so duplicates can be dropped before the final cut.
        candidate_ids = [e.doc_id for e in entries[:num_doc_per_query * 3]]
        doc_ids: List[str] = remove_duplicate(candidate_ids)
        claim = claim_d[int(query_id)]
        rows.append([Cell("{} : {}".format(query_id, claim))])
        for doc_id in doc_ids[:num_doc_per_query]:
            link = "<a href=\"{}\">{}</a>".format(url_prefix + doc_id, doc_id)
            rows.append([Cell(link)])

    html = HtmlVisualizer("claim_docs_urls.html")
    html.write_table(rows)
def main(config):
    """Write an HTML table linking the first 100 queries to their top ranked docs.

    Reads a galago ranked list and a query-id -> query-text JSON map from
    *config*, then emits one header row per query followed by up to 10
    document links served by the local document viewer.
    """
    q_res_path = config['q_res_path']
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    # Context manager: the original leaked this file handle.
    with open(config['query_text_d']) as f:
        query_text_d = json.load(f)
    save_name = config['save_path']

    keys = list(ranked_list.keys())
    keys.sort()
    num_doc_per_query = 10
    url_prefix = "http://localhost:36559/document?identifier="
    rows = []
    for query_id in keys[:100]:
        entries: List[SimpleRankedListEntry] = ranked_list[query_id]
        # 3x candidates retained for parity with the duplicate-dropping variant.
        entries = entries[:num_doc_per_query * 3]
        doc_ids: List[str] = [e.doc_id for e in entries]
        query_text = query_text_d[query_id]
        rows.append([Cell("{} : {}".format(query_id, query_text))])
        for doc_id in doc_ids[:num_doc_per_query]:
            url = url_prefix + doc_id
            rows.append([Cell("<a href=\"{}\">{}</a>".format(url, doc_id))])

    html = HtmlVisualizer(save_name)
    html.write_table(rows)
Exemple #9
0
 def get_cell(score):
     """Map a signed score (or the literal "-") to a color-highlighted cell."""
     if score == "-":
         return Cell("-")
     # Scale so |score| = 0.01 saturates the 100-point highlight cap.
     norm_score = min(abs(score) * 10000, 100)
     color = "B" if score > 0 else "R"
     return Cell("", highlight_score=norm_score, target_color=color)
Exemple #10
0
def show_simple(run_name, data_id, tex_visulizer):
    """Select up to 20 examples per predicted NLI class and write them to TeX.

    For each pickled explanation entry, the explanation scores for the tag
    matching the prediction are normalized and rendered as colored premise and
    hypothesis rows. Collection stops once every class has 20 examples.
    Returns the per-class lists of (pred_str, [p_row, h_row]) instances.
    """
    num_select = 20
    pickle_name = "save_view_{}_{}".format(run_name, data_id)
    tokenizer = get_tokenizer()

    data_loader = get_modified_data_loader2(HPSENLI3(), NLIExTrainConfig())

    # Removed a no-op self-assignment and unused locals from the original.
    explain_entries = load_from_pickle(pickle_name)

    selected_instances = [[], [], []]  # one bucket per predicted class
    for entry in explain_entries:
        x0, logits, scores = entry

        pred = np.argmax(logits)
        input_ids = x0
        # NOTE(review): input_ids is passed as both arguments here (the other
        # call sites pass tokens first) — confirm this is intentional.
        p, h = data_loader.split_p_h_with_input_ids(input_ids, input_ids)
        p_tokens = restore_capital_letter(tokenizer.convert_ids_to_tokens(p))
        h_tokens = restore_capital_letter(tokenizer.convert_ids_to_tokens(h))

        target_tag = ["match", "mismatch", "conflict"][pred]

        tag_idx = data_generator.NLI.nli_info.tags.index(target_tag)
        tag_name = data_generator.NLI.nli_info.tags[tag_idx]
        p_score, h_score = data_loader.split_p_h_with_input_ids(
            scores[tag_idx], input_ids)
        p_score = normalize(p_score)
        h_score = normalize(h_score)
        p_row = [Cell("\\textbf{P:}")] + cells_from_tokens(p_tokens, p_score)
        h_row = [Cell("\\textbf{H:}")] + cells_from_tokens(h_tokens, h_score)

        pred_str = ["entailment", "neutral" , "contradiction"][pred]
        apply_color(p_row, tag_name)
        apply_color(h_row, tag_name)
        if len(selected_instances[pred]) < num_select:
            selected_instances[pred].append((pred_str, [p_row, h_row]))

        # Stop once every class bucket is full.
        if all(len(s) == num_select for s in selected_instances):
            break

    for insts in selected_instances:
        for pred_str, rows in insts:
            tex_visulizer.write_instance(pred_str, rows)

    return selected_instances
Exemple #11
0
def get_cell_from_token(token, log_odd):
    """Color a token cell by log-odds: blue positive, red negative, plain zero.

    Stopwords are forced to zero so they are never highlighted.
    """
    if token in stopwords:
        log_odd = 0

    if log_odd > 0:
        # Highlight strength grows with magnitude, capped at 150.
        return Cell(token, min(150, log_odd * 50), target_color="B")
    if log_odd < 0:
        return Cell(token, min(150, -log_odd * 50), target_color="R")
    return Cell(token)
Exemple #12
0
def visualize_prediction_data(data_id):
    """Render loss comparisons (no-dictionary baseline vs dictionary defs) as HTML.

    Reads the per-entry sample counts, the pickled loss outputs, and the
    TFRecord features for *data_id*. For each entry, writes the input text and
    a table of (loss, definition) rows; losses at least 10% below the no-dict
    baseline are highlighted.
    """
    tokenizer = get_tokenizer()
    # Context managers: the original leaked both file handles.
    with open(os.path.join(working_path, "entry_prediction_n", data_id),
              "r") as f:
        num_samples_list = f.readlines()
    p = os.path.join(working_path, "entry_loss",
                     "entry{}.pickle".format(data_id))
    with open(p, "rb") as f:
        loss_outputs_list = pickle.load(f)
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    instance_idx = 0
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    # NOTE(review): output is capped at 100 entries regardless of how many
    # lines num_samples_list has — confirm the cap is intentional.
    n = 100
    html = HtmlVisualizer("entry_prediction.html")
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        feature = Feature2Text(next(feature_itr), tokenizer)

        html.write_headline("Input:")
        html.write_paragraph(feature.get_input_as_text(True, True))
        html.write_headline("Word:" + feature.get_selected_word_text())

        if instance_idx + n_sample >= len(loss_outputs):
            break

        # NOTE(review): when n_sample == 1 instance_idx is not advanced —
        # preserved as-is; verify the index cannot drift.
        if n_sample == 1:
            continue

        rows = []
        no_dict_loss = loss_outputs[instance_idx]
        rows.append([Cell(no_dict_loss, 0), Cell("")])
        instance_idx += 1
        for _ in range(1, n_sample):
            feature = Feature2Text(next(feature_itr), tokenizer)
            loss = loss_outputs[instance_idx]
            # Highlight losses at least 10% better than the baseline.
            hl_score = 100 if loss < no_dict_loss * 0.9 else 0
            rows.append([Cell(loss, hl_score), Cell(feature.get_def_as_text())])
            instance_idx += 1

        html.write_table(rows)
Exemple #13
0
 def get_cell_from_token2(token, probs):
     """Highlight a token by probability mass; stopwords get no highlight."""
     if token.lower() in stopwords:
         probs = 0
     # Scale into percentage space and cap at 100.
     scaled = probs * 1e5
     return Cell(token, min(100, scaled))
Exemple #14
0
def cells_from_tokens(tokens, scores=None, stop_at_pad=True):
    """Build display cells from wordpiece tokens, merging "##" continuations.

    Word-separating "&nbsp;" is suppressed on the joined side of continuation
    pieces and next to dependent punctuation. Stops at the first "[PAD]"
    unless *stop_at_pad* is False. *scores* supplies per-token highlights.
    """
    cells = []
    for i, token in enumerate(tokens):
        if stop_at_pad and token == "[PAD]":
            break
        has_next = i + 1 < len(tokens)
        cont_left = token.startswith("##")
        cont_right = has_next and tokens[i + 1].startswith("##")
        dependent_left = is_dependent(token)
        dependent_right = has_next and is_dependent(tokens[i + 1])

        # Strip the continuation marker for display.
        term = token[2:] if cont_left else token

        space_left = "" if (cont_left or dependent_left) else "&nbsp;"
        space_right = "" if (cont_right or dependent_right) else "&nbsp;"

        score = scores[i] if scores is not None else 0
        cells.append(Cell(term, score, space_left, space_right))
    return cells
Exemple #15
0
 def get_table(get_pair_score_at):
     """Build a (hypothesis x premise) score table with deletion-score tooltips."""
     head = [Cell("")] + [Cell(t) for t in p_tokens]
     rows = [head]
     for h_idx in range(h_idx_max + 1):
         row = [Cell(h_tokens[h_idx])]
         for p_idx in range(p_idx_max + 1):
             pair = (p_idx, h_idx)
             s = get_pair_score_at(pair)
             # Tooltip: scores for deleting only p vs deleting both p and h.
             one_del_score = group_summary.score_d[(p_idx, -1)]
             two_del_score = group_summary.score_d[pair]
             tooltip_str = "{} -> {}".format(
                 float_arr_to_str_arr(one_del_score),
                 float_arr_to_str_arr(two_del_score))
             row.append(get_tooltip_cell(two_digit_float(s), tooltip_str))
         rows.append(row)
     return rows
Exemple #16
0
def main():
    """Visualize per-claim confidence lists as highlighted HTML rows."""
    save_name = sys.argv[1]
    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    cid_and_confidences = get_confidence_list_per_cid(
        os.path.join(out_dir, "cppnc_triple_all_dev_info"),
        os.path.join(out_dir, save_name + ".score"))

    rows = []
    for cid, confidence_list in cid_and_confidences.items():
        # First cell names the claim; the rest encode confidence as highlight.
        row = [Cell(str(cid))]
        row.extend(Cell("", highlight_score=c * 100) for c in confidence_list)
        rows.append(row)

    html = HtmlVisualizer("confidence.html")
    html.write_table(rows)
Exemple #17
0
def main():
    """Visualize token-level mismatch explanations and per-layer probabilities.

    Loads pickled model outputs for "alamri_mismatch_all" and, for the first
    100 examples, writes: the prediction summary, the gold label, a token row
    highlighted by ex_scores, the ex_prob row (scores > 0.5 highlighted), and
    one sigmoid-probability row per encoder layer.
    """
    save_name = "alamri_mismatch_all"
    output_d = load_from_pickle(save_name)
    html = HtmlVisualizer("alamri_mismatch.html")
    tokenizer = get_tokenizer()
    logits_grouped_by_layer = output_d["per_layer_logits"]
    num_layers = 12

    def float_arr_to_cell(head, float_arr):
        # Label cell followed by two-decimal formatted value cells.
        return [Cell(head)] + lmap(Cell, map(two_digit_float, float_arr))

    # NOTE(review): defined but never called below.
    def float_arr_to_cell2(head, float_arr):
        return [Cell(head)] + lmap(Cell, map("{0:.4f}".format, float_arr))

    num_data = len(output_d['input_ids'])
    for data_idx in range(num_data)[:100]:

        # Fetch the value of *name* for the current example.
        def get(name):
            return output_d[name][data_idx]

        tokens = tokenizer.convert_ids_to_tokens(get("input_ids"))
        ex_scores = get('ex_scores')
        probs = scipy.special.softmax(get('logits'))

        pred_str = make_prediction_summary_str(probs)

        html.write_paragraph("Prediction: {}".format(pred_str))
        html.write_paragraph("gold label={}".format(get("label")))

        # Token row shaded by explanation score (scaled to 0-100).
        row1 = [Cell("")] + list(
            [Cell(t, int(s * 100)) for t, s in zip(tokens, ex_scores)])
        row2 = float_arr_to_cell("ex_prob", ex_scores)
        # Fully highlight ex_prob cells above 0.5 (offset 1 skips the label cell).
        for i, s in enumerate(ex_scores):
            if s > 0.5:
                row2[i + 1].highlight_score = 100

        rows = [row1, row2]

        # One row per layer: sigmoid of that layer's logits, positive-class column.
        for layer_no in range(num_layers):
            layer_logit = logits_grouped_by_layer[layer_no][data_idx]
            probs = sigmoid(layer_logit)
            row = float_arr_to_cell("layer_{}".format(layer_no), probs[:, 1])
            rows.append(row)

        html.write_table(rows)
Exemple #18
0
        def get_cells(tokens, scores):
            """Cells whose scores are rescaled so the maximum maps to 100."""
            # cap is at least 1, so the division below is always safe.
            cap = max(max(scores), 1)
            factor = 100 / cap

            return [Cell(t, min(s * factor, 100))
                    for t, s in zip(tokens, scores)]
Exemple #19
0
def main():
    """List claim/doc pairs with links to locally rendered document HTML."""
    html = HtmlVisualizer("doc_relevance_and_value.html")
    rows = []
    data_id = 0
    for query, k_list in load_qk():
        claim_id = query.query_id
        claim_text = query.text

        # De-duplicate documents, then keep at most 10 per claim.
        unique_doc_ids = set(k.doc_id for k in k_list)
        for doc_id in list(unique_doc_ids)[:10]:
            url = os.path.join(output_path, "pc_docs_html", doc_id + ".html")
            anchor = "<a href=\"{}\">url</a>".format(url)
            rows.append(
                [Cell(data_id), Cell(claim_id), Cell(claim_text), Cell(anchor)])
        data_id += 1

    html.write_table(rows)
Exemple #20
0
            def format_scores(raw_scores):
                """Render raw scores as 2-decimal cells shaded by normalized value."""
                norm_scores = normalize_fn(raw_scores)

                cells = [
                    Cell("{0:.2f}".format(raw), norm, False, False)
                    for raw, norm in zip(raw_scores, norm_scores)
                ]
                # Tag-specific coloring: green for mismatch, red for conflict.
                if tag_name == "mismatch":
                    set_cells_color(cells, "G")
                elif tag_name == "conflict":
                    set_cells_color(cells, "R")
                return cells
Exemple #21
0
def show_prediction(filename, file_path, correctness_1, correctness_2):
    """Show P/H highlights for examples misclassified by either of two runs."""
    data = EstimatorPredictionViewerGosford(filename)
    feature_itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    html = HtmlVisualizer(os.path.basename(filename) + ".html")
    for idx, entry in enumerate(data):
        features = next(feature_itr)

        input_ids = entry.get_vector("input_ids")
        # Sanity check: prediction entries and TFRecord features stay aligned.
        assert np.all(input_ids == take(features["input_ids"]))
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)

        def masked_cells(toks, mask):
            return [Cell(t, 100 if m else 0) for t, m in zip(toks, mask)]

        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)  # computed as before; not used below

        # Only show examples that at least one run got wrong.
        if not (correctness_1[idx] and correctness_2[idx]):
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([masked_cells(p_tokens, p_mask)])
            html.write_table([masked_cells(h_tokens, h_mask)])
Exemple #22
0
def write_feature_to_html(feature, html, tokenizer):
    """Write one feature as a label headline plus focus-highlighted token row."""
    input_ids = take(feature['input_ids'])
    focus_mask = take(feature['focus_mask'])
    label_ids = take(feature['label_ids'])
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Fully highlight positions flagged by the focus mask.
    row = [Cell(tokens[i], 100 if focus_mask[i] else 0)
           for i in range(len(input_ids))]

    html.write_headline("{}".format(label_ids[0]))
    html.multirow_print(row)
Exemple #23
0
def main():
    """Visualize gold token scores from a prediction file as colored HTML cells.

    Every 10th viewer entry is rendered. Tokens are colored blue/red by the
    sign of the first gold probability, dimmed green when they also appear in
    segment 1, and left unhighlighted when they are query stopwords.
    """
    file_path = sys.argv[1]
    viewer = EstimatorPredictionViewer(file_path)
    html = HtmlVisualizer("toke_score_gold.html")
    stopwords = load_stopwords_for_query()

    # Sample every 10th entry to keep the page small; removed unused locals
    # from the original (name, log_label_ids, logits, seg2).
    skip = 10
    for entry_idx, entry in enumerate(viewer):
        if entry_idx % skip != 0:
            continue
        tokens = entry.get_tokens("input_ids")
        input_ids = entry.get_vector("input_ids")
        label_ids = np.reshape(entry.get_vector("label_ids"), [-1, 2])
        seg1, _ = split_p_h_with_input_ids(tokens, input_ids)

        pad_idx = tokens.index("[PAD]")
        assert pad_idx > 0

        cells = []
        for idx in range(pad_idx):
            token = tokens[idx]
            score = label_ids[idx][0]
            color = "B" if score > 0 else "R"
            highlight_score = min(abs(score) * 10000, 100)
            if token in stopwords:
                highlight_score = 0
            if token in seg1:
                highlight_score = 50
                color = "G"

            cells.append(Cell(token,
                              highlight_score=highlight_score,
                              target_color=color))
        # The second (always-empty) row preserves the two-row table layout.
        html.multirow_print_from_cells_list([cells, []])

        if entry_idx > 10000:
            break
Exemple #24
0
def analyze_hv(hv_tt, hv_lm, tt_grad, tokenizer):
    """Compare hidden vectors between LM and task-tuned runs, token by token.

    Prints a tab-separated token row and per-position diff counts to stdout,
    and writes one highlighted token row per instance to Preserved.html with
    highlight proportional to the diff count over the hidden dimension.

    Assumes hv_* index as [instance, layer, seq] after reshape — TODO confirm.
    """
    batch_size = 16  # NOTE(review): unused below
    seq_len = 200
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(tt_grad, seq_len, hidden_dim, False)

    # reshape returns (hidden vectors, input ids); x_list is the same for both.
    hv_tt, x_list = reshape(hv_tt)
    hv_lm, x_list = reshape(hv_lm)

    assert len(hv_lm) == len(hv_tt)

    html = HtmlVisualizer("Preserved.html")
    for inst_i in range(len(hv_lm)):
        print("\t", end="")
        tokens = tokenizer.convert_ids_to_tokens(x_list[inst_i])
        # Print the token row, one tab-separated token per sequence position.
        for seq_i in range(seq_len):
            token = tokenizer.convert_ids_to_tokens([x_list[inst_i, seq_i]])[0]
            print("{}".format(token), end="\t")
        print()
        scores = []
        for layer_i in range(13):
            # Only layer 1 is compared; every other layer is skipped.
            if layer_i != 1:
                continue
            layer_no = layer_i
            # With the filter above layer_no is always 1, so the "Embedding"
            # branch below is currently unreachable.
            if layer_no >= 1:
                print("Layer {} :".format(layer_no), end="\t")
            else:
                print("Embedding:", end="\t")
            for seq_i in range(seq_len):
                # n_diff_1/n_diff_2: per-position diff counts between LM and
                # task-tuned hidden states (see diff_and_grad).
                n_diff_1, n_diff_2 = diff_and_grad(
                    hv_lm[inst_i, layer_i, seq_i], hv_tt[inst_i, layer_i,
                                                         seq_i],
                    reshaped_grad[inst_i, layer_i, seq_i])
                scores.append(n_diff_1)
                print("{}({})".format(n_diff_1, n_diff_2), end="\t")
            print("\n")

        # Highlight strength: fraction of differing hidden units, as a percent.
        row = []
        for t, s in zip(tokens, scores):
            score = s / hidden_dim * 100
            row.append(Cell(t, score))
        html.write_table([row])
        print("-----------------")
Exemple #25
0
def per_doc_score():
    """Visualize per-token preserved-layer counts for up to 101 documents.

    For each entry, tokens are highlighted by their layer_count value and
    written in 20-column chunks, each followed by a row of the raw counts.
    """
    filename = "fetch_hidden_dim.pickle"
    html_writer = HtmlVisualizer("preserved.html", dark_mode=False)

    p = os.path.join(output_path, filename)
    # Context manager: the original leaked this file handle.
    # NOTE(review): raw_data is never used below — kept only to preserve the
    # original's file read; consider deleting.
    with open(p, "rb") as f:
        raw_data = pickle.load(f)

    n_skip = 0
    data = EstimatorPredictionViewerGosford(filename)
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break
        count_preserved = entry.get_vector("layer_count")
        tokens = entry.get_tokens("input_ids")
        cells = data.cells_from_tokens(tokens)
        avg = np.average(count_preserved)
        row = []
        row2 = []
        # Thresholding on avg (> 20) was disabled in the original; everything
        # is printed.
        f_print = True
        print(avg)
        if f_print:
            html_writer.write_paragraph("Skipped {} articles".format(n_skip))
            n_skip = 0
            for idx, cell in enumerate(cells):
                # NOTE(review): 728 looks like it should be the hidden dim
                # (768?) — confirm before changing.
                score = count_preserved[idx] / 728 * 100
                cell.highlight_score = score
                row.append(cell)
                row2.append(Cell(count_preserved[idx], score))
                # Flush tables in chunks of 20 tokens.
                if len(row) == 20:
                    html_writer.write_table([row, row2])
                    row = []
                    row2 = []

            html_writer.write_paragraph(str(avg))
        else:
            n_skip += 1
Exemple #26
0
def show(filename):
    """Render token-level predictions vs gold labels as a three-row HTML table.

    For masked positions, prediction and gold cells are colored by sign
    agreement: blue when model and gold signs match, green when both are
    positive, red when they disagree. Unmasked positions get empty cells.
    """
    data = EstimatorPredictionViewerGosford(filename)
    html_writer = HtmlVisualizer("token_scoring.html", dark_mode=False)

    # Removed the unused `correctness` list and a redundant token_cell
    # reassignment from the original.
    for entry in data:
        tokens = entry.get_tokens("input_ids")
        logits = entry.get_vector("logits")
        masks = entry.get_vector("label_masks")
        ids = entry.get_vector("labels")

        token_row = []
        pred_row = []
        gold_row = []

        for idx, token in enumerate(tokens):
            if token == "[PAD]":
                break
            token_cell = Cell(token)
            model_score = logits[idx][0]
            if masks[idx]:
                # Correct when the model's score and the gold id agree in sign.
                correct = (model_score > 0 and ids[idx] > 0) \
                    or (model_score < 0 and ids[idx] < 0)
                color = "B" if correct else "R"
                if correct and model_score > 0 and ids[idx] > 0:
                    color = "G"
                pred_cell = Cell("{0:.2f}".format(model_score), 100,
                                 target_color=color)
                gold_cell = Cell("{0:.2f}".format(ids[idx]), 100,
                                 target_color=color)
            else:
                pred_cell = Cell("")
                gold_cell = Cell("")

            token_row.append(token_cell)
            pred_row.append(pred_cell)
            gold_row.append(gold_cell)

        html_writer.multirow_print_from_cells_list(
            [token_row, pred_row, gold_row], 20)
 def get_cell(token) -> Cell:
     """Fully highlight tokens found (case-insensitively) in pc_tokens_set."""
     in_set = token.lower() in pc_tokens_set
     return Cell(token, 100 if in_set else 0)
Exemple #28
0
def main():
    """Compare windowed relevance scores at two max sequence lengths.

    For robust job index 1, loads two score files (windows of max length 128
    and 112), aligns segments that end at the same position, and renders the
    per-segment score difference (full minus small window) as colored tokens
    in windowed.html.
    """
    n_factor = 16
    step_size = 16
    max_seq_length = 128
    max_seq_length2 = 128 - 16
    batch_size = 8
    info_file_path = at_output_dir("robust", "seg_info")
    queries = load_robust_04_query("desc")
    qid_list = get_robust_qid_list()

    f_handler = get_format_handler("qc")
    info: Dict = load_combine_info_jsons(info_file_path,
                                         f_handler.get_mapping(),
                                         f_handler.drop_kdp())
    print(len(info))
    tokenizer = get_tokenizer()

    # Only job index 1 is processed here.
    for job_idx in [1]:
        qid = qid_list[job_idx]
        query = queries[str(qid)]
        q_term_length = len(tokenizer.tokenize(query))
        data_path1 = os.path.join(output_path, "robust",
                                  "windowed_{}.score".format(job_idx))
        data_path2 = os.path.join(output_path, "robust",
                                  "windowed_small_{}.score".format(job_idx))
        data1 = OutputViewer(data_path1, n_factor, batch_size)
        data2 = OutputViewer(data_path2, n_factor, batch_size)
        # Usable document tokens per window: 3 special tokens + query removed.
        segment_len = max_seq_length - 3 - q_term_length
        segment_len2 = max_seq_length2 - 3 - q_term_length

        outputs = []
        for d1, d2 in zip(data1, data2):
            # for each query, doc pairs
            cur_info1 = info[d1['data_id']]
            cur_info2 = info[d2['data_id']]
            query_doc_id1 = f_handler.get_pair_id(cur_info1)
            query_doc_id2 = f_handler.get_pair_id(cur_info2)

            # Both score files must describe the same (query, doc) pair.
            assert query_doc_id1 == query_doc_id2

            doc = d1['doc']
            probs = get_probs(d1['logits'])
            probs2 = get_probs(d2['logits'])
            n_pred_true = np.count_nonzero(np.less(0.5, probs))
            print(n_pred_true, len(probs))

            # (start, end, score) per sliding window in each configuration.
            seg_scores: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs, segment_len, step_size)
            seg_scores2: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs2, segment_len2, step_size)
            ss_list = []
            for st, ed, score in seg_scores:
                try:
                    # Pair with the small-window segment ending at the same spot.
                    st2, ed2, score2 = find_where(lambda x: x[1] == ed,
                                                  seg_scores2)
                    assert ed == ed2
                    assert st < st2
                    tokens = tokenizer.convert_ids_to_tokens(doc[st:st2])
                    # Positive diff: the extra leading context helped.
                    diff = score - score2
                    ss = ScoredPiece(st, st2, diff, tokens)
                    ss_list.append(ss)
                except StopIteration:
                    # No small-window segment ends here; skip this window.
                    pass
            outputs.append((probs, probs2, query_doc_id1, ss_list))

        html = HtmlVisualizer("windowed.html")

        for probs, probs2, query_doc_id, ss_list in outputs:
            html.write_paragraph(str(query_doc_id))
            html.write_paragraph("Query: " + query)

            ss_list.sort(key=lambda ss: ss.st)
            prev_end = None
            cells = []
            prob_str1 = lmap(two_digit_float, probs)
            # "8.88" appears to be a width/offset placeholder so the two prob
            # rows line up — TODO confirm.
            prob_str1 = ["8.88"] + prob_str1
            prob_str2 = lmap(two_digit_float, probs2)
            html.write_paragraph(" ".join(prob_str1))
            html.write_paragraph(" ".join(prob_str2))

            for ss in ss_list:
                # Sorted pieces should tile the document without gaps.
                if prev_end is not None:
                    assert prev_end == ss.st
                else:
                    print(ss.st)

                score = abs(int(100 * ss.score))
                # NOTE(review): score is an absolute value, so it is never
                # negative and color is "B" unless score == 0 — confirm intent.
                color = "B" if score > 0 else "R"
                cells.extend(
                    [Cell(t, score, target_color=color) for t in ss.tokens])
                prev_end = ss.ed

            html.multirow_print(cells)
Exemple #29
0
 def make_cell(subword: Subword):
     """Fully highlight subwords that appear in highlight_terms."""
     if subword not in highlight_terms:
         return Cell(subword)
     return Cell(subword, highlight_score=100)
def draw(in_file, out_file):

    filename = os.path.join(output_path, in_file)
    data = EstimatorPredictionViewerGosford(filename)
    amp = 10
    html_writer = HtmlVisualizer(out_file, dark_mode=False)

    tokenizer = get_tokenizer()
    for inst_i, entry in enumerate(data):
        if inst_i > 100:
            break

        input_ids = entry.get_vector("input_ids")
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        #tokens = entry.get_tokens("input_ids")
        prob1 = entry.get_vector("prob1")
        prob2 = entry.get_vector("prob2")

        proximity = (1 - (prob1 - prob2))
        difficulty = np.power(1 - prob1, 0.3)
        scores = proximity * difficulty

        prob_scores = probabilty(scores, amp)
        prob_strs = ["{:06.6f}".format(v * 1000) for v in prob_scores]

        def normalize(prob):
            # 0-> Good
            # -1 -> Bad
            return prob * 1000 * 25

        norm_scores = lmap(normalize, prob_scores)

        cells = data.cells_from_tokens(tokens, norm_scores, stop_at_pad=False)
        cells2 = data.cells_from_anything(prob1, norm_scores)
        cells3 = data.cells_from_anything(prob2, norm_scores)
        cells31 = data.cells_from_anything(difficulty, norm_scores)
        cells4 = data.cells_from_anything(scores, norm_scores)
        cells5 = data.cells_from_anything(prob_strs, norm_scores)

        row1 = [Cell("TEXT:")]
        row2 = [Cell("prob1:")]
        row3 = [Cell("prob2:")]
        row31 = [Cell("difficulty:")]
        row4 = [Cell("priority:")]
        row5 = [Cell("Mask Prob")]

        for idx, cell in enumerate(cells):
            row1.append(cell)
            row2.append(cells2[idx])
            row3.append(cells3[idx])
            row31.append(cells31[idx])
            row4.append(cells4[idx])
            row5.append(cells5[idx])

            if len(row1) == 20:
                html_writer.write_table([row1, row2, row3, row31, row4, row5])

                row1 = [Cell("TEXT:")]
                row2 = [Cell("prob1:")]
                row3 = [Cell("prob2:")]
                row31 = [Cell("difficulty:")]
                row4 = [Cell("priority:")]
                row5 = [Cell("Mask Prob")]