Example #1
def dev():
    train_data_feeder = load_cache("train_data_feeder")
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    html_writer = HtmlVisualizer("nli_w_dict.html", dark_mode=False)

    for _ in range(100):
        batch = train_data_feeder.get_random_batch(1)

        input_ids, input_mask, segment_ids, d_input_ids, d_input_mask, d_location_ids, y = batch

        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

        for i in range(len(tokens)):
            if i != 0 and i in d_location_ids:
                tokens[i] = "<b>{}</b>".format(tokens[i])
            if tokens[i] == "[unused3]":
                tokens[i] = "[SEP]\n"

        s = tokenizer_wo_tf.pretty_tokens(tokens)
        html_writer.write_headline("Input")
        html_writer.write_paragraph(s)

        d_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[0])
        for i in range(len(d_tokens)):
            if d_tokens[i] == "[unused5]":
                d_tokens[i] = "<br>\n"

        s = tokenizer_wo_tf.pretty_tokens(d_tokens)
        html_writer.write_headline("Dict def")
        html_writer.write_paragraph(s)

    html_writer.close()
Example #2
def print_paragraph_feature(pf_list: List[ParagraphFeature],
                            out_path: FilePath):
    html = HtmlVisualizer(out_path)
    for pf in pf_list:
        html.write_paragraph("Text 1: " + pf.datapoint.text1)
        html.write_paragraph("Text 2: " + pf.datapoint.text2)
        for f in pf.feature:
            s = " ".join(f.paragraph.tokens)
            html.write_paragraph(s)

    html.close()
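
print_paragraph_feature only reads a handful of attributes from its inputs. As a rough illustration of the expected data shape (not the project's real ParagraphFeature / FilePath types), hypothetical namedtuple stand-ins would look like this:

from collections import namedtuple

# Hypothetical stand-ins mirroring only the attributes the function reads:
# pf.datapoint.text1 / pf.datapoint.text2 and pf.feature[i].paragraph.tokens.
Datapoint = namedtuple("Datapoint", ["text1", "text2"])
Paragraph = namedtuple("Paragraph", ["tokens"])
Feature = namedtuple("Feature", ["paragraph"])
ParagraphFeatureStub = namedtuple("ParagraphFeatureStub", ["datapoint", "feature"])

pf = ParagraphFeatureStub(
    datapoint=Datapoint(text1="first text", text2="second text"),
    feature=[Feature(paragraph=Paragraph(tokens=["a", "retrieved", "paragraph"]))],
)
# Passing [pf] and an output path to print_paragraph_feature would then write
# one paragraph block per feature into the HTML file.
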
Example #3
def main():
    # load queries and candidates (from qrels? from BM25?)

    # write html
    #   1. Query
    #   2. Doc ID
    #   3. Snippet with most keyword match (BM25 score)
    #   4. scrollable component
    ranked_list_path = os.path.join(output_path, "ranked_list",
                                    "robust_V_10K_10000.txt")
    bert_ranked_list = load_ranked_list_grouped(ranked_list_path)

    queries: Dict[str, str] = load_robust04_desc2()
    qck_queries = to_qck_queries(queries)
    qrels = load_robust04_qrels()

    candidates_d = load_candidate_d()
    # save_to_pickle(candidates_d, "candidate_viewer_candidate_d")
    # candidates_d = load_from_pickle("candidate_viewer_candidate_d")
    style = [get_collapsible_css(), get_scroll_css()]
    #
    html = HtmlVisualizer(
        "robust_V_predictions.html",
        additional_styles=style,
    )

    def is_perfect(judgement, ranked_list):
        label_list = get_labels(judgement, ranked_list)

        all_relevant = True
        for l in label_list:
            if not l:
                all_relevant = False
            if l:
                if not all_relevant:
                    return False
        return True

    def get_labels(judgement, ranked_list):
        label_list = []
        for e in ranked_list:
            doc_id = e.doc_id
            if doc_id in judgement:
                label = judgement[doc_id]
            else:
                label = 0
            label_list.append(label)
        return label_list

    def p_at_k(judgement, ranked_list, k=10):
        label_list = get_labels(judgement, ranked_list)
        num_correct = sum([1 if label else 0 for label in label_list[:k]])
        return num_correct / k

    for qid in bert_ranked_list:
        if qid in candidates_d:
            if qid not in qrels:
                continue
            judgement = qrels[qid]
            q_text = queries[qid]
            ranked_list = bert_ranked_list[qid]
            if is_perfect(judgement, ranked_list):
                continue

            html.write_div_open()
            text = "{0}: {1} ({2:.2f})".format(qid, q_text,
                                               p_at_k(judgement, ranked_list))
            html.write_elem(
                "button",
                text,
                "collapsible",
            )
            html.write_div_open("content")
            doc_text_d = dict(candidates_d[qid])

            for e in ranked_list:
                #tokens = doc_tokens[e.doc_id]
                doc_id = e.doc_id
                if doc_id in judgement:
                    label = judgement[doc_id]
                else:
                    label = 0

                style = "font-size: 13px; padding: 8px;"
                if label:
                    style += " background-color: DarkGreen"
                else:
                    style += " background-color: DarkRed"
                text = "{0}] {1} ({2:.2f})".format(e.rank, doc_id, e.score)
                html.write_elem("p", text, "collapsible", style)
                #text = pretty_tokens(tokens, True)
                doc_text = doc_text_d[doc_id]
                html.write_div(doc_text, "c_content")
            html.write_div_close()
            html.write_div_close()
    html.write_script(get_collapsible_script())
    html.close()
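
For reference, the nested helpers above reduce to precision@k plus a check that every relevant document is ranked above every non-relevant one. A minimal, self-contained sketch with made-up judgement data (plain dicts and doc-id lists instead of the project's qrels and ranked-list entries):

# Minimal sketch of the get_labels / p_at_k / is_perfect logic above,
# using hypothetical data rather than the project's ranked-list objects.

def get_labels(judgement, doc_ids):
    # 0/1 relevance label per ranked doc id; unjudged docs count as 0.
    return [judgement.get(doc_id, 0) for doc_id in doc_ids]

def p_at_k(judgement, doc_ids, k=10):
    labels = get_labels(judgement, doc_ids)
    return sum(1 for label in labels[:k] if label) / k

def is_perfect(judgement, doc_ids):
    # True iff no relevant doc is ranked below a non-relevant one.
    seen_non_relevant = False
    for label in get_labels(judgement, doc_ids):
        if not label:
            seen_non_relevant = True
        elif seen_non_relevant:
            return False
    return True

judgement = {"D1": 1, "D3": 1}          # doc_id -> relevance
ranking = ["D1", "D2", "D3", "D4"]      # doc ids in ranked order
print(p_at_k(judgement, ranking, k=4))  # 0.5
print(is_perfect(judgement, ranking))   # False: D3 is ranked below D2
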
Example #4
def analyze_gradient(data, tokenizer):
    gradients = data['gradients']
    d_input_ids = data['d_input_ids']
    mask_input_ids = data['masked_input_ids']
    masked_lm_positions = data["masked_lm_positions"]

    n_inst, seq_len = mask_input_ids.shape
    n_inst2, def_len = d_input_ids.shape

    assert n_inst == n_inst2

    def_len = 256  # overrides the shape-derived def_len with a fixed definition length
    hidden_dim = 768
    reshaped_grad = reshape_gradienet(gradients, n_inst, def_len, hidden_dim)
    print(reshaped_grad.shape)

    n_pred = reshaped_grad.shape[1]

    grad_per_token = np.sum(np.abs(reshaped_grad), axis=3)

    html_writer = HtmlVisualizer("dict_grad.html", dark_mode=False)

    for inst_idx in range(n_inst):
        tokens = tokenizer.convert_ids_to_tokens(mask_input_ids[inst_idx])
        #ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])
        for i in range(len(tokens)):
            if tokens[i] == "[MASK]":
                tokens[i] = "[MASK_{}]".format(i)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"
        def_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[inst_idx])
        s = tokenizer_wo_tf.pretty_tokens(tokens)

        lines = []

        grad_total_max = 0
        for pred_idx in range(n_pred):
            row = []
            max_val = max(grad_per_token[inst_idx, pred_idx])
            total = sum(grad_per_token[inst_idx, pred_idx])
            mask_pos = masked_lm_positions[inst_idx, pred_idx]

            if total > grad_total_max:
                grad_total_max = total

            row.append(Cell(mask_pos))
            row.append(Cell(int(total)))

            for def_idx in range(def_len):
                term = def_tokens[def_idx]
                cont_right = def_idx + 1 < def_len and def_tokens[def_idx + 1][:2] == "##"
                cont_left = term[:2] == "##"

                space_left = "&nbsp;" if not cont_left else ""
                space_right = "&nbsp;" if not cont_right else ""

                if term == "[PAD]":
                    break
                if term == "[unused5]":
                    term = "[\\n]"

                score = grad_per_token[inst_idx, pred_idx,
                                       def_idx] / (hidden_dim * 2)
                bg_color = get_color(score)

                row.append(Cell(term, score, not cont_left, not cont_right))
                print("{}({})".format(
                    term, grad_per_token[inst_idx, pred_idx, def_idx]),
                      end=" ")

            lines.append((mask_pos, row))
            print("")
        lines.sort(key=lambda x: x[0])

        s = s.replace("[unused4]", "<b>DictTerm</b>")
        html_writer.write_paragraph(s)

        if grad_total_max > 5000000:
            html_writer.write_headline("HIGH Gradient")

        rows = right(lines)
        html_writer.write_table(rows)

        print("----------")
    html_writer.close()
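
The core of analyze_gradient is collapsing the hidden dimension into a single magnitude per definition token. A small numpy sketch with synthetic shapes (reshape_gradienet, Cell, and HtmlVisualizer are project-specific and left out):

import numpy as np

# Synthetic gradient tensor: (instances, predictions, definition tokens, hidden dim).
n_inst, n_pred, def_len, hidden_dim = 2, 3, 8, 16
grad = np.random.randn(n_inst, n_pred, def_len, hidden_dim)

# One magnitude per definition token, as in grad_per_token above.
grad_per_token = np.sum(np.abs(grad), axis=3)   # shape (n_inst, n_pred, def_len)

# Rough per-cell score, mirroring the hidden_dim * 2 normalization above.
score = grad_per_token / (hidden_dim * 2)
print(grad_per_token.shape, score.max())
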
Example #5
def load_and_visualize():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"

    n_list = open(os.path.join(output_path, "lookup_n", data_id),
                  "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = feature_itr.__next__()

            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)

            emph_word = "<b>" + word + "</b>"

            if j == 0:
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, token in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = token

                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            for idx in range(len(tokens)):
                if tokens[idx] == "[MASK]":
                    tokens[idx] = "[MASK_{}: {}]".format(idx, mask_ans[idx])
                if idx in d_location_ids and idx != 0:
                    if tokens[idx - 1] != emph_word:
                        tokens[idx] = emph_word
                    else:
                        tokens[idx] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = list()
            row.append(Cell(word))
            row.append(Cell(data[feature_idx]))
            row.append(Cell(def_str))
            rows.append(row)

            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)

        html_writer.write_table(rows)

    html_writer.close()
Example #6
def diff_view():
    tokenizer = get_tokenizer()
    filename = "bert_815.pickle"
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))
    filename = "bfn_3_200_815.pickle"
    p = os.path.join(output_path, filename)
    data2 = pickle.load(open(p, "rb"))

    run_name = "diff"

    batch_size, seq_length = data[0]['masked_input_ids'].shape
    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []

    masked_lm_positions = []
    masked_lm_ids = []
    for e in data[:-1]:
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])
        masked_lm_ids.append(e["masked_lm_ids"])

    masked_lm_example_loss2 = []
    for e in data2[:-1]:
        masked_lm_example_loss2.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))

    masked_lm_example_loss2 = np.concatenate(masked_lm_example_loss2)

    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)
    masked_lm_ids = np.concatenate(masked_lm_ids)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)
    n_instance = len(input_ids)
    for inst_idx in range(n_instance):

        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])

        ans_keys = dict(
            zip(masked_lm_positions[inst_idx],
                tokenizer.convert_ids_to_tokens(masked_lm_ids[inst_idx])))

        loss_at_loc = {
            p: l
            for l, p in zip(masked_lm_example_loss[inst_idx],
                            masked_lm_positions[inst_idx])
        }
        loss_at_loc2 = {
            p: l
            for l, p in zip(masked_lm_example_loss2[inst_idx],
                            masked_lm_positions[inst_idx])
        }

        score_at_loc = {k: math.exp(-v) for k, v in loss_at_loc.items()}
        score_at_loc2 = {k: math.exp(-v) for k, v in loss_at_loc2.items()}

        def is_dependent(token):
            return len(token) == 1 and not token[0].isalnum()

        cells = []
        for i in range(len(tokens)):
            f_inverse = False
            score = 0
            if tokens[i] == "[MASK]" or i in loss_at_loc:
                tokens[i] = "[{}-{}]".format(i, ans_keys[i])
                score = (score_at_loc2[i] - score_at_loc[i]) * 180
                score = -score
                if score < 0:
                    f_inverse = True
                    score = abs(score)
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"

            if tokens[i] != "[PAD]":
                term = tokens[i]
                cont_left = term[:2] == "##"
                cont_right = i + 1 < len(tokens) and tokens[i + 1][:2] == "##"
                if i + 1 < len(tokens):
                    dependent_right = is_dependent(tokens[i + 1])
                else:
                    dependent_right = False

                dependent_left = is_dependent(tokens[i])

                if cont_left:
                    term = term[2:]

                space_left = "&nbsp;" if not (cont_left
                                              or dependent_left) else ""
                space_right = "&nbsp;" if not (cont_right
                                               or dependent_right) else ""

                if not f_inverse:
                    cells.append(Cell(term, score, space_left, space_right))
                else:
                    cells.append(
                        Cell(term,
                             score,
                             space_left,
                             space_right,
                             target_color="R"))
        #s = tokenization.pretty_tokens(tokens)

        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:
            html_writer.write_table([row])

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))

        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            loss1 = score_at_loc[pos]
            loss2 = score_at_loc2[pos]
            loss_diff = loss1 - loss2
            rows.append((Cell(pos), Cell(loss1), Cell(loss2), Cell(loss_diff)))

        html_writer.write_table(rows)

    html_writer.close()
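
diff_view compares two runs by turning each masked-LM loss into a probability-like score with exp(-loss) and coloring cells by the scaled difference. A self-contained sketch of that conversion with made-up loss values (the factor 180 is taken from the code above):

import math

# Masked-LM losses at the same positions from two runs (hypothetical values).
loss_run1 = {12: 2.3, 47: 0.4}
loss_run2 = {12: 1.1, 47: 0.9}

score1 = {pos: math.exp(-loss) for pos, loss in loss_run1.items()}
score2 = {pos: math.exp(-loss) for pos, loss in loss_run2.items()}

for pos in sorted(score1):
    # Positive diff: run 2 assigns higher probability than run 1 at this position.
    diff = (score2[pos] - score1[pos]) * 180   # same scaling as the cell score above
    print(pos, round(score1[pos], 3), round(score2[pos], 3), round(diff, 1))
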
Example #7
def work():
    tokenizer = get_tokenizer()
    filename = "bert_815.pickle"
    filename = "bfn_3_200_815.pickle"
    run_name = filename[:-(len(".pickle"))]
    p = os.path.join(output_path, filename)
    data = pickle.load(open(p, "rb"))

    batch_size, seq_length = data[0]['masked_input_ids'].shape
    masked_input_ids = []
    input_ids = []
    masked_lm_example_loss = []
    masked_lm_positions = []
    for e in data[:-1]:
        masked_input_ids.append(e["masked_input_ids"])
        input_ids.append(e["input_ids"])
        masked_lm_example_loss.append(
            np.reshape(e["masked_lm_example_loss"], [batch_size, -1]))
        masked_lm_positions.append(e["masked_lm_positions"])

    input_ids = np.concatenate(input_ids)
    masked_input_ids = np.concatenate(masked_input_ids)
    masked_lm_example_loss = np.concatenate(masked_lm_example_loss)
    masked_lm_positions = np.concatenate(masked_lm_positions)

    html_writer = HtmlVisualizer(run_name + ".html", dark_mode=False)
    n_instance = len(input_ids)
    for inst_idx in range(200):

        tokens = tokenizer.convert_ids_to_tokens(masked_input_ids[inst_idx])
        ans_tokens = tokenizer.convert_ids_to_tokens(input_ids[inst_idx])

        loss_at_loc = {
            p: l
            for l, p in zip(masked_lm_example_loss[inst_idx],
                            masked_lm_positions[inst_idx])
        }

        cells = []
        for i in range(len(tokens)):
            score = 0
            if tokens[i] == "[MASK]":
                tokens[i] = "[{}]".format(ans_tokens[i])
                score = loss_at_loc[i] * 255 / 25
            if tokens[i] == "[SEP]":
                tokens[i] = "[SEP]<br>"

            if tokens[i] != "[PAD]":
                cells.append(Cell(tokens[i], score))
        #s = tokenization.pretty_tokens(tokens)

        row = []
        for cell in cells:
            row.append(cell)
            if len(row) == 20:
                html_writer.write_table([row])
                row = []
        if row:
            html_writer.write_table([row])

        loss_infos = []
        for loss, pos in zip(masked_lm_example_loss[inst_idx],
                             masked_lm_positions[inst_idx]):
            loss_infos.append((loss, pos))

        loss_infos.sort(key=lambda x: x[1])

        rows = []
        for loss, pos in loss_infos:
            rows.append((Cell(pos), Cell(loss)))

        html_writer.write_table(rows)

    html_writer.close()
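
In work(), the highlight intensity is simply the loss rescaled into a 0-255 color range (loss * 255 / 25). A one-function sketch; the clamp to [0, 255] is an added assumption, since the original scaling can exceed 255 for large losses:

def loss_to_intensity(loss, max_loss=25.0):
    # Map a masked-LM loss to a 0-255 highlight intensity, as in
    # `score = loss_at_loc[i] * 255 / 25` above; the clamp is an added assumption.
    return min(255.0, max(0.0, loss * 255.0 / max_loss))

print(loss_to_intensity(2.5))    # 25.5
print(loss_to_intensity(100.0))  # 255.0 (clamped)
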
Example #8
def main(config):
    # load queries and candidates (from qrels? from BM25?)

    # write html
    #   1. Query
    #   2. Doc ID
    #   3. Snippet with most keyword match (BM25 score)
    #   4. scrollable component

    score_d = load_qk_score_as_dict(config)
    # qk_candidate: List[QKUnit] = load_from_pickle("robust_on_clueweb_qk_candidate")
    qk_candidate: List[QKUnit] = load_from_pickle(
        "robust_on_clueweb_qk_candidate_filtered")
    # qk_candidate: List[QKUnit] = load_from_pickle("robust_on_wiki_qk_candidate")

    # candidates_d = load_candidate_d()
    # save_to_pickle(candidates_d, "candidate_viewer_candidate_d")
    style = [get_collapsible_css(), get_scroll_css()]
    #
    html = HtmlVisualizer(
        "robust_k_docs_filtered.html",
        additional_styles=style,
    )

    for query, k_list in qk_candidate:
        qid = query.query_id
        q_text = query.text
        if not k_list:
            continue

        c = Counter()
        for k in k_list:
            kdp_id = "{}-{}".format(k.doc_id, k.passage_idx)
            score = score_d[qid, kdp_id]
            label = 1 if score > 0.5 else 0
            c[label] += 1

        pos_rate = (c[1] / (c[1] + c[0]))

        html.write_div_open()
        html.write_elem(
            "button",
            "{0}: {1} ({2:.2f})".format(qid, q_text, pos_rate),
            "collapsible",
        )
        html.write_div_open("content")
        for k in k_list:
            kdp_id = "{}-{}".format(k.doc_id, k.passage_idx)
            score = score_d[qid, kdp_id]
            label = score > 0.5
            text = " ".join(k.tokens)
            style = "font-size: 13px; padding: 8px;"
            if label:
                style += " background-color: DarkGreen"
            else:
                style += " background-color: DarkRed"
            html.write_elem("p", "{0} : {1:.2f}".format(kdp_id, score),
                            "collapsible", style)
            html.write_div(text, "c_content")
        html.write_div_close()
        html.write_div_close()
    html.write_script(get_collapsible_script())
    html.close()
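
get_collapsible_css, get_collapsible_script, and the HtmlVisualizer div helpers are project-specific, but the layout they produce is a per-query expandable block of scored passages. A rough stand-in using plain <details>/<summary> HTML (no JavaScript needed), with made-up query and passage data:

from html import escape

def render_query_block(qid, q_text, passages):
    # passages: list of (kdp_id, score, passage_text) tuples (hypothetical shape).
    parts = ["<details><summary>{}: {}</summary>".format(escape(qid), escape(q_text))]
    for kdp_id, score, text in passages:
        color = "DarkGreen" if score > 0.5 else "DarkRed"
        parts.append('<p style="font-size: 13px; padding: 8px; '
                     'background-color: {}">{} : {:.2f}</p>'.format(
                         color, escape(kdp_id), score))
        parts.append("<div>{}</div>".format(escape(text)))
    parts.append("</details>")
    return "\n".join(parts)

with open("collapsible_demo.html", "w") as f:
    f.write(render_query_block("Q1", "example query text",
                               [("doc1-0", 0.81, "a passage predicted relevant"),
                                ("doc2-3", 0.12, "a passage predicted non-relevant")]))
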