def __init__(self, query_type="desc", neg_k=1000):
    # Relevance judgments, queries, and BM25 (Galago) rankings for Robust04.
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.queries = load_robust_04_query(query_type)
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
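
Note: the loaders above are project-specific, but the annotations in the later examples pin down their shapes: load_robust_04_query returns a Dict[str, str] of qid to query text, and load_qrels_structured returns a per-query Dict[str, int] of document relevance labels. Below is a minimal sketch of the negative-sampling pattern that neg_k implies, on toy stand-in data; every name and shape here is an assumption, not the project's actual API.

from typing import Dict, List, Tuple

# Toy stand-ins mirroring the assumed shapes (hypothetical data).
queries: Dict[str, str] = {"301": "international organized crime"}
judgement: Dict[str, Dict[str, int]] = {"301": {"FBIS3-1": 1, "FBIS3-2": 0}}
galago_rank: Dict[str, List[Tuple[str, float]]] = {
    "301": [("FBIS3-1", 12.3), ("FBIS3-2", 11.9), ("FBIS3-3", 10.2)]
}
neg_k = 1000

def sample_negatives(qid: str) -> List[str]:
    # Treat the top-neg_k BM25 documents that are not judged relevant
    # as negatives for query qid; unjudged documents count as negative.
    labels = judgement.get(qid, {})
    ranked = galago_rank[qid][:neg_k]
    return [doc_id for doc_id, _ in ranked if labels.get(doc_id, 0) == 0]

print(sample_negatives("301"))  # ['FBIS3-2', 'FBIS3-3']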
Example #2
def __init__(self, encoder, max_seq_length, top_k=100, query_type="title"):
    self.data = self.load_tokens_from_pickles()
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.galago_rank = load_bm25_best()
    self.top_k = top_k
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
Example #3
def __init__(self, encoder, max_seq_length, query_type="title"):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
Example #4
def __init__(self, max_seq_length, use_many_seg_ids=False):
    self.probe_config = Config1()
    self.queries: Dict[str, str] = load_robust_04_query("desc")
    self.tokenizer = get_tokenizer()
    self.max_seq_length = max_seq_length
    qid_list = lmap(str, get_robust_qid_list())
    self.piece_score_parser = PieceScoreParser(self.queries, qid_list,
                                               self.probe_config)
    self.use_many_seg_ids = use_many_seg_ids
Example #5
def __init__(self, doc_max_length, query_type="title", neg_k=1000, pos_only=True):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.doc_max_length = doc_max_length
    self.queries = load_robust_04_query(query_type)
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
    self.pos_only = pos_only
Example #6
def __init__(self, encoder, max_seq_length, query_type,
             target_selection_fn: Callable[[str, str, List], List[int]]):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()

    self.target_selection_fn: Callable[[str, str, List],
                                       List[int]] = target_selection_fn
Example #7
def main():
    query_type = "desc"
    queries = load_robust_04_query(query_type)
    qid_list = get_robust_qid_list()
    tokenizer = get_tokenizer()

    # Write one subword-token count per query, one per line.
    with open(at_output_dir("robust", "desc_query_len.txt"), "w") as f:
        for qid in qid_list:
            query = queries[str(qid)]
            query_tokens = tokenizer.tokenize(query)
            n_terms = len(query_tokens)
            f.write("{}\n".format(n_terms))
Example #8
def __init__(self,
             encoder,
             max_seq_length,
             score_d,
             query_type="title",
             neg_k=1000):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.score_d: Dict[str, List[float]] = score_d
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
    # Each document is handled as a fixed number of segments.
    self.n_seg_per_doc = 4
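
The n_seg_per_doc = 4 field suggests each document is scored as a fixed number of consecutive segments. A minimal sketch of that segmentation under this assumption (split_into_segments is hypothetical, not the project's code):

from typing import List

def split_into_segments(doc_tokens: List[str], n_seg: int,
                        seg_len: int) -> List[List[str]]:
    # Cut the document into n_seg consecutive pieces of at most seg_len
    # tokens; short documents yield short or empty trailing segments.
    return [doc_tokens[i * seg_len:(i + 1) * seg_len] for i in range(n_seg)]

tokens = ["tok{}".format(i) for i in range(10)]
print(split_into_segments(tokens, n_seg=4, seg_len=3))
# [['tok0', 'tok1', 'tok2'], ['tok3', 'tok4', 'tok5'],
#  ['tok6', 'tok7', 'tok8'], ['tok9']]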
Example #9
def __init__(self,
             encoder,
             max_seq_length_per_inst,
             num_doc_per_inst,
             num_seg_per_inst,
             query_type="title",
             neg_k=1000):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length_per_inst
    self.queries = load_robust_04_query(query_type)
    self.num_doc_per_inst = num_doc_per_inst
    self.num_seg_per_inst = num_seg_per_inst

    self.all_segment_encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.neg_k = neg_k
Example #10
def __init__(self,
             encoder,
             max_seq_length,
             scores,
             query_type="title",
             target_selection="best"):
    self.data = self.load_tokens()
    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    self.judgement = load_qrels_structured(qrel_path)
    self.max_seq_length = max_seq_length
    self.queries = load_robust_04_query(query_type)
    self.encoder = encoder
    self.tokenizer = get_tokenizer()
    self.galago_rank = load_bm25_best()
    self.scores: Dict[Tuple[str, str, int], float] = scores
    # Resolve the strategy name to a selection function once, up front.
    self.get_target_indices: Callable[..., List[int]] = {
        'best': get_target_indices_get_best,
        'all': get_target_indices_all,
        'first_and_best': get_target_indices_first_and_best,
        'best_or_over_09': get_target_indices_best_or_over_09,
        'random_over_09': get_target_indices_random_over_09
    }[target_selection]
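
The constructor above resolves a strategy name to a function once, rather than branching on the name at every call site. A sketch of the same dispatch-dict pattern with hypothetical selectors (the real get_target_indices_* functions are not shown on this page, so these bodies are illustrative only):

from typing import Callable, List

def select_best(scores: List[float]) -> List[int]:
    # Keep only the highest-scoring segment index.
    return [max(range(len(scores)), key=lambda i: scores[i])]

def select_all(scores: List[float]) -> List[int]:
    # Keep every segment index.
    return list(range(len(scores)))

def make_selector(name: str) -> Callable[[List[float]], List[int]]:
    # Resolve the strategy name once; unknown names fail fast with KeyError.
    return {"best": select_best, "all": select_all}[name]

selector = make_selector("best")
print(selector([0.2, 0.9, 0.4]))  # [1]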
Example #11
def main():
    n_factor = 16
    step_size = 16
    max_seq_length = 128
    max_seq_length2 = 128 - 16
    batch_size = 8
    info_file_path = at_output_dir("robust", "seg_info")
    queries = load_robust_04_query("desc")
    qid_list = get_robust_qid_list()

    f_handler = get_format_handler("qc")
    info: Dict = load_combine_info_jsons(info_file_path,
                                         f_handler.get_mapping(),
                                         f_handler.drop_kdp())
    print(len(info))
    tokenizer = get_tokenizer()

    for job_idx in [1]:
        qid = qid_list[job_idx]
        query = queries[str(qid)]
        q_term_length = len(tokenizer.tokenize(query))
        data_path1 = os.path.join(output_path, "robust",
                                  "windowed_{}.score".format(job_idx))
        data_path2 = os.path.join(output_path, "robust",
                                  "windowed_small_{}.score".format(job_idx))
        data1 = OutputViewer(data_path1, n_factor, batch_size)
        data2 = OutputViewer(data_path2, n_factor, batch_size)
        # A window holds 3 special tokens ([CLS], [SEP], [SEP]) plus the
        # query; the remainder is the document segment.
        segment_len = max_seq_length - 3 - q_term_length
        segment_len2 = max_seq_length2 - 3 - q_term_length

        outputs = []
        for d1, d2 in zip(data1, data2):
            # for each query, doc pairs
            cur_info1 = info[d1['data_id']]
            cur_info2 = info[d2['data_id']]
            query_doc_id1 = f_handler.get_pair_id(cur_info1)
            query_doc_id2 = f_handler.get_pair_id(cur_info2)

            assert query_doc_id1 == query_doc_id2

            doc = d1['doc']
            probs = get_probs(d1['logits'])
            probs2 = get_probs(d2['logits'])
            n_pred_true = np.count_nonzero(np.less(0.5, probs))
            print(n_pred_true, len(probs))

            seg_scores: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs, segment_len, step_size)
            seg_scores2: List[Tuple[int, int, float]] = get_piece_scores(
                n_factor, probs2, segment_len2, step_size)
            ss_list = []
            for st, ed, score in seg_scores:
                try:
                    st2, ed2, score2 = find_where(lambda x: x[1] == ed,
                                                  seg_scores2)
                    assert ed == ed2
                    assert st < st2
                    tokens = tokenizer.convert_ids_to_tokens(doc[st:st2])
                    diff = score - score2
                    ss = ScoredPiece(st, st2, diff, tokens)
                    ss_list.append(ss)
                except StopIteration:
                    pass
            outputs.append((probs, probs2, query_doc_id1, ss_list))

        html = HtmlVisualizer("windowed.html")

        for probs, probs2, query_doc_id, ss_list in outputs:
            html.write_paragraph(str(query_doc_id))
            html.write_paragraph("Query: " + query)

            ss_list.sort(key=lambda ss: ss.st)
            prev_end = None
            cells = []
            prob_str1 = lmap(two_digit_float, probs)
            prob_str1 = ["8.88"] + prob_str1
            prob_str2 = lmap(two_digit_float, probs2)
            html.write_paragraph(" ".join(prob_str1))
            html.write_paragraph(" ".join(prob_str2))

            for ss in ss_list:
                if prev_end is not None:
                    assert prev_end == ss.st
                else:
                    print(ss.st)

                score = abs(int(100 * ss.score))
                # Color by the sign of the diff: blue for positive, red for
                # negative. (Testing the abs() value would make every
                # nonzero span blue.)
                color = "B" if ss.score > 0 else "R"
                cells.extend(
                    [Cell(t, score, target_color=color) for t in ss.tokens])
                prev_end = ss.ed

            html.multirow_print(cells)
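
get_piece_scores is not shown here, but the loop above implies that each window probability corresponds to a (start, end) token span advanced by step_size, and that matching a long and a short window by their shared end offset isolates the leading tokens only the longer window covers. A sketch of one plausible reading (piece_scores below is an inferred reconstruction, not the project's implementation):

from typing import List, Tuple

def piece_scores(probs: List[float], segment_len: int,
                 step_size: int) -> List[Tuple[int, int, float]]:
    # One (start, end, score) span per window, advancing step_size
    # tokens at a time.
    return [(i * step_size, i * step_size + segment_len, p)
            for i, p in enumerate(probs)]

long_spans = piece_scores([0.8, 0.6, 0.7], segment_len=6, step_size=2)
short_spans = piece_scores([0.5, 0.4, 0.3], segment_len=4, step_size=2)
for st, ed, s in long_spans:
    match = next((x for x in short_spans if x[1] == ed), None)
    if match is None:
        continue
    st2, _, s2 = match
    # Tokens in [st, st2) are covered only by the long window, so the
    # probability drop s - s2 is attributed to them.
    print((st, st2), round(s - s2, 2))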
Example #12
def main():
    prediction_file_path = at_output_dir("robust", "rob_dense_pred.score")
    info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_info")
    queries: Dict[str, str] = load_robust_04_query("desc")
    tokenizer = get_tokenizer()
    query_token_len_d = {}
    for qid, q_text in queries.items():
        query_token_len_d[qid] = len(tokenizer.tokenize(q_text))
    step_size = 16
    window_size = 128
    out_entries: List[DocTokenScore] = collect_token_scores(
        info_file_path, prediction_file_path, query_token_len_d, step_size,
        window_size)

    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement_d = load_qrels_structured(qrel_path)

    html = HtmlVisualizer("robust_desc_128_step16.html", use_tooltip=True)

    tprint("loading tokens pickles")
    tokens_d: Dict[str, List[str]] = load_pickle_from(
        os.path.join(sydney_working_dir, "RobustPredictTokens3", "1"))
    tprint("Now printing")
    n_printed = 0

    def transform(x):
        # Cubic emphasis: maps [0, 1] to [0, 0.75], flattening scores
        # near 0.5 and stretching the extremes.
        return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3))

    for e in out_entries:
        max_score = e.max_segment_score()
        if max_score < 0.6:
            continue
        n_printed += 1
        if n_printed > 10:
            break
        doc_tokens: List[str] = tokens_d[e.doc_id]
        score_len = len(e.scores)
        judgement: Dict[str, int] = judgement_d[e.query_id]
        label = judgement[e.doc_id]

        if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size:
            print("doc length:", len(doc_tokens))
            print("score len:", score_len)
            print("doc length + window_size:", len(doc_tokens) + window_size)
            raise IndexError

        row = []
        q_text = queries[e.query_id]
        html.write_paragraph("qid: " + e.query_id)
        html.write_paragraph("q_text: " + q_text)
        html.write_paragraph("Pred: {0:.2f}".format(max_score))
        html.write_paragraph("Label: {0:.2f}".format(label))

        for idx in range(score_len):
            token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]'

            full_scores = e.full_scores[idx]
            full_score_str = " ".join(lmap(two_digit_float, full_scores))
            score = e.scores[idx]
            normalized_score = transform(score) * 200
            c = get_tooltip_cell(token, full_score_str)
            c.highlight_score = normalized_score
            row.append(c)

        html.multirow_print(row, 16)
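
With window_size = 128 and step_size = 16, each token position falls inside up to window_size / step_size = 8 overlapping windows, which is presumably why e.full_scores[idx] is a list while e.scores[idx] is a single value. A sketch of that per-token aggregation (the max reduction is an assumption; collect_token_scores may combine the overlapping scores differently):

from typing import Dict, List

def token_scores(window_scores: List[float], window_size: int,
                 step_size: int) -> Dict[int, List[float]]:
    # Attribute each window's score to every token position it covers.
    per_token: Dict[int, List[float]] = {}
    for w, score in enumerate(window_scores):
        start = w * step_size
        for idx in range(start, start + window_size):
            per_token.setdefault(idx, []).append(score)
    return per_token

full = token_scores([0.2, 0.9, 0.4], window_size=4, step_size=2)
reduced = {idx: max(scores) for idx, scores in full.items()}
print(full[2], reduced[2])  # [0.2, 0.9] 0.9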
Example #13
def main():
    prediction_file_path = at_output_dir("robust", "rob_dense2_pred.score")
    info_file_path = at_job_man_dir1("robust_predict_desc_128_step16_2_info")
    queries: Dict[str, str] = load_robust_04_query("desc")
    tokenizer = get_tokenizer()
    query_token_len_d = {}
    for qid, q_text in queries.items():
        query_token_len_d[qid] = len(tokenizer.tokenize(q_text))
    step_size = 16
    window_size = 128
    out_entries: List[AnalyzedDoc] = token_score_by_ablation(
        info_file_path, prediction_file_path, query_token_len_d, step_size,
        window_size)

    qrel_path = "/home/youngwookim/Downloads/rob04-desc/qrels.rob04.txt"
    judgement_d = load_qrels_structured(qrel_path)

    html = HtmlVisualizer("robust_desc_128_step16_2.html", use_tooltip=True)

    tprint("loading tokens pickles")
    tokens_d: Dict[str, List[str]] = load_pickle_from(
        os.path.join(sydney_working_dir, "RobustPredictTokens3", "1"))
    tprint("Now printing")
    n_printed = 0

    def transform(x):
        return 3 * (math.pow(x - 0.5, 3) + math.pow(0.5, 3))

    n_pos = 0
    n_neg = 0
    for e in out_entries:
        max_score: float = max(
            lmap(SegmentScorePair.get_max_score,
                 flatten(e.token_info.values())))
        # Keep the printed positives and negatives roughly balanced.
        if max_score < 0.6:
            if n_neg > n_pos:
                continue
            n_neg += 1
        else:
            n_pos += 1

        n_printed += 1
        if n_printed > 500:
            break

        doc_tokens: List[str] = tokens_d[e.doc_id]
        score_len = max(e.token_info.keys()) + 1
        judgement: Dict[str, int] = judgement_d[e.query_id]
        label = judgement[e.doc_id]

        if not len(doc_tokens) <= score_len < len(doc_tokens) + window_size:
            print("doc length:", len(doc_tokens))
            print("score len:", score_len)
            print("doc length + window_size:", len(doc_tokens) + window_size)
            continue

        row = []
        q_text = queries[e.query_id]
        html.write_paragraph("qid: " + e.query_id)
        html.write_paragraph("q_text: " + q_text)
        html.write_paragraph("Pred: {0:.2f}".format(max_score))
        html.write_paragraph("Label: {0:.2f}".format(label))

        for idx in range(score_len):
            token = doc_tokens[idx] if idx < len(doc_tokens) else '[-]'
            token_info: List[SegmentScorePair] = e.token_info[idx]
            full_scores: List[float] = lmap(SegmentScorePair.get_score_diff,
                                            token_info)

            full_score_str = " ".join(lmap(two_digit_float, full_scores))
            # Average score diff lies in [-1, 1].
            score = average(full_scores)
            if score > 0:
                color = "B"
            else:
                color = "R"
            normalized_score = transform(abs(score)) * 200
            c = get_tooltip_cell(token, full_score_str)
            c.highlight_score = normalized_score
            c.target_color = color
            row.append(c)

        html.multirow_print(row, 16)