Code example #1
def extract_qk_unit(info_path, pred_path, config_path) -> Iterable[QKUnit]:
    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions = join_prediction_with_info(pred_path, info)
    grouped: Dict[str, List[Dict]] = group_by(predictions,
                                              lambda x: x['query'].query_id)
    config = json.load(open(config_path, "r"))
    score_cut = config['score_cut']
    top_k = config['top_k']

    def is_good(entry):
        return get_regression_score(entry) > score_cut

    select_rate_list = []
    qk_units = []
    for qid, entries in grouped.items():
        any_entry = entries[0]
        query = any_entry['query']
        good_entries = lfilter(is_good, entries)
        good_entries.sort(key=get_regression_score, reverse=True)
        selected_entries = good_entries[:top_k]
        if not selected_entries:
            continue
        kd_list = lmap(lambda x: x['kdp'], selected_entries)
        qk_units.append((query, kd_list))

        select_rate = len(selected_entries) / len(entries)
        select_rate_list.append(select_rate)

    print("{} of {} qk units selected".format(len(qk_units), len(grouped)))
    print("average select rate", average(select_rate_list))
    return qk_units
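Every example on this page revolves around join_prediction_with_info plus a handful of small list utilities from the clover3/Chair repo. The following is a minimal sketch of those helpers, inferred only from the call sites above and below; it is an assumption, not the repo's actual code, and in particular the pickled-List[Dict] prediction-file format is a guess.

import pickle
from collections import defaultdict
from typing import Callable, Dict, Iterable, List

def lmap(fn: Callable, itr: Iterable) -> List:
    return list(map(fn, itr))

def lfilter(fn: Callable, itr: Iterable) -> List:
    return list(filter(fn, itr))

def average(xs) -> float:
    return sum(xs) / len(xs)

def group_by(items: Iterable, key_fn: Callable) -> Dict:
    # Bucket items by key_fn(item), keeping encounter order within each bucket.
    grouped = defaultdict(list)
    for item in items:
        grouped[key_fn(item)].append(item)
    return dict(grouped)

def join_prediction_with_info(pred_path, info, fetch_field_list=None,
                              str_data_id=False, data_id_key="data_id",
                              silent=False):
    # Assumption: the ".score" file is a pickled List[Dict] of model outputs.
    with open(pred_path, "rb") as f:
        predictions: List[Dict] = pickle.load(f)
    if not silent:
        print("{} predictions loaded".format(len(predictions)))
    out = []
    for pred in predictions:
        key = pred[data_id_key]
        if str_data_id:
            key = str(key)
        entry = dict(info[key])  # start from the per-example metadata
        for field in fetch_field_list or pred.keys():
            entry[field] = pred[field]  # overlay the requested prediction fields
        out.append(entry)
    return out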
Code example #2
def summarize_score(info: Dict, prediction_file_path: str,
                    f_handler: FormatHandler,
                    score_type) -> Iterable[TrecRankedListEntry]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry['logits'])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str],
                  List[Dict]] = group_by(data, f_handler.get_pair_id)
    print("Group size:", len(grouped))
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        query_id, doc_id = pair_id
        for score in scores:
            yield TrecRankedListEntry(query_id, doc_id, 0, score, "")
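The positional constructor call above pins down the field order; a NamedTuple consistent with TrecRankedListEntry(query_id, doc_id, 0, score, "") would look like this. This is an inferred sketch following the usual TREC run-file layout, not necessarily the repo's definition.

from typing import NamedTuple

class TrecRankedListEntry(NamedTuple):
    query_id: str
    doc_id: str
    rank: int       # fixed to 0 above; ranks can be recomputed from scores later
    score: float
    run_name: str   # empty string in this example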
Code example #3
File: best_seg_debug.py Project: clover3/Chair
def main():
    info_dir = os.path.join(job_man_dir, "best_seg_prediction_gen_train_info")
    job_id = 0
    info_file_path = os.path.join(info_dir, str(job_id) + ".info")
    print(info_file_path)
    info = json.load(open(info_file_path, "r"))
    prediction_dir = "output/mmd_ss/mmd_Z_50000"
    prediction_file = os.path.join(prediction_dir, str(job_id) + ".score")
    pred_data: List[Dict] = join_prediction_with_info(prediction_file, info)

    target_qdid = ("1000633", "D144400")
    saved_entries = []
    for entry in info.values():
        if (entry['qid'], entry['doc_id']) == target_qdid:
            saved_entries.append(entry)
            print(entry)

    print('--')
    for entry in pred_data:
        if (entry['qid'], entry['doc_id']) == target_qdid:
            print(entry)

    qid = "1000633"
    sr_path = os.path.join(job_man_dir, "seg_resource_train", qid)
    sr_per_query: SRPerQuery = load_pickle_from(sr_path)

    for sr_per_query_doc in sr_per_query.sr_per_query_doc:
        if sr_per_query_doc.doc_id == "D144400":
            print("doc {} has {} segs".format(sr_per_query_doc.doc_id,
                                              len(sr_per_query_doc.segs)))
Code example #4
def summarize_score_wo_merge(info: Dict, prediction_file_path: str,
                             f_handler: FormatHandler,
                             score_type) -> Dict[Tuple[str, str], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry['logits'])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str],
                  List[Dict]] = group_by(data, f_handler.get_pair_id)
    tprint("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        query_id, doc_id = pair_id
        scores = lmap(get_score, items)
        for idx, score in enumerate(scores):
            new_doc_id = "{}_{}".format(doc_id, idx)
            out_d[(query_id, new_doc_id)] = score

    return out_d
Code example #5
def summarize_score(info: Dict, prediction_file_path: str,
                    f_handler: FormatHandler, combine_score: Callable,
                    score_type) -> Dict[Tuple[str, str], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry['logits'])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str],
                  List[Dict]] = group_by(data, f_handler.get_pair_id)
    tprint("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        final_score = combine_score(scores)
        out_d[pair_id] = final_score

    num_items_per_group = average(lmap(len, grouped.values()))
    tprint("Num items per group : ", num_items_per_group)
    return out_d
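The score_type dispatch inside get_score is copied verbatim in examples #2, #4, and #5, while example #10 calls a shared get_score_from_logit(score_type, logits) helper instead. The consolidated sketch below mirrors the inline logic; it is assumed to match the repo's helper, with assert False swapped for an explicit error.

import scipy.special

def get_score_from_logit(score_type: str, logits) -> float:
    if score_type == "softmax":
        return scipy.special.softmax(logits)[1]  # P(positive) for 2-class logits
    elif score_type == "raw":
        return logits[0]
    elif score_type == "scalar":
        return logits  # regression-style single value
    elif score_type == "tuple":
        return logits[1]
    else:
        raise ValueError("unknown score_type: {}".format(score_type))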
Code example #6
File: score_summarizer.py Project: clover3/Chair
def load_baseline_score_d(baseline_info_file_path, pred_path, is_info_from_pickle) -> Dict[Tuple[str, str], float]:
    info = load_combine_info_jsons(baseline_info_file_path, qck_convert_map)
    predictions: List[Dict] = join_prediction_with_info(pred_path, info, ["logits"], is_info_from_pickle)
    out_entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, predictions)
    baseline_d: Dict[Tuple[str, str], float] = {}
    for e in out_entries:
        key = e.query.query_id, e.candidate.id
        score = logit_to_score_softmax(e.logits)
        baseline_d[key] = score
    return baseline_d
Code example #7
File: k_doc_viewer.py Project: clover3/Chair
def load_qk_score(config) -> List[QKOutEntry]:
    info_path = config['info_path']
    passage_score_path = config['pred_path']
    score_type = config['score_type']
    fetch_field_list = ["logits", "input_ids", "data_id"]
    data_id_to_info: Dict = load_combine_info_jsons(info_path, qk_convert_map)
    data: List[Dict] = join_prediction_with_info(passage_score_path,
                                                 data_id_to_info,
                                                 fetch_field_list)
    qk_out_entries: List[QKOutEntry] = lmap(QKOutEntry.from_dict2, data)
    return qk_out_entries
Code example #8
File: calculate_doc_score.py Project: clover3/Chair
def calculate_score(info,
                    pred_path,
                    baseline_score: Dict[Tuple[str, str], float],
                    str_data_id=False) -> List[DocValueParts]:

    predictions: List[Dict] = join_prediction_with_info(
        pred_path, info, ["logits"], str_data_id)
    out_entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, predictions)
    labels: Dict[str, List[str]] = load_labels()
    doc_score_parts: List[DocValueParts] = get_doc_value_parts(
        out_entries, baseline_score, labels)
    return doc_score_parts
Code example #9
def show_high():
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))
    # prediction_file = at_output_dir("clue_counter_arg", "ada_aawd4_clue.4000.score")
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    for e in pred_data:
        score = logit_to_score_softmax(e['logits'])
        if int(score * 100) == 13:
            print(e['text'])
Code example #10
def summarize_score(info: Dict, prediction_file_path: str,
                    score_type) -> Dict[Tuple[str, str, int], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])

    score_d: Dict[Tuple[str, str, int], float] = {}
    for entry in data:
        score = get_score_from_logit(score_type, entry['logits'])
        key = entry['query_id'], entry['doc_id'], entry['passage_idx']
        score_d[key] = score
    return score_d
Code example #11
def doc_score_predictions():
    passage_score_path = "output/cppnc/qknc_val"
    info = load_combine_info_jsons("output/cppnc/qknc_val.info", qk_convert_map)
    data = join_prediction_with_info(passage_score_path, info)
    grouped: Dict[str, List[Dict]] = group_by(data, lambda x: x['query'].query_id)

    def get_score_from_logit(logits):
        return scipy.special.softmax(logits)[1]

    for cid, passages in grouped.items():
        scores: List[float] = lmap(lambda d: get_score_from_logit(d['logits']), passages)
        yield cid, scores
Code example #12
def main(config):
    info_dir = config['info_path']
    prediction_file = config['pred_path']

    f_handler = get_format_handler("qck")
    info = load_combine_info_jsons(info_dir, f_handler.get_mapping(),
                                   f_handler.drop_kdp())
    data: List[Dict] = join_prediction_with_info(prediction_file, info,
                                                 ["data_id", "logits"])
    out_entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, data)
    qrel: Dict[str, Dict[str,
                         int]] = load_qrels_structured(config['qrel_path'])

    def get_label(query_id, candi_id):
        if candi_id in qrel[query_id]:
            return qrel[query_id][candi_id]
        else:
            return 0

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    grouped: Dict[str,
                  List[QCKOutEntry]] = group_by(out_entries,
                                                lambda x: x.query.query_id)
    for query_id, items in grouped.items():
        raw_kdp_list = [(x.kdp.doc_id, x.kdp.passage_idx) for x in items]
        kdp_list = unique_list(raw_kdp_list)

        raw_candi_id_list = [x.candidate.id for x in items]
        candi_id_list = unique_list(raw_candi_id_list)

        logit_d = {(x.candidate.id, (x.kdp.doc_id, x.kdp.passage_idx)):
                   x.logits
                   for x in items}
        labels = [get_label(query_id, candi_id) for candi_id in candi_id_list]
        head_row0 = [" "] + labels
        head_row1 = [" "] + candi_id_list
        rows = [head_row0, head_row1]
        for kdp_sig in kdp_list:
            row = [kdp_sig]
            for candi_id in candi_id_list:
                try:
                    score = logit_to_score_softmax(logit_d[candi_id, kdp_sig])
                    score_str = "{0:.2f}".format(score)
                except KeyError:
                    score_str = "-"
                row.append(score_str)
            rows.append(row)

        print(query_id)
        print_table(rows)
Code example #13
def main():
    info_path = sys.argv[1]
    pred_path = sys.argv[2]

    info = load_combine_info_jsons(info_path, True)
    predictions = join_prediction_with_info(pred_path, info, silent=True)
    out_entries: List[OutEntry] = lmap(OutEntry.from_dict, predictions)
    g = group_by(out_entries, lambda x: x.doc_id)

    for doc_id in g:
        entries: List[OutEntry] = g[doc_id]
        scores = [logit_to_score_softmax(e.logits) for e in entries]
        print(doc_id, max(scores))
Code example #14
File: read_score_ablation.py Project: clover3/Chair
def enum_best_segments_always(pred_path, info) -> Iterable[Dict]:
    entries = join_prediction_with_info(pred_path, info)
    grouped = group_by(entries, lambda e: (e['query_id'], e['doc_id']))

    for key in grouped:
        sub_entries = grouped[key]

        def get_score(e):
            return logit_to_score_softmax(e['logits'])

        max_idx = find_max_idx(sub_entries, get_score)

        selected_raw_entry = sub_entries[max_idx]
        yield selected_raw_entry
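find_max_idx is used here and in example #25, always as entries[find_max_idx(entries, get_score)], i.e. an argmax over a key function. A minimal sketch consistent with that usage (an assumption, not the repo's code):

from typing import Callable, List, TypeVar

T = TypeVar('T')

def find_max_idx(items: List[T], score_fn: Callable[[T], float]) -> int:
    # Index of the highest-scoring item; ties resolve to the earliest index.
    return max(range(len(items)), key=lambda i: score_fn(items[i]))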
Code example #15
def main():
    baseline_score: Dict[Tuple[str, str], float] = load_baseline()
    score_save_path = sys.argv[1]
    info_path = sys.argv[2]
    info = load_combine_info_jsons(info_path, qck_convert_map, False)
    # calculate score for each kdp
    predictions: List[Dict] = join_prediction_with_info(
        score_save_path, info, ["logits"], True)
    out_entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, predictions)
    labels: Dict[str, List[str]] = load_labels()
    doc_score_parts: List[DocValueParts2] = get_doc_value_parts2(
        out_entries, baseline_score, labels)
    summary_save_path = sys.argv[3]
    pickle.dump(doc_score_parts, open(summary_save_path, "wb"))
Code example #16
def get_f5_tids_score_d_from_bert():
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))

    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)
    score_d = {}

    for e in pred_data:
        score = logit_to_score_softmax(e['logits'])
        text = e['text']
        score_d[text] = score
    return score_d
Code example #17
File: collect_score.py Project: clover3/Chair
def collect_and_save_score(config):
    info_path = config['info_path']
    pred_path = config['pred_path']
    save_path = config['save_path']

    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions: List[Dict] = join_prediction_with_info(pred_path,
                                                        info,
                                                        ['data_id', 'logits', 'input_ids', 'label_ids'],
                                                        )
    outputs: Iterable[QKTokenLevelOutEntry] = map(QKTokenLevelOutEntry.from_dict, predictions)

    per_query_infos: Dict[str, Dict[WordAsID, np.array]] = group_average_per_query(outputs)
    pickle.dump(per_query_infos, open(save_path, "wb"))
Code example #18
File: selection_analysis.py Project: clover3/Chair
def main(pred_file_path: str,
         info_file_path: str,
         info_file_path2: str,
         save_name: str,
         input_type: str,
         qrel_path: str,
         ):

    judgement = load_qrels_structured(qrel_path)
    def get_label(key):
        query_id, doc_id = key
        try:
            return judgement[query_id][doc_id]
        except KeyError:
            return 0

    f_handler = get_format_handler(input_type)
    info: Dict = load_combine_info_jsons(info_file_path, f_handler.get_mapping(), f_handler.drop_kdp())

    info2: Dict = load_combine_info_jsons(info_file_path2, f_handler.get_mapping(), f_handler.drop_kdp())
    doc_length = get_doc_length_info(info2)
    key_logit = "logits"

    data: List[Dict] = join_prediction_with_info(pred_file_path, info, ["data_id", key_logit])

    grouped = group_by(data, f_handler.get_pair_id)

    cnt = Counter()
    for key, entries in grouped.items():
        if not get_label(key):
            continue
        seg_groups = {}
        for e in entries:
            probs = scipy.special.softmax(e['logits'])[:, 1]
            seg_groups[e['idx']] = probs

        indices = list(seg_groups.keys())
        indices.sort()
        assert max(indices) == len(indices) - 1
        all_probs = []
        for seg_group_idx in indices:
            all_probs.extend(seg_groups[seg_group_idx])

        num_seg = doc_length[key]
        max_idx = np.argmax(all_probs[:num_seg])
        cnt[(max_idx, num_seg)] += 1

    save_to_pickle(cnt, save_name)
Code example #19
File: summarize.py Project: clover3/Chair
def main(config):
    info = load_combine_info_jsons(config['info_path'])
    predictions: List[Dict] = join_prediction_with_info(config['pred_path'], info,
                                                        ["data_ids", "logits"], True, "data_ids")
    entries: List[OutEntry] = lmap(OutEntry.from_dict, predictions)

    def is_pos(e: OutEntry):
        return logit_to_score_softmax(e.logits) > 0.5
    pos_entries = filter(is_pos, entries)

    rows = []
    for e in pos_entries:
        row = [e.cid, e.pid, e.doc_id, e.sent_idx]
        rows.append(row)
    print_table(rows)
Code example #20
File: token_score.py Project: clover3/Chair
def load_scores(info_file_path, prediction_file_path):
    input_type = "qc"
    f_handler = get_format_handler(input_type)
    tprint("Loading json info")
    info: Dict = load_combine_info_jsons(info_file_path,
                                         f_handler.get_mapping(),
                                         f_handler.drop_kdp())
    key_logit = "logits"
    tprint("Reading predictions...")
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info,
                                                 ["data_id", key_logit])
    grouped: Dict[Tuple[str, str],
                  List[Dict]] = group_by(data, f_handler.get_pair_id)
    print("number of groups:", len(grouped))
    return grouped
Code example #21
def extract_qk_unit(info_path, pred_path):
    info = load_combine_info_jsons(info_path, qk_convert_map, False)
    predictions = join_prediction_with_info(pred_path, info)
    grouped: Dict[str, List[Dict]] = group_by(predictions, lambda x: x['query'].query_id)

    rows = []
    for qid, entries in grouped.items():
        any_entry = entries[0]
        query = any_entry['query']
        rows.append([query.query_id, query.text])
        for entry in entries:
            row = [get_regression_score(entry),
                   entry['kdp'].doc_id,
                   entry['kdp'].passage_idx]
            rows.append(row)

    print_table(rows)
Code example #22
def main(config):
    info = load_combine_info_jsons(config['info_path'])
    predictions: List[Dict] = join_prediction_with_info(
        config['pred_path'], info)
    entries = lmap(OutEntry.from_dict, predictions)

    def get_doc_id(e: OutEntry):
        return e.doc_id

    grouped = group_by(entries, get_doc_id)

    for doc_id in grouped:
        doc_entries = grouped[doc_id]
        doc_entries.sort(key=lambda x: x.sent_idx)
        n_pos = 0
        for s in doc_entries:
            if s.logits[1] > 0.5:
                n_pos += 1
        tab_print(doc_id, n_pos, len(doc_entries))
Code example #23
def collect_good_passages(data_id_to_info: Dict[str, Dict],
                          passage_score_path: FilePath,
                          config: Dict
                          ) -> List[Tuple[str, List[QKOutEntry]]]:
    global recover_subtokens
    recover_subtokens = get_recover_subtokens()
    score_cut = config['score_cut']
    top_k = config['top_k']
    score_type = config['score_type']
    fetch_field_list = ["logits", "input_ids", "data_id"]
    data: List[Dict] = join_prediction_with_info(passage_score_path,
                                                 data_id_to_info,
                                                 fetch_field_list
                                                 )
    qk_out_entries: List[QKOutEntry] = lmap(QKOutEntry.from_dict, data)

    grouped: Dict[str, List[QKOutEntry]] = group_by(qk_out_entries, lambda x: x.query.query_id)

    def get_score_from_logit_local(logits) -> float:
        return get_score_from_logit(score_type, logits)

    def get_score(entry: QKOutEntry):
        return get_score_from_logit_local(entry.logits)

    def is_good(qk_out_entry: QKOutEntry):
        score = get_score_from_logit_local(qk_out_entry.logits)
        return score >= score_cut

    output = []
    num_passages = []
    for cid, passages in grouped.items():
        good_passages = lfilter(is_good, passages)
        good_passages.sort(key=get_score, reverse=True)
        num_passages.append(len(good_passages))
        if good_passages:
            output.append((cid, good_passages[:top_k]))
        else:
            scores = lmap(get_score, passages)
            scores.sort(reverse=True)

    print(num_passages)
    print("{} of {} queries have passages".format(len(output), len(grouped)))
    return output
Code example #24
def main():
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd4_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    def bin_fn(score):
        return str(int(score * 100))

    histogram = BinHistogram(bin_fn)
    for e in pred_data:
        score = logit_to_score_softmax(e['logits'])
        histogram.add(score)

    for i in range(101):
        key = str(i)
        if key in histogram.counter:
            print(key, histogram.counter[key])
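BinHistogram is exercised here through only three touchpoints: the bin_fn passed to the constructor, add(), and the counter attribute. A minimal class matching that surface, inferred from this call site alone:

from collections import Counter
from typing import Callable

class BinHistogram:
    # Histogram over the string bucket labels produced by bin_fn.
    def __init__(self, bin_fn: Callable[[float], str]):
        self.bin_fn = bin_fn
        self.counter = Counter()

    def add(self, value: float):
        self.counter[self.bin_fn(value)] += 1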
Code example #25
    def get_best_seg_info(self, job_id) -> Dict[Tuple[str, str], int]:
        info = json.load(
            open(os.path.join(self.info_dir,
                              str(job_id) + ".info"), "r"))
        prediction_file = os.path.join(self.prediction_dir,
                                       str(job_id) + ".score")
        pred_data: List[Dict] = join_prediction_with_info(
            prediction_file, info)

        def get_score(entry):
            return self.logits_to_score(entry['logits'])

        qdid_grouped = group_by(pred_data, lambda d: (d['qid'], d['doc_id']))
        qdid_to_max_seg_idx: Dict[Tuple[str, str], int] = {}
        for qdi, entries in qdid_grouped.items():
            query_id, doc_id = qdi
            max_seg_idx = entries[find_max_idx(entries, get_score)]['seg_idx']
            qdid_to_max_seg_idx[query_id, doc_id] = max_seg_idx
        return qdid_to_max_seg_idx
Code example #26
def summarize_score(info_dir, prediction_file) -> Dict[Tuple[str, str], float]:
    info = load_combine_info_jsons(info_dir, qckl_convert_map, False)
    print("Info has {} entries".format(len(info)))
    data: List[Dict] = join_prediction_with_info(prediction_file, info,
                                                 ["data_id", "logits"])

    def get_score(entry):
        return entry['logits']

    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data, get_qc_pair_id)
    print("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        assert len(scores) == 1
        final_score = scores[0]
        out_d[pair_id] = final_score

    num_items_per_group = average(lmap(len, grouped.values()))
    print("Num items per group : ", num_items_per_group)
    return out_d
Code example #27
def print_top_k():
    k = 30
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    info = json.load(open(info_save_path, "r"))
    prediction_file = at_output_dir("clue_counter_arg",
                                    "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    simple_data = []

    text_set = set()
    for e in pred_data:
        score = logit_to_score_softmax(e['logits'])
        text = e['text']
        if text in text_set:
            continue
        text_set.add(text)
        simple_data.append((text, score))

    simple_data.sort(key=get_second, reverse=True)
    for text, score in simple_data[:k]:
        tab_print(score * 100, text)
Code example #28
def load_cppnc_score(fetch_field_list=None) -> Dict[str, Dict[str, List[Dict]]]:
    save_name = "qcknc_dense_val"

    score_name_list = []
    for i in range(0, 17):
        score_name_list.append("qcknc_dense_val_{}".format(i))

    out_dir = os.path.join(output_path, "cppnc")
    exist_or_mkdir(out_dir)
    info_file_path = os.path.join(out_dir, save_name + ".info")

    print("loading json info BEGIN")
    info = load_combine_info_jsons(info_file_path, qck_convert_map)
    print("loading json info DONE")
    all_predictions = []
    for score_name in score_name_list:
        pred_file_path = os.path.join(out_dir, score_name + ".score")
        print(score_name)
        predictions = join_prediction_with_info(pred_file_path, info, fetch_field_list)
        all_predictions.extend(predictions)

    qid_grouped = group_by_qid_cid(all_predictions)
    return qid_grouped
Code example #29
def main():
    config = load_run_config()
    info = load_combine_info_jsons(config['info_path'], qck_convert_map, False)
    label_d: Dict[int, List[int]] = get_claim_perspective_id_dict2()
    print("Info length:", len(info))
    predictions: List[Dict] = join_prediction_with_info(
        config['pred_path'], info)
    print("Prediction length:", len(predictions))
    out_entries: List[QCKOutEntry] = lmap(QCKOutEntry.from_dict, predictions)

    out_entries = out_entries[:10000]
    out_entries.sort(key=lambda x: logit_to_score_softmax(x.logits),
                     reverse=True)

    def get_label(entry: QCKOutEntry):
        return int(entry.candidate.id) in label_d[int(entry.query.query_id)]

    rows = []
    for entry in out_entries[:100]:
        label = get_label(entry)
        score = logit_to_score_softmax(entry.logits)
        print_info(entry, rows, score, label)
    print_table(rows)
Code example #30
def do_job(input_dir, output_dir, info_dir, label_info_path, max_entries,
           job_id):

    exist_or_mkdir(output_dir)
    info_output_dir = output_dir + "_info"
    exist_or_mkdir(info_output_dir)

    label_info: List[Tuple[str, str,
                           int]] = json.load(open(label_info_path, "r"))
    label_info_d = {(str(a), str(b)): c for a, b, c in label_info}

    pred_path = os.path.join(input_dir, str(job_id) + ".score")
    #info_path = os.path.join(info_dir, str(job_id) + ".info")
    info_path = info_dir
    output_path = os.path.join(output_dir, str(job_id))
    info_output_path = os.path.join(info_output_dir, str(job_id))
    info = load_combine_info_jsons(info_path, qck_convert_map, True)
    fetch_field_list = ["vector", "data_id"]

    predictions = join_prediction_with_info(pred_path, info, fetch_field_list)

    def get_qid(entry):
        return entry['query'].query_id

    def get_candidate_id(entry):
        return entry['candidate'].id

    def pair_id(entry) -> Tuple[str, str]:
        return get_qid(entry), get_candidate_id(entry)

    groups: Dict[Tuple[str, str], List[Dict]] = group_by(predictions, pair_id)

    def get_new_entry(entries: List[Dict]):
        if not entries:
            return None
        vectors: Vectors = list([e['vector'] for e in entries])
        key = pair_id(entries[0])
        if key in label_info_d:
            label: Label = label_info_d[key]
        else:
            label: Label = 0

        return vectors, label

    g2: Dict[Tuple[str, str],
             Tuple[Vectors, Label]] = dict_value_map(get_new_entry, groups)
    base = 100 * 1000 * job_id
    max_count = 100 * 1000 * (job_id + 1)
    data_id_manager = DataIDManager(base, max_count)

    def get_out_itr() -> Iterable[Tuple[int, Tuple[Vectors, Label]]]:
        for key, data in g2.items():
            qid, cid = key
            data_info = {
                'qid': qid,
                'cid': cid,
            }
            data_id = data_id_manager.assign(data_info)
            yield data_id, data

    write_to_file(output_path, get_out_itr(), max_entries)
    json.dump(data_id_manager.id_to_info, open(info_output_path, "w"))