Example 1
def select_window(score: List[float], window_size: int,
                  n_seg_per_doc: int) -> List[Tuple[int, int]]:
    # There must be enough scored tokens to fill every requested window.
    assert len(score) >= window_size * n_seg_per_doc
    # Aggregate per-token scores into fixed-size lines of line_len tokens.
    line_len = 16
    n_line = ceil_divide(len(score), line_len)
    line_scores: List[float] = [
        sum(score[i * line_len:(i + 1) * line_len]) for i in range(n_line)]
    line_per_window = ceil_divide(window_size, line_len)
    selected_windows = []
    for _ in range(n_seg_per_doc):
        # Re-pick the highest-scoring line from the current (possibly zeroed)
        # scores, so each window is anchored on a still-uncovered region.
        best_idx = argsort(line_scores)[::-1][0]
        # Enumerate every window of line_per_window lines that contains
        # best_idx, clamping the start so the slice never goes negative.
        candidate_list = []
        for i in range(line_per_window):
            st = max(best_idx - i, 0)
            ed = st + line_per_window
            score_sum = sum(line_scores[st:ed])
            candidate_list.append((st, ed, score_sum))

        # Keep the candidate window with the largest score mass.
        candidate_list.sort(key=lambda x: x[2], reverse=True)
        st, ed, _ = candidate_list[0]

        # Zero out the chosen lines so later windows prefer other regions.
        for j in range(st, min(ed, n_line)):
            line_scores[j] = 0

        selected_windows.append((st, ed))

    # Return the windows in document order (by start line index).
    selected_windows.sort(key=get_first)
    return selected_windows
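
This example (and several below) relies on small utility helpers from the repository whose definitions are not shown here. The sketch below states what they are assumed to do, inferred purely from how they are used; it is not the repository's own implementation:

from typing import List, Sequence, Tuple


def ceil_divide(numerator: int, denominator: int) -> int:
    # Integer division rounded up, e.g. ceil_divide(7, 2) == 4.
    return -(-numerator // denominator)


def argsort(values: Sequence[float]) -> List[int]:
    # Indices that would sort `values` in ascending order.
    return sorted(range(len(values)), key=lambda i: values[i])


def get_first(pair):
    # Sort-key helper: the first element of a pair/tuple.
    return pair[0]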
Example 2
    def encode(self, query_id: str,
               doc_id: str) -> Iterable[TokensAndSegmentIds]:
        try:
            sp_list: List[
                ScoredPieceFromPair] = self.piece_score_parser.get_piece_score(
                    query_id, doc_id)
            query = self.queries[str(query_id)]
            query_tokens: List[str] = self.tokenizer.tokenize(query)
            q_term_len = len(query_tokens)
            # Token budget left for the document after the query tokens
            # (the extra 4 presumably covers special tokens).
            available_length = self.max_seq_length - q_term_len - 4
            # Rough estimate of the document length, used to bound how many
            # first/second piece candidates are enumerated below.
            maybe_doc_length = self.probe_config.n_factor * self.probe_config.step_size
            n_piece = ceil_divide(maybe_doc_length,
                                  self.probe_config.max_seq_length) * 2
            n1 = n_piece
            n2 = max(n_piece - 1, 1)

            two_piece_list: Iterable[
                PiecewiseSegment] = select_many_two_piece_segment(
                    self.probe_config, available_length, sp_list, n1, n2)

            def format_as_tas(two_piece):
                return to_tokens_and_segment_ids(query_tokens, sp_list,
                                                 two_piece,
                                                 self.max_seq_length,
                                                 self.use_many_seg_ids)

            return map(format_as_tas, two_piece_list)
        except KeyError:
            # No piece scores or no query text for this pair: emit nothing.
            return []
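
To make the candidate-count arithmetic concrete, here is a walk-through with made-up probe_config values (illustrative only; the real configuration is not shown in these examples):

# n_factor = 16, step_size = 64, max_seq_length = 512, q_term_len = 12
# available_length = 512 - 12 - 4 = 496
# maybe_doc_length = 16 * 64 = 1024
# n_piece          = ceil_divide(1024, 512) * 2 = 4
# n1, n2           = 4, 3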
Example 3
def main():
    # counter = get_doc_length_counter()
    # save_to_pickle(counter, "robust_doc_length_counter")
    counter: Counter = get_doc_length_counter_from_pickle()
    seg_length = 500

    all_keys = list(counter.keys())
    all_keys.sort()

    # Histogram: how many seg_length-token segments each document needs.
    num_seg_count = Counter()
    for doc_len in all_keys:
        num_seg = ceil_divide(doc_len, seg_length)
        cnt = counter[doc_len]
        assert isinstance(cnt, int)
        num_seg_count[num_seg] += cnt

    num_docs = sum(counter.values())
    acc_portion = 0
    # Print the distribution (and cumulative distribution) of segment counts.
    for key in sorted(num_seg_count.keys()):
        cnt = num_seg_count[key]
        assert isinstance(cnt, int)
        portion = cnt / num_docs
        acc_portion += portion
        # print("{0}\t{1}\t{2:.2f}\t{3:.2f}".format(key, cnt, portion, acc_portion))
        print("{0}\t{1}\t{2:.4f}\t{3:.4f}".format(key, cnt, portion,
                                                  acc_portion))
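
A quick sanity check of the binning logic on a hypothetical length counter (the numbers below are illustrative, not from the robust_doc_length_counter pickle):

from collections import Counter

# Two documents of 300 tokens, one of 800, one of 1600.
counter = Counter({300: 2, 800: 1, 1600: 1})
seg_length = 500

num_seg_count = Counter()
for doc_len, cnt in counter.items():
    num_seg_count[-(-doc_len // seg_length)] += cnt  # ceil_divide

# num_seg_count == Counter({1: 2, 2: 1, 4: 1}); with 4 documents in total,
# main() would print portions 0.5000, 0.2500, 0.2500 accumulating to 1.0000.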
Example 4
def select_many_two_piece_segment(
        probe_config, available_length, sp_list: List[ScoredPieceFromPair],
        n1, n2) -> Iterable[PiecewiseSegment]:
    # Give roughly half of the available budget to the first segment,
    # rounded up to a whole number of step_size pieces.
    seg1_num_piece = ceil_divide(ceil_divide(available_length, 2),
                                 probe_config.step_size)
    seg1_length = seg1_num_piece * probe_config.step_size
    # Select first segment

    first_piece_candidates: List[Tuple[ScoredInterval,
                                       float]] = make_first_piece_candidates(
                                           seg1_num_piece, sp_list)
    # The remaining budget goes to the second segment, rounded down.
    seg2_length = available_length - seg1_length
    seg2_num_seg = seg2_length // probe_config.step_size
    # Pair each of the top-n1 first pieces with the top-n2 second pieces
    # ranked relative to that first piece, yielding up to n1 * n2 segments.
    for i1 in range(min(n1, len(first_piece_candidates))):
        first_piece, _ = first_piece_candidates[i1]
        second_piece_candidates: List[Tuple[ScoredInterval, float]] = \
            make_second_piece_candidates(seg2_num_seg, sp_list, first_piece)
        for i2 in range(min(n2, len(second_piece_candidates))):
            second_piece, _ = second_piece_candidates[i2]
            output_piece_list = combine_interval(first_piece, second_piece)
            yield output_piece_list
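
The budget split is easier to follow with concrete numbers (illustrative values only; the same arithmetic is reused by select_a_two_piece_segment in the next example):

# available_length = 450, probe_config.step_size = 64
# seg1_num_piece = ceil_divide(ceil_divide(450, 2), 64) = ceil_divide(225, 64) = 4
# seg1_length    = 4 * 64 = 256
# seg2_length    = 450 - 256 = 194
# seg2_num_seg   = 194 // 64 = 3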
Example 5
def select_a_two_piece_segment(
        probe_config, available_length,
        sp_list: List[ScoredPieceFromPair]) -> PiecewiseSegment:
    seg1_num_piece = ceil_divide(ceil_divide(available_length, 2),
                                 probe_config.step_size)
    seg1_length = seg1_num_piece * probe_config.step_size
    # Select first segment

    first_piece_candidates: List[Tuple[ScoredInterval,
                                       float]] = make_first_piece_candidates(
                                           seg1_num_piece, sp_list)
    first_piece, _ = first_piece_candidates[0]

    # The remaining budget goes to the second segment, rounded down.
    seg2_length = available_length - seg1_length
    seg2_num_seg = seg2_length // probe_config.step_size

    second_piece_candidates: List[Tuple[ScoredInterval,
                                        float]] = make_second_piece_candidates(
                                            seg2_num_seg, sp_list, first_piece)
    second_piece, _ = second_piece_candidates[0]

    return combine_interval(first_piece, second_piece)
Example 6
def get_query_split():
    xml_path = "/mnt/nfs/work3/youngwookim/code/Chair/data/CLEFeHealth2017IRtask/queries/queries2016.xml"
    queries: List[Query] = load_xml_query(xml_path)

    n_query = len(queries)

    n_split = 5
    split_size = ceil_divide(n_query, n_split)

    # The first (n_split - 1) splits are used for training; the last one
    # (roughly a fifth of the queries) is held out as the test set.
    cut = (n_split - 1) * split_size
    train_queries = queries[:cut]
    test_queries = queries[cut:]

    return train_queries, test_queries
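
For instance, with a hypothetical query count (the actual size of queries2016.xml is not shown here):

# n_query = 300, n_split = 5
# split_size = ceil_divide(300, 5) = 60
# cut        = (5 - 1) * 60 = 240   -> queries[:240] train, queries[240:] test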
Example 7
    def __init__(self, option, out_dir):
        self.out_dir = out_dir
        self.ci = RankedListInterface()
        print("load__data_point")
        self.all_data_points: List[TPDataPoint] = lmap(ukp_datapoint_to_tp_datapoint, load_all_data_flat())
        self.data_step_size = 50

        total_jobs = ceil_divide(len(self.all_data_points), self.data_step_size)
        print("total_jobs :", total_jobs )
        print("Load term stat")
        _, clue12_13_df = load_clueweb12_B13_termstat()
        self.clue12_13_df = clue12_13_df
        self.dp_id_to_q_res_id_fn = build_dp_id_to_q_res_id_fn()
        self.tokenizer = get_tokenizer()
        self.option = option
Example 8
def serialize(features_list):
    num_worker = 4
    output = []
    with ProcessPoolExecutor(max_workers=num_worker) as executor:
        future_list = []
        # Split features_list into num_worker contiguous chunks (the last
        # chunk may be shorter) and encode each chunk in its own process.
        job_per_worker = ceil_divide(len(features_list), num_worker)
        for idx in range(num_worker):
            st = idx * job_per_worker
            ed = (idx + 1) * job_per_worker
            future = executor.submit(enc, features_list[st:ed])
            future_list.append(future)

        # Gather results in submission order so output order matches input order.
        for future in future_list:
            sub_outputs = future.result()
            output.extend(sub_outputs)
    return output
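
A self-contained sketch of the same fan-out/fan-in pattern, with a trivial stand-in for enc (the real enc and its feature types live elsewhere in the repository):

from concurrent.futures import ProcessPoolExecutor


def enc_stub(chunk):
    # Stand-in for enc(): pretend to serialize each item.
    return [str(x) for x in chunk]


def serialize_stub(items, num_worker=4):
    job_per_worker = -(-len(items) // num_worker)  # ceil_divide
    output = []
    with ProcessPoolExecutor(max_workers=num_worker) as executor:
        futures = [executor.submit(enc_stub,
                                   items[i * job_per_worker:(i + 1) * job_per_worker])
                   for i in range(num_worker)]
        for future in futures:
            output.extend(future.result())
    return output


if __name__ == "__main__":
    print(serialize_stub(list(range(10))))  # ['0', '1', ..., '9']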
Example 9
def main():
    is_correct_fn = get_is_correct_fn()
    for split in splits[:2]:
        qk_candidate = load_from_pickle("pc_evi_filtered_qk_{}".format(split))
        qk_candidate = sample_kdps(qk_candidate)
        tprint("Loading candidates..")
        candidate_dict = load_bal_candidate(split)
        tprint("{} dict keys".format(len(candidate_dict)))

        tprint("Initializing generator..")
        generator = QCKInstanceGenerator(candidate_dict, is_correct_fn)
        # Each job processes n_qk_per_job QK units.
        n_qk_per_job = 10
        num_jobs = ceil_divide(d_n_pc_per_split[split], n_qk_per_job)

        def worker_factory(out_dir):
            worker = QCKWorkerMultiple(qk_candidate, generator, n_qk_per_job, out_dir)
            return worker

        job_name = "pc_evi_qck3_{}".format(split)
        runner = JobRunnerS(job_man_dir, num_jobs, job_name, worker_factory)
        runner.start()
Example 10
def qck_gen_w_ranked_list_multiple(job_name, qk_candidate_name,
                                   ranked_list_path, split, n_qk_per_job):
    claim_ids = load_claim_ids_for_split(split)
    cids: List[str] = lmap(str, claim_ids)
    qk_candidate: List[QKUnit] = load_from_pickle(qk_candidate_name)
    print("cids", len(cids))
    print("len(qk_candidate)", len(qk_candidate))
    print("Generate instances : ", split)
    generator = QCKInstanceGenerator(
        get_qck_candidate_from_ranked_list_path(ranked_list_path),
        is_correct_factory())
    # Keep only the QK units whose query belongs to this split.
    qk_candidate_train: List[QKUnit] = [
        qk for qk in qk_candidate if qk[0].query_id in cids]

    def worker_factory(out_dir):
        return QCKWorkerMultiple(qk_candidate_train, generator, n_qk_per_job,
                                 out_dir)

    # One job per n_qk_per_job QK units.
    num_qks = d_n_claims_per_split2[split]
    num_jobs = ceil_divide(num_qks, n_qk_per_job)
    runner = JobRunnerS(job_man_dir, num_jobs, job_name + "_" + split,
                        worker_factory)
    runner.start()
Example 11
from data_generator.job_runner import JobRunner, sydney_working_dir
from galagos.process_jsonl_doc_lines import JsonlWorker
from misc_lib import ceil_divide

if __name__ == "__main__":
    num_lines = 231423
    block_size = 100
    # One job per block of jsonl lines.
    num_jobs = ceil_divide(num_lines, block_size)
    jsonl_path = "/mnt/nfs/work3/youngwookim/data/counter_arg/q_res/ca_docs.jsonl"
    print("Start")
    runner = JobRunner(sydney_working_dir, num_jobs - 1, "ca_docs",
                       lambda out_dir: JsonlWorker(jsonl_path, out_dir))
    runner.start()
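
For reference, the job-count arithmetic here works out as follows (the num_jobs - 1 passed to JobRunner presumably denotes the highest job index rather than the job count):

# num_jobs     = ceil_divide(231423, 100) = 2315
# num_jobs - 1 = 2314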