Ejemplo n.º 1
0
def select_doc_per_query_top50(split):
    """Write one TSV row per query: its positive docs plus top-ranked negatives.

    Streams the top-100 file for ``split`` (rows are whitespace separated with
    six fields; the 1st is the query id, the 3rd the doc id, the 4th the rank)
    and groups consecutive rows that share a query id. For each group a row
    ``query_id <TAB> positives... <TAB> negatives...`` is appended to
    ``train_docs_top50_<split>.tsv`` under ``root_dir``. Negatives are the
    listed docs that are not in ``ms_reader.qrel[query_id]`` and whose rank is
    below 50.

    :param split: split name, forwarded to ``MSMarcoDataReader`` and
        ``open_top100`` and embedded in the output file name.
    """
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir, "train_docs_top50_{}.tsv".format(split))

    def pop(out_f, query_id, cur_doc_ids: List[Tuple[str, int]]):
        """Emit the output row for one finished query group."""
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = [
            doc_id for doc_id, rank in cur_doc_ids
            if doc_id not in pos_docs and rank < 50
        ]
        doc_needed = pos_docs + neg_docs
        row = [query_id] + doc_needed
        out_f.write("\t".join(row) + "\n")

    total_line = 36701116
    # The context manager guarantees the output is flushed and closed even
    # when reading or parsing fails part-way through.
    with open(save_path, "w") as out_f:
        ticker = TimeEstimator(total_line, "reading", 1000)
        with open_top100(split) as top100f:
            last_topic_id = None
            cur_doc_ids = []
            for line in top100f:
                [topic_id, _, doc_id, rank, _, _] = line.split()
                if last_topic_id is None:
                    last_topic_id = topic_id
                elif last_topic_id != topic_id:
                    pop(out_f, last_topic_id, cur_doc_ids)
                    last_topic_id = topic_id
                    cur_doc_ids = []

                ticker.tick()
                cur_doc_ids.append((doc_id, int(rank)))

            # The loop only flushes a group when the *next* query starts, so
            # the final group has to be written explicitly.
            if last_topic_id is not None:
                pop(out_f, last_topic_id, cur_doc_ids)
Ejemplo n.º 2
0
def select_doc_per_query(split):
    """Write one TSV row per query: its positive docs plus sampled negatives.

    Streams the top-100 file for ``split`` (rows are whitespace separated with
    six fields; the 1st is the query id, the 3rd the doc id) and groups
    consecutive rows that share a query id. For every query that has at least
    one positive doc in ``ms_reader.qrel``, a row
    ``query_id <TAB> positives... <TAB> negatives...`` is appended to
    ``train_docs_10times_<split>.tsv`` under ``root_dir``. Negatives are a
    random sample of the listed non-positive docs, ten per positive, capped at
    the number of negatives available.

    :param split: split name, forwarded to ``MSMarcoDataReader`` and
        ``open_top100`` and embedded in the output file name.
    """
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir,
                             "train_docs_10times_{}.tsv".format(split))

    def pop(out_f, query_id, cur_doc_ids: Set):
        """Emit the output row for one finished query group (if it has positives)."""
        pos_docs = ms_reader.qrel[query_id]
        if not pos_docs:
            return
        neg_docs = [doc_id for doc_id in cur_doc_ids if doc_id not in pos_docs]
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more negatives than exist.
        num_neg_docs = min(10 * len(pos_docs), len(neg_docs))
        sel_docs = random.sample(neg_docs, num_neg_docs)
        doc_needed = pos_docs + sel_docs
        row = [query_id] + doc_needed
        out_f.write("\t".join(row) + "\n")

    total_line = 36701116
    # The context manager guarantees the output is flushed and closed even
    # when reading or parsing fails part-way through.
    with open(save_path, "w") as out_f:
        ticker = TimeEstimator(total_line, "reading", 1000)
        with open_top100(split) as top100f:
            last_topic_id = None
            cur_doc_ids = set()
            for line in top100f:
                # Unpacking all six fields keeps the failure on malformed rows.
                [topic_id, _, doc_id, _, _, _] = line.split()
                if last_topic_id is None:
                    last_topic_id = topic_id
                elif last_topic_id != topic_id:
                    pop(out_f, last_topic_id, cur_doc_ids)
                    last_topic_id = topic_id
                    cur_doc_ids = set()

                ticker.tick()
                cur_doc_ids.add(doc_id)

            # The loop only flushes a group when the *next* query starts, so
            # the final group has to be written explicitly.
            if last_topic_id is not None:
                pop(out_f, last_topic_id, cur_doc_ids)
Ejemplo n.º 3
0
 def __init__(self, split, query_group, candidate_docs_d, out_dir):
     """Keep the given inputs and create the tokenizer and data reader.

     :param split: forwarded to ``MSMarcoDataReader``.
     :param query_group: stored as ``self.query_group``.
     :param candidate_docs_d: stored as ``self.candidate_docs_d``.
     :param out_dir: stored as ``self.out_dir``.
     """
     # Plain pass-through attributes.
     self.out_dir = out_dir
     self.candidate_docs_d = candidate_docs_d
     self.query_group = query_group

     # Helper objects; the tokenizer is still built before the reader.
     self.tokenizer = get_tokenizer()
     self.ms_reader = MSMarcoDataReader(split)
Ejemplo n.º 4
0
def collect_doc_per_query(split, target_qid):
    """Dump the content of every document tied to one query into a single file.

    Scans the top-100 file for ``split`` (rows are whitespace separated with
    six fields; the 1st is the query id, the 3rd the doc id) until the rows of
    ``target_qid`` are reached, collects the doc ids listed there, adds the ids
    from ``ms_reader.qrel[target_qid]`` and writes each document's content,
    one per line, to ``get_per_query_doc_path(target_qid)``.

    Rows are matched on the exact query id. A plain prefix test is only used
    as a cheap pre-filter, because on its own it would also accept a longer id
    that merely starts with ``target_qid``.

    If ``target_qid`` does not occur in the file, nothing is written.

    :param split: split name, forwarded to ``MSMarcoDataReader`` and
        ``open_top100``.
    :param target_qid: query id (string) whose documents should be collected.
    """
    ms_reader = MSMarcoDataReader(split)

    def pop(query_id, cur_doc_ids: Set):
        """Write the contents of the candidate and qrel docs of ``query_id``."""
        # union() leaves the caller's set untouched.
        all_doc_ids = cur_doc_ids.union(ms_reader.qrel[query_id])
        todo = [(doc_id, ms_reader.doc_offset[doc_id])
                for doc_id in all_doc_ids]
        # Fetch documents in ascending offset order
        # (presumably to keep reads sequential - TODO confirm).
        todo.sort(key=get_second)
        print("{} docs".format(len(all_doc_ids)))

        exist_or_mkdir(per_query_root)
        save_path = get_per_query_doc_path(query_id)
        with open(save_path, "w") as out_f:
            for doc_id, _ in todo:
                content: str = ms_reader.get_content(doc_id)
                out_f.write(content + "\n")

    total_line = 36701116
    found = False
    ticker = None
    cur_doc_ids = set()
    with open_top100(split) as top100f:
        for line_no, line in enumerate(top100f):
            # Fast path while skipping: avoid splitting lines that cannot match.
            if not found and not line.startswith(target_qid):
                continue

            [topic_id, _, doc_id, _, _, _] = line.split()
            if topic_id != target_qid:
                if found:
                    # First row of the next query: the target group is complete.
                    break
                # Prefix collision with a longer id; keep skipping.
                continue

            if not found:
                tprint("skip done")
                remain_lines = total_line - line_no
                ticker = TimeEstimator(remain_lines, "reading", 1000)
                found = True

            ticker.tick()
            cur_doc_ids.add(doc_id)

    if not found:
        tprint("query {} not found".format(target_qid))
        return

    # Written exactly once, whether the group ended at a new query or at EOF.
    pop(target_qid, cur_doc_ids)
Ejemplo n.º 5
0
    def __init__(self, split, query_group, candidate_docs_d, max_sent_length,
                 max_title_length, out_dir):
        """Keep the given inputs, build helpers and create the output sub-directories.

        :param split: forwarded to ``MSMarcoDataReader``.
        :param query_group: stored as ``self.query_group``.
        :param candidate_docs_d: stored as ``self.candidate_docs_d``.
        :param max_sent_length: stored as ``self.max_sent_length``.
        :param max_title_length: stored as ``self.max_title_length``.
        :param out_dir: directory under which the ``text``, ``bert_tokens`` and
            ``stemmed_tokens`` sub-directories are ensured via
            ``exist_or_mkdir``.
        """
        # Plain pass-through attributes.
        self.out_dir = out_dir
        self.query_group = query_group
        self.candidate_docs_d = candidate_docs_d
        self.max_sent_length = max_sent_length
        self.max_title_length = max_title_length

        # Names of the per-format sub-directories below out_dir.
        self.text_dir_name = 'text'
        self.bert_tokens_dir_name = 'bert_tokens'
        self.stemmed_tokens_dir_name = 'stemmed_tokens'

        # Helper objects, constructed in the same order as before.
        self.bert_tokenizer = get_tokenizer()
        self.stem_tokenizer = PCTokenizer()
        self.ms_reader = MSMarcoDataReader(split)

        sub_dir_names = (
            self.text_dir_name,
            self.bert_tokens_dir_name,
            self.stemmed_tokens_dir_name,
        )
        for sub_dir in sub_dir_names:
            exist_or_mkdir(os.path.join(self.out_dir, sub_dir))