def run_tokenize_jobs_for_prediction_split(split):
    """Launch title/body tokenization jobs for the given prediction split.

    One job is scheduled per query group; each worker tokenizes the titles
    and bodies of that group's top-100 candidate documents.
    """
    groups = load_query_group(split)
    docs_per_query = top100_doc_ids(split)

    def make_worker(out_dir):
        # Factory handed to the job runner; one worker per job directory.
        return TokenizeDocTitleBodyWorker(split, groups, docs_per_query, out_dir)

    job_name = "MSMARCO_{}_title_body_tokens".format(split)
    JobRunnerS(job_man_dir, len(groups), job_name, make_worker).start()
# Example #2 (scraper artifact: original marker "Exemple #2" and a vote count)
    def __init__(self, split):
        """Load the per-split MSMARCO resources: query groups, raw queries,
        top-100 candidate documents, qrels, and a tokenizer.
        """
        query_group: List[List[QueryID]] = load_query_group(split)
        candidate_docs_d: Dict[QueryID, List[str]] = top100_doc_ids(split)
        qrel: SimpleQrel = load_msmarco_simple_qrels(split)

        self.split = split
        # NOTE(review): "queires" is a typo, but existing callers may rely on
        # it, so it is kept; the correctly spelled alias points at the same
        # dict so new code can use `self.queries`.
        self.queires = dict(load_queries(split))
        self.queries = self.queires
        self.query_group = query_group
        self.candidate_doc_d: Dict[QueryID, List[str]] = candidate_docs_d
        self.qrel = qrel
        # Fix: the original assigned self.tokenizer = get_tokenizer() twice;
        # the second call was redundant.
        self.tokenizer = get_tokenizer()
def run_tokenize_jobs_for_pred_split(split):
    """Kick off the multi-field tokenization jobs for a prediction split."""
    groups = load_query_group(split)
    docs = top100_doc_ids(split)
    # Both length limits are 320 tokens (64 * 5).
    sent_limit = 64 * 5
    title_limit = 64 * 5

    def build_worker(out_dir):
        return MultipleTokenizeWorker(split, groups, docs, sent_limit, title_limit, out_dir)

    job_name = "MSMARCO_{}_multiple_tokenize".format(split)
    runner = JobRunnerS(job_man_dir, len(groups), job_name, build_worker)
    runner.start()
# Example #4 (scraper artifact: original marker "Exemple #4" and a vote count)
    def __init__(self, split):
        """Like the parent resource, but caps each query's candidate list at
        10 documents: all positives are kept and randomly sampled negatives
        fill the remaining slots.
        """
        super(ProcessedResourcePredict10, self).__init__(split)

        candidate_docs_d: Dict[QueryID, List[str]] = top100_doc_ids(split)
        new_candidate_docs_d: Dict[QueryID, List[str]] = {}
        for qid, doc_ids in candidate_docs_d.items():
            # Fix: partition in a single pass so get_label() is evaluated
            # exactly once per document (the original filtered twice).
            pos_doc_ids: List[str] = []
            neg_doc_ids: List[str] = []
            for doc_id in doc_ids:
                if self.get_label(qid, doc_id):
                    pos_doc_ids.append(doc_id)
                else:
                    neg_doc_ids.append(doc_id)
            # Fix: max(0, ...) guards against a query with more than 10
            # positives, where the original's negative slice bound
            # (neg_doc_ids[:n_neg] with n_neg < 0) would select almost all
            # negatives instead of none.
            n_neg = max(0, 10 - len(pos_doc_ids))
            random.shuffle(neg_doc_ids)
            doc_ids_selected = pos_doc_ids + neg_doc_ids[:n_neg]
            assert len(doc_ids_selected) <= 10
            new_candidate_docs_d[qid] = doc_ids_selected
        self.candidate_doc_d = new_candidate_docs_d
# Example #5 (scraper artifact: original marker "Exemple #5" and a vote count)
 def __init__(self, split):
     """Initialize the parent resource and attach the full top-100
     candidate document list for every query in the split."""
     super(ProcessedResource100docMulti, self).__init__(split)
     self.candidate_doc_d: Dict[QueryID, List[str]] = top100_doc_ids(split)
from data_generator.job_runner import JobRunner
from dataset_specific.msmarco.common import load_query_group, top100_doc_ids
from dataset_specific.msmarco.tokenize_jobs.run_tokenize import DummyWorker
from epath import job_man_dir

if __name__ == "__main__":
    # Debug run: tokenize the dev split with the dummy worker.
    split = "dev"
    query_group = load_query_group(split)
    candidate_docs = top100_doc_ids(split)

    def make_dummy_worker(out_dir):
        return DummyWorker(split, query_group, candidate_docs, out_dir)

    # NOTE(review): JobRunner receives len(query_group) - 1 while the
    # JobRunnerS-based launchers pass len(query_group) — presumably the two
    # runner classes differ in expecting a max job id vs. a job count;
    # confirm before "fixing".
    n_jobs = len(query_group) - 1
    JobRunner(job_man_dir, n_jobs, "MSMARCO_{}_tokens_debug".format(split), make_dummy_worker).start()