Example #1
0
def gen_for_split(split):
    """Start the "random short" prediction-data job for the given split.

    Builds a ``ProcessedResourcePredict`` for ``split``, a
    ``LeadingNWithRandomShort`` encoder and a
    ``PredictionAllPassageGenerator`` on top of them, then hands a worker
    factory to ``JobRunner`` and starts it.

    Args:
        split: Split identifier; forwarded to ``ProcessedResourcePredict``
            and substituted into the job name.
    """
    seq_len = 512
    resource = ProcessedResourcePredict(split)
    # NOTE(review): the meaning of the second argument (20) is not visible
    # here - confirm against LeadingNWithRandomShort.
    encoder = LeadingNWithRandomShort(seq_len, 20)
    generator = PredictionAllPassageGenerator(resource, encoder, seq_len)

    def build_worker(out_dir):
        # Each worker shares the same query groups and generator and only
        # differs in where it writes.
        return MMDWorker(resource.query_group, generator, out_dir)

    # NOTE(review): presumably the highest job index, hence the "- 1";
    # confirm against JobRunner's signature.
    last_job = len(resource.query_group) - 1
    job_name = "MMD_pred_{}_random_short".format(split)

    job_runner = JobRunner(job_man_dir, last_job, job_name, build_worker)
    job_runner.start()
Example #2
0
def do_for_split(split):
    """Start the segment-resource writing job for the given split.

    The "train" split is read through ``ProcessedResource10doc``; every
    other split goes through ``ProcessedResourcePredict``. A
    ``SegResourceMaker`` is built over the chosen resource and passed to
    the ``SegResourceWriterWorker`` instances that ``JobRunner`` creates.

    Args:
        split: Split identifier; selects the resource class and is
            substituted into the job name.
    """
    resource = (
        ProcessedResource10doc("train")
        if split == "train"
        else ProcessedResourcePredict(split)
    )

    # NOTE(review): 512 looks like the max sequence length used elsewhere
    # in these scripts - confirm against SegResourceMaker.
    seg_maker = SegResourceMaker(resource, 512, max_seg_per_doc=40)

    def build_worker(out_dir):
        # Workers share the query groups and the maker; only out_dir varies.
        return SegResourceWriterWorker(resource.query_group, seg_maker, out_dir)

    # NOTE(review): presumably the highest job index, hence the "- 1";
    # confirm against JobRunner's signature.
    last_job = len(resource.query_group) - 1
    job_name = "seg_resource_{}".format(split)

    job_runner = JobRunner(job_man_dir, last_job, job_name, build_worker)
    job_runner.start()
Example #3
0
from tlm.data_gen.doc_encode_common import seg_selection_by_geo_sampling
from tlm.data_gen.msmarco_doc_gen.gen_worker import MMDWorker
from tlm.data_gen.msmarco_doc_gen.gen_worker_sent_level import PredictionGen
from tlm.data_gen.msmarco_doc_gen.processed_resource import ProcessedResource, ProcessedResource10doc, \
    ProcessedResourcePredict


def get_first_ten(items):
    """Yield at most the first ten elements of *items*, in order.

    Iteration stops as soon as ten elements have been produced, so the
    remainder of ``items`` is never consumed and endless iterables are
    safe to pass in.

    Args:
        items: Any iterable.

    Yields:
        The elements of ``items``, up to ten of them.
    """
    # Local import keeps this block self-contained; islice stops pulling
    # from the source once the limit is reached.
    from itertools import islice

    yield from islice(items, 10)


if __name__ == "__main__":
    # Script entry point: build the "test"-split resource, wrap it in a
    # PredictionGen driven by a FromTextEncoder, and launch the job.
    split = "test"
    max_seq_length = 512
    resource = ProcessedResourcePredict(split)

    # No segment-selection function is supplied (seg_selection_fn=None).
    encoder_options = {
        "random_short": True,
        "seg_selection_fn": None,
        "max_seg_per_doc": 20,
    }
    encoder = FromTextEncoder(max_seq_length, **encoder_options)
    generator = PredictionGen(resource, encoder, max_seq_length)

    def make_worker(out_dir):
        # Workers share the query groups and generator; only out_dir varies.
        return MMDWorker(resource.query_group, generator, out_dir)

    # NOTE(review): presumably the highest job index, hence the "- 1";
    # confirm against JobRunner's signature.
    last_job = len(resource.query_group) - 1
    job_name = "MMD_{}_sent_split".format(split)

    job_runner = JobRunner(job_man_dir, last_job, job_name, make_worker)
    job_runner.start()