Example No. 1
def build_tokenized_corpus(input_root,
                           tokenizer,
                           output_dir,
                           skip_dirs=False,
                           n_processes=1,
                           wiki_only=False):
    if not exists(output_dir):
        makedirs(output_dir)

    all_files = _gather_files(input_root, output_dir, skip_dirs, wiki_only)

    if n_processes == 1:
        voc = build_tokenized_files(tqdm(all_files, ncols=80), input_root,
                                    output_dir, tokenizer)
    else:
        voc = set()
        from multiprocessing import Pool
        with Pool(n_processes) as pool:
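            # Give each process an even share of the files, then break those
            # shares into batches (up to 500 files per task here) so results,
            # and progress-bar updates, arrive regularly instead of all at once.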
            chunks = split(all_files, n_processes)
            chunks = flatten_iterable(group(c, 500) for c in chunks)
            pbar = tqdm(total=len(chunks), ncols=80)
            for v in pool.imap_unordered(
                    _build_tokenized_files_t,
                [[c, input_root, output_dir, tokenizer] for c in chunks]):
                voc.update(v)
                pbar.update(1)
            pbar.close()

    voc_file = join(output_dir, "vocab.txt")
    with open(voc_file, "w") as f:
        for word in sorted(voc):
            f.write(word)
            f.write("\n")
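
The worker _build_tokenized_files_t used by imap_unordered is not shown above. A minimal sketch, assuming it does nothing more than unpack the task's argument list for build_tokenized_files:

def _build_tokenized_files_t(arg):
    # Pool.imap_unordered passes a single argument per task, so unpack the
    # [files, input_root, output_dir, tokenizer] list before delegating.
    files, input_root, output_dir, tokenizer = arg
    return build_tokenized_files(files, input_root, output_dir, tokenizer)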
Example No. 2
    def _build_dataset(cls, corpus_name, n_processes, train_file: str,
                       dev_file: str):
        hotpotqa = cls(corpus_name=corpus_name)

        with open(join(hotpotqa.dir, train_file), "rt") as f_train:
            _raw_train = json.load(f_train)

        with open(join(hotpotqa.dir, dev_file), "rt") as f_dev:
            _raw_dev = json.load(f_dev)

        dataset = {'train': _raw_train, 'dev': _raw_dev}
        for d in dataset:
            with Pool(n_processes) as pool, tqdm(total=len(dataset[d]),
                                                 desc=d,
                                                 ncols=70) as pbar:
                tqdm.write(bcolors.OKBLUE +
                           "[+] Preprocess for {} set".format(d) +
                           bcolors.ENDC)
                chunks = split(dataset[d], n_processes)

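                # starmap blocks until every chunk is processed, then the
                # per-chunk question lists are merged into the matching split.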
                for questions in pool.starmap(
                        hotpotqa._build_question,
                    [[c, hotpotqa.tokenizer, hotpotqa.detector]
                     for c in chunks]):
                    pbar.update(len(questions))
                    if d == 'train':
                        hotpotqa._train += questions
                    elif d == 'dev':
                        hotpotqa._dev += questions
        hotpotqa._train = FilteredData(hotpotqa._train, len(hotpotqa._train))
        hotpotqa._dev = FilteredData(hotpotqa._dev, len(hotpotqa._dev))

        return hotpotqa
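
The bcolors helper used for the colored status message is not defined in this snippet. A minimal sketch with only the two ANSI escape codes the example needs (the real class very likely defines more):

class bcolors:
    # ANSI escape sequences for colored terminal output; assumed minimal set.
    OKBLUE = '\033[94m'
    ENDC = '\033[0m'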
Example No. 3
def compute_answer_spans_par(questions: List[TriviaQaQuestion], corpus,
                             tokenizer, detector, n_processes: int):
    if n_processes == 1:
        word_tokenize = tokenizer.tokenize_paragraph_flat
        compute_answer_spans(questions, corpus, word_tokenize, detector)
        return questions
    from multiprocessing import Pool
    with Pool(n_processes) as p:
        chunks = split(questions, n_processes)
        questions = flatten_iterable(
            p.starmap(_compute_answer_spans_chunk,
                      [[c, corpus, tokenizer, detector] for c in chunks]))
        return questions
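
The per-chunk worker _compute_answer_spans_chunk is referenced but not shown. A plausible sketch, assuming it simply runs the single-process path on its slice of the questions and hands them back:

def _compute_answer_spans_chunk(questions, corpus, tokenizer, detector):
    # Runs inside a worker process: annotate one chunk of questions and
    # return it so the parent can flatten all chunks back into one list.
    word_tokenize = tokenizer.tokenize_paragraph_flat
    compute_answer_spans(questions, corpus, word_tokenize, detector)
    return questions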
Example No. 4
def preprocess_par(questions: List,
                   evidence,
                   preprocessor,
                   n_processes=2,
                   chunk_size=200,
                   name=None):
    if chunk_size <= 0:
        raise ValueError("Chunk size must be >= 1, but got %s" % chunk_size)
    if n_processes is not None and n_processes <= 0:
        raise ValueError("n_processes must be >= 1 or None, but got %s" %
                         n_processes)
    n_processes = min(len(questions), n_processes)

    if n_processes == 1:
        out = preprocessor.preprocess(tqdm(questions, desc=name, ncols=80),
                                      evidence)
        preprocessor.finalize_chunk(out)
        return out
    else:
        from multiprocessing import Pool
        chunks = split(questions, n_processes)
        chunks = flatten_iterable([group(c, chunk_size) for c in chunks])
        print("Processing %d chunks with %d processes" %
              (len(chunks), n_processes))
        pbar = tqdm(total=len(questions), desc=name, ncols=80)
        lock = Lock()

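        # Each worker returns (preprocessed_chunk, n_questions); the callback
        # runs in the parent process, so a lock guards the shared progress bar.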
        def call_back(results):
            preprocessor.finalize_chunk(results[0])
            with lock:  # FIXME: even with the lock, the progress bar still jumps around
                pbar.update(results[1])

        with Pool(n_processes) as pool:
            results = [
                pool.apply_async(_preprocess_and_count,
                                 [c, evidence, preprocessor],
                                 callback=call_back) for c in chunks
            ]
            results = [r.get()[0] for r in results]

        pbar.close()
        output = results[0]
        for r in results[1:]:
            output += r
        return output
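
The worker _preprocess_and_count is not shown; from the callback, which reads results[0] and results[1], it presumably returns the preprocessed chunk together with the number of questions it covered. A sketch under that assumption:

def _preprocess_and_count(questions, evidence, preprocessor):
    # Preprocess one chunk in a worker process and report the chunk size so
    # the parent's callback can advance the progress bar by that amount.
    out = preprocessor.preprocess(questions, evidence)
    return out, len(questions)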
Example No. 5
def get_evidence_voc(corpus, n_processes=1):
    doc_ids = corpus.list_documents()
    voc = Counter()

    if n_processes == 1:
        for doc in tqdm(doc_ids):
            # Count the tokens of each document into the shared Counter.
            voc.update(corpus.get_document(doc, flat=True))
    else:
        from multiprocessing import Pool
        chunks = split(doc_ids, n_processes)
        chunks = flatten_iterable(group(x, 10000) for x in chunks)
        pbar = tqdm(total=len(chunks), ncols=80)
        with Pool(n_processes) as pool:
            for v in pool.imap_unordered(_extract_voc_tuple,
                                         [[corpus, c] for c in chunks]):
                voc += v
                pbar.update(1)
        pbar.close()

    return voc
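
The worker _extract_voc_tuple is not shown. Since imap_unordered hands it a single [corpus, doc_ids] task and the parent merges results with voc += v, a plausible sketch is:

from collections import Counter

def _extract_voc_tuple(arg):
    # Unpack the [corpus, doc_ids] task and count every token of the assigned
    # documents into one Counter, which the parent adds to its running total.
    corpus, doc_ids = arg
    voc = Counter()
    for doc in doc_ids:
        voc.update(corpus.get_document(doc, flat=True))
    return voc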