def build_tokenized_corpus(input_root, tokenizer, output_dir, skip_dirs=False, n_processes=1, wiki_only=False):
    """Tokenize all source files under `input_root` into `output_dir`.

    Gathers the input files via `_gather_files`, tokenizes them (optionally
    across `n_processes` worker processes), and writes the union of all words
    seen to `output_dir/vocab.txt`, one word per line, sorted.

    Args:
        input_root: root directory of the raw corpus.
        tokenizer: tokenizer handed to the per-file workers.
        output_dir: destination directory; created if missing.
        skip_dirs: forwarded to `_gather_files` — presumably skips already
            processed directories; TODO confirm against its definition.
        n_processes: 1 runs in-process, otherwise a multiprocessing pool.
        wiki_only: forwarded to `_gather_files` to restrict the file set.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    all_files = _gather_files(input_root, output_dir, skip_dirs, wiki_only)

    if n_processes == 1:
        # In-process path: tokenize every file here and collect the vocab.
        vocab = build_tokenized_files(tqdm(all_files, ncols=80), input_root, output_dir, tokenizer)
    else:
        vocab = set()
        from multiprocessing import Pool
        with Pool(n_processes) as pool:
            # Split evenly across workers, then batch into work items of at
            # most 500 files so progress updates stay reasonably frequent.
            batches = split(all_files, n_processes)
            batches = flatten_iterable(group(b, 500) for b in batches)
            progress = tqdm(total=len(batches), ncols=80)
            work_items = [[b, input_root, output_dir, tokenizer] for b in batches]
            for partial_vocab in pool.imap_unordered(_build_tokenized_files_t, work_items):
                vocab.update(partial_vocab)
                progress.update(1)
            progress.close()

    # Persist the vocabulary, sorted for deterministic output.
    voc_file = join(output_dir, "vocab.txt")
    with open(voc_file, "w") as f:
        for word in sorted(vocab):
            f.write(word)
            f.write("\n")
def preprocess_par(questions: List, evidence, preprocessor, n_processes=2, chunk_size=200, name=None):
    """Run `preprocessor.preprocess` over `questions`, optionally in parallel.

    Questions are split into chunks of up to `chunk_size`, processed by up to
    `n_processes` workers, and the per-chunk outputs are merged with `+=`.
    `preprocessor.finalize_chunk` is invoked once per chunk of output.

    Args:
        questions: list of questions to preprocess.
        evidence: evidence object forwarded to the preprocessor.
        preprocessor: object exposing `preprocess(questions, evidence)` and
            `finalize_chunk(output)`.
        n_processes: number of worker processes; None means one per CPU.
        chunk_size: maximum questions per work chunk; must be >= 1.
        name: label shown on the progress bar.

    Returns:
        The merged preprocessor output (whatever type `preprocess` returns,
        combined via `+=`).

    Raises:
        ValueError: if `chunk_size` < 1 or `n_processes` is neither None
            nor a positive integer.
    """
    if chunk_size <= 0:
        # Message fixed: the check rejects 0, so the requirement is >= 1.
        raise ValueError("Chunk size must be >= 1, but got %s" % chunk_size)
    if n_processes is not None and n_processes <= 0:
        raise ValueError("n_processes must be >= 1 or None, but got %s" % n_processes)

    if n_processes is None:
        # None is documented as valid but previously crashed in min() below;
        # interpret it as "one worker per CPU".
        from multiprocessing import cpu_count
        n_processes = cpu_count()
    # Never spawn more workers than questions, but keep at least one so an
    # empty question list doesn't produce Pool(0).
    n_processes = max(1, min(len(questions), n_processes))

    if n_processes == 1:
        out = preprocessor.preprocess(tqdm(questions, desc=name, ncols=80), evidence)
        preprocessor.finalize_chunk(out)
        return out
    else:
        from multiprocessing import Pool
        chunks = split(questions, n_processes)
        chunks = flatten_iterable([group(c, chunk_size) for c in chunks])
        print("Processing %d chunks with %d processes" % (len(chunks), n_processes))
        pbar = tqdm(total=len(questions), desc=name, ncols=80)
        lock = Lock()

        def call_back(results):
            # Finalize each chunk as soon as it completes; the lock serializes
            # progress-bar updates across callback invocations.
            preprocessor.finalize_chunk(results[0])
            with lock:
                # FIXME Even with the lock, the progress bar still is jumping around
                pbar.update(results[1])

        with Pool(n_processes) as pool:
            results = [
                pool.apply_async(_preprocess_and_count, [c, evidence, preprocessor], callback=call_back)
                for c in chunks
            ]
            # .get() re-raises any worker exception; [0] is the chunk output.
            results = [r.get()[0] for r in results]
        pbar.close()

        # Merge chunk outputs in submission order.
        output = results[0]
        for r in results[1:]:
            output += r
        return output
def get_evidence_voc(corpus, n_processes=1):
    """Count word occurrences over every document in `corpus`.

    Args:
        corpus: object exposing `list_documents()` and
            `get_document(doc_id, flat=True)` (the latter presumably returns
            a flat token sequence — TODO confirm against the corpus class).
        n_processes: 1 counts in-process, otherwise uses a worker pool with
            `_extract_voc_tuple` over chunks of 10000 document ids.

    Returns:
        collections.Counter mapping word -> occurrence count.
    """
    doc_ids = corpus.list_documents()
    voc = Counter()
    if n_processes == 1:
        for doc in tqdm(doc_ids):
            # BUG FIX: this previously did `voc = corpus.get_document(...)`,
            # discarding all prior counts and returning the last document's
            # raw tokens instead of a Counter. Accumulate counts instead,
            # matching the `voc += v` semantics of the parallel branch.
            voc.update(corpus.get_document(doc, flat=True))
    else:
        from multiprocessing import Pool
        chunks = split(doc_ids, n_processes)
        chunks = flatten_iterable(group(x, 10000) for x in chunks)
        pbar = tqdm(total=len(chunks), ncols=80)
        with Pool(n_processes) as pool:
            for v in pool.imap_unordered(_extract_voc_tuple, [[corpus, c] for c in chunks]):
                voc += v
                pbar.update(1)
        pbar.close()
    return voc