def apply_bpe(bpe_path, path, output_path, buffer=1000):
    ''' Apply BPE encoding to a file by fanning chunks out to worker processes '''
    with tempfile.TemporaryDirectory() as tmpdir:
        # Split the input into chunks inside the temp dir, then encode each
        # chunk in parallel; the chunk outputs are rejoined at the end.
        worker_pool = Pool()
        vocab = set()
        basename = os.path.basename(output_path)
        chunk_outputs = []
        pending = []

        for chunk in sorted(file_utils.split(path, os.path.join(tmpdir, ''), buffer)):
            encoded_chunk = f'{chunk}{basename}'
            pending.append(
                worker_pool.apply_async(_apply_bpe, [bpe_path, chunk, encoded_chunk])
            )
            chunk_outputs.append(encoded_chunk)

        worker_pool.close()

        progress = tqdm(
            pending,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'BPE encoding {basename}',
            file=sys.stdout  # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            # Each worker returns the vocabulary seen in its chunk; union them.
            for async_result in progress:
                vocab.update(async_result.get())
            worker_pool.join()

        file_utils.join(chunk_outputs, output_path)
        return vocab
def tokenize(path, output_path, buffer=1000):
    ''' Tokenize a file by fanning chunks out to worker processes '''
    with tempfile.TemporaryDirectory() as tmpdir:
        # Split the input into chunks inside the temp dir, tokenize each chunk
        # in parallel, then rejoin the chunk outputs into output_path.
        worker_pool = Pool()
        word_counts = Counter()
        basename = os.path.basename(output_path)
        # The target language is taken from the output extension (e.g. ".en").
        language = os.path.splitext(basename)[1][1:]
        chunk_outputs = []
        pending = []

        for chunk in sorted(file_utils.split(path, os.path.join(tmpdir, ''), buffer)):
            tokenized_chunk = f'{chunk}{basename}'
            pending.append(
                worker_pool.apply_async(_tokenize, [language, chunk, tokenized_chunk])
            )
            chunk_outputs.append(tokenized_chunk)

        worker_pool.close()

        progress = tqdm(
            pending,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'Tokenizing {basename}',
            file=sys.stdout  # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            # Each worker returns a Counter of token frequencies; sum them.
            for async_result in progress:
                word_counts += async_result.get()
            worker_pool.join()

        file_utils.join(chunk_outputs, output_path)
        return word_counts
def parse(path, output_path, buffer=1000):
    ''' Parse a file by fanning chunks out to worker processes '''
    with tempfile.TemporaryDirectory() as tmpdir:
        # Split the input into chunks inside the temp dir, parse each chunk in
        # parallel, then rejoin the per-chunk parse outputs into output_path.
        worker_pool = Pool()
        basename = os.path.basename(path)
        parse_outputs = []
        pending = []

        for chunk in sorted(file_utils.split(path, os.path.join(tmpdir, ''), buffer)):
            # Tag each chunk with the source basename so _parse can see it.
            tagged_chunk = f'{chunk}{basename}'
            os.rename(chunk, tagged_chunk)
            pending.append(worker_pool.apply_async(_parse, [tagged_chunk]))
            # _parse writes its output next to the chunk, with the BPE marker
            # stripped and a '.parse' suffix appended.
            parse_outputs.append(tagged_chunk.replace('bpe.32000.', '') + '.parse')

        worker_pool.close()

        progress = tqdm(
            pending,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'Parsing {basename}',
            file=sys.stdout  # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            # _parse returns nothing useful; .get() just propagates errors.
            for async_result in progress:
                async_result.get()
            worker_pool.join()

        file_utils.join(parse_outputs, output_path)