Example #1
import os
import sys
import tempfile
from collections import Counter
from multiprocessing import Pool

from tqdm import tqdm

# file_utils, tqdm_wrap_stdout, and the _tokenize worker are project-internal
# helpers, assumed importable from the surrounding package.

def tokenize(path, output_path, buffer=1000):
    ''' Tokenize a file asynchronously '''
    with tempfile.TemporaryDirectory() as tmpdir:
        paths = []
        results = []
        pool = Pool()
        word_counts = Counter()
        basename = os.path.basename(output_path)
        language = os.path.splitext(basename)[1][1:]  # extension doubles as the language code
        file_chunks = file_utils.split(path, os.path.join(tmpdir, ''), buffer)
        for chunk in sorted(file_chunks):
            output_chunk = f'{chunk}{basename}'
            results.append(pool.apply_async(_tokenize, [language, chunk, output_chunk]))
            paths.append(output_chunk)
        pool.close()

        results = tqdm(
            results,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'Tokenizing {basename}',
            file=sys.stdout # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            for result in results:
                word_counts += result.get()

            pool.join()

        file_utils.join(paths, output_path)
        return word_counts
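The _tokenize worker invoked above is not part of this listing. A minimal sketch of what it might look like, assuming each chunk is plain text and substituting a naive whitespace tokenizer for the real, language-aware one:

from collections import Counter

def _tokenize(language, chunk, output_chunk):
    ''' Hypothetical worker: tokenize one chunk and return its token counts '''
    word_counts = Counter()
    with open(chunk, 'rt') as infile, open(output_chunk, 'wt') as outfile:
        for line in infile:
            # A real implementation would dispatch on `language`; this
            # stand-in simply splits on whitespace
            tokens = line.split()
            word_counts.update(tokens)
            outfile.write(' '.join(tokens) + '\n')
    return word_counts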
Example #2
# Imports as in Example #1; _get_parse_vocab is another project-internal worker.

def get_parse_vocab(path, segmenters, buffer=1000):
    ''' Extract the vocabulary of a file asynchronously '''
    with tempfile.TemporaryDirectory() as tmpdir:
        results = []
        pool = Pool()
        vocab = set()
        basename = os.path.basename(path)
        file_chunks = file_utils.split(path, os.path.join(tmpdir, ''), buffer)
        for chunk in sorted(file_chunks):
            results.append(
                pool.apply_async(_get_parse_vocab, [chunk, segmenters]))
        pool.close()

        results = tqdm(
            results,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'Extracting vocab: {basename}',
            file=sys.stdout  # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            for result in results:
                vocab.update(result.get())

            pool.join()

        return vocab
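_get_parse_vocab is likewise project-internal. A plausible sketch, assuming each entry in segmenters is a callable that splits a line into string segments:

def _get_parse_vocab(chunk, segmenters):
    ''' Hypothetical worker: collect the vocabulary of one chunk '''
    vocab = set()
    with open(chunk, 'rt') as infile:
        for line in infile:
            for segmenter in segmenters:
                # Assumed interface: segmenter(line) -> iterable of strings
                vocab.update(segmenter(line))
    return vocab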
Example #3
# Imports as in Example #1; _apply_bpe is another project-internal worker.

def apply_bpe(bpe_path, path, output_path, buffer=1000):
    ''' BPE-encode a file asynchronously '''
    with tempfile.TemporaryDirectory() as tmpdir:
        paths = []
        results = []
        pool = Pool()
        vocab = set()
        basename = os.path.basename(output_path)
        file_chunks = file_utils.split(path, os.path.join(tmpdir, ''), buffer)
        for chunk in sorted(file_chunks):
            output_chunk = f'{chunk}{basename}'
            results.append(pool.apply_async(_apply_bpe, [bpe_path, chunk, output_chunk]))
            paths.append(output_chunk)
        pool.close()

        results = tqdm(
            results,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'BPE encoding {basename}',
            file=sys.stdout # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            for result in results:
                vocab.update(result.get())

            pool.join()

        file_utils.join(paths, output_path)
        return vocab
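All four functions rely on file_utils.split and file_utils.join, whose behavior can be inferred from the call sites: split breaks a file into chunks of up to buffer lines under a given prefix and returns their paths, and join concatenates files in order. A sketch under those assumptions:

def split(path, prefix, buffer=1000):
    ''' Hypothetical: split `path` into chunks of up to `buffer` lines and
    return the chunk paths; zero-padding keeps sorted() in file order '''
    paths = []

    def flush(lines):
        paths.append(f'{prefix}{len(paths):08d}')
        with open(paths[-1], 'wt') as outfile:
            outfile.writelines(lines)

    with open(path, 'rt') as infile:
        lines = []
        for line in infile:
            lines.append(line)
            if len(lines) == buffer:
                flush(lines)
                lines = []
        if lines:
            flush(lines)

    return paths

def join(paths, output_path):
    ''' Hypothetical: concatenate the chunk files into `output_path` '''
    with open(output_path, 'wt') as outfile:
        for path in paths:
            with open(path, 'rt') as infile:
                outfile.write(infile.read())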
Example #4
# Imports as in Example #1; _parse is another project-internal worker.

def parse(path, output_path, buffer=1000):
    ''' Parse a file asynchronously '''
    with tempfile.TemporaryDirectory() as tmpdir:
        paths = []
        results = []
        pool = Pool()
        basename = os.path.basename(path)
        file_chunks = file_utils.split(path, os.path.join(tmpdir, ''), buffer)
        for chunk in sorted(file_chunks):
            # Rename each chunk so it carries the original basename
            renamed_chunk = f'{chunk}{basename}'
            os.rename(chunk, renamed_chunk)
            results.append(pool.apply_async(_parse, [renamed_chunk]))
            # _parse is assumed to write its output alongside each chunk as
            # '<input name minus the hard-coded bpe.32000. prefix>.parse'
            paths.append(renamed_chunk.replace('bpe.32000.', '') + '.parse')
        pool.close()

        results = tqdm(
            results,
            unit='chunk',
            dynamic_ncols=True,
            desc=f'Parsing {basename}',
            file=sys.stdout  # needed to make tqdm_wrap_stdout work
        )
        with tqdm_wrap_stdout():
            for result in results:
                result.get()

            pool.join()

        file_utils.join(paths, output_path)
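Taken together, the functions share one pattern: split the input into chunks, fan the chunks out with Pool.apply_async, drain results through tqdm, then stitch the outputs back together. A hypothetical end-to-end invocation (all file names are illustrative only; the bpe.32000. prefix matches the hard-coded string that parse strips):

if __name__ == '__main__':
    # Guard required so multiprocessing workers can safely re-import the module
    counts = tokenize('corpus.txt', 'corpus.tok.en')
    bpe_vocab = apply_bpe('codes.txt', 'corpus.tok.en', 'bpe.32000.corpus.en')
    parse('bpe.32000.corpus.en', 'corpus.en.parse')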