Beispiel #1
0
def test_opening_multiple_auxiliary_files_at_once():
    """Check that ``AuxiliaryFile.opens`` opens several files together.

    Ten auxiliary files are created, each one receives a text derived from
    its position, and the same texts are expected when reading them back.
    """
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            created = [afm.create() for _ in range(10)]

            # Write a position-dependent text to every file.
            with AuxiliaryFile.opens(created, 'w') as writers:
                for i, writer in enumerate(writers):
                    writer.write(f'{i}th file')

            # Re-open all of them and verify each content.
            with AuxiliaryFile.opens(created, 'r') as readers:
                for i, reader in enumerate(readers):
                    expected = f'{i}th file'
                    assert reader.read() == expected
Beispiel #2
0
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        """Shuffle the lines of ``corpus`` through random bucket files.

        The corpus is visited at shuffled seek offsets; from each offset up
        to ``stride`` lines are read and every line is written to a randomly
        picked bucket. The buckets are then concatenated in order.

        Args:
            afm: manager used to create the bucket files and the result file.
            corpus: the file whose lines are shuffled.

        Returns:
            A newly created file holding the concatenated bucket contents.
        """
        # Derive the chunk length (stride) and the number of buckets.
        total_lines = self._total_lines_in_file(corpus)
        stride = max(1, total_lines // self.best_seek_cnt)
        bucket_count = min(stride * 2, self.max_buckets)
        buckets = [afm.create() for _ in range(bucket_count)]

        # Visit the seek positions in random order.
        offsets = self._collect_seek_offsets(corpus, stride)
        random.shuffle(offsets)

        with corpus.open('rb') as src:
            with AuxiliaryFile.opens(buckets, 'wb') as dsts:
                # Progress bar with a colorful description.
                description = colorful.render(
                    '<r>[*]</r> shuffle raw corpus file')
                progress = tqdm.tqdm(offsets, desc=description)

                for position in progress:
                    src.seek(position)

                    for _ in range(stride):
                        line = src.readline()
                        if not line:
                            # Reached the end of the corpus file.
                            break

                        # Terminate the line so it cannot fuse with the next
                        # line written to the same bucket.
                        if not line.endswith(b'\n'):
                            line += b'\n'

                        # Choosing the bucket at random adds another level of
                        # randomness on top of the shuffled offsets.
                        chosen = random.randint(0, len(dsts) - 1)
                        dsts[chosen].write(line)

        # Concatenate all buckets into one output file.
        merged = afm.create()
        with merged.open('wb') as out:
            with AuxiliaryFile.opens(buckets, 'rb') as parts:
                for part in parts:
                    shutil.copyfileobj(part, out)

        return merged
Beispiel #3
0
def test_merging_files_without_loss_of_contents():
    """Check that ``MergeFiles`` keeps every line of every input file.

    Each of ten files holds its own index repeated on 100 lines; the merged
    file must contain all those tokens grouped in file order.
    """
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            sources = [afm.create() for _ in range(10)]
            with AuxiliaryFile.opens(sources, 'w') as writers:
                for i, writer in enumerate(writers):
                    writer.write(f'{i}\n' * 100)

            # Build the expected token sequence: 100 copies of each index.
            expected = []
            for i in range(10):
                expected.extend([str(i)] * 100)

            merged = MergeFiles().build(afm, *sources)
            with merged.open('r') as reader:
                tokens = reader.read().split()
                assert tokens == expected
Beispiel #4
0
def test_if_builder_adds_break_lines_automatically():
    """Check that ``MergeFiles`` separates inputs lacking a final newline.

    Every input file is written without a trailing line break, so the merged
    result only splits into the expected tokens if the builder inserts one
    between consecutive files.
    """
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            sources = [afm.create() for _ in range(10)]
            with AuxiliaryFile.opens(sources, 'w') as writers:
                for i, writer in enumerate(writers):
                    # 100 lines joined by '\n' -- no newline after the last.
                    writer.write('\n'.join([str(i)] * 100))

            # Build the expected token sequence: 100 copies of each index.
            expected = []
            for i in range(10):
                expected.extend([str(i)] * 100)

            merged = MergeFiles().build(afm, *sources)
            with merged.open('r') as reader:
                tokens = reader.read().split()
                assert tokens == expected
Beispiel #5
0
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> AuxiliaryFile:
        """Concatenate all ``inputs`` line by line into one new file.

        Args:
            afm: manager used to create the output file.
            *inputs: files to merge, in the order given.

        Returns:
            The newly created file containing every input line, each one
            terminated with a line break.
        """
        merged = afm.create()

        message = colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                                  f'into one')
        print(message)

        with merged.open('wb') as out:
            with AuxiliaryFile.opens(inputs, 'rb') as readers:
                for reader in readers:
                    for line in reader:
                        if line.endswith(b'\n'):
                            out.write(line)
                        else:
                            # A line without a line break would otherwise be
                            # glued to the first line of the next file.
                            out.write(line + b'\n')

        return merged
Beispiel #6
0
    def build(self, afm, *inputs):
        """Assert that every input file contains its corresponding text.

        The number of inputs must equal ``len(self.texts)``, and the full
        content of each input must equal the text at the same position.
        """
        expected_count = len(self.texts)
        assert expected_count == len(inputs)

        with AuxiliaryFile.opens(inputs, 'r') as readers:
            for expected, reader in zip(self.texts, readers):
                content = reader.read()
                assert content == expected
Beispiel #7
0
 def build(self, afm, *inputs):
     """Create one file per entry of ``self.texts`` and fill it in.

     Returns:
         A tuple of the created files, in the same order as ``self.texts``.
     """
     outputs = []
     for _ in self.texts:
         outputs.append(afm.create())

     with AuxiliaryFile.opens(outputs, 'w') as writers:
         for position, writer in enumerate(writers):
             # Index lookup so the file at a position gets the text there.
             writer.write(self.texts[position])

     return tuple(outputs)