def test_opening_multiple_auxiliary_files_at_once():
    """Write to ten auxiliary files in one batch, then read them back."""
    with tempfile.TemporaryDirectory() as tmp_dir, \
            AuxiliaryFileManager(f'{tmp_dir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]

        # Every file receives a payload derived from its own position.
        with AuxiliaryFile.opens(files, 'w') as writers:
            for index, writer in enumerate(writers):
                writer.write(f'{index}th file')

        # Reopening the same files must give back exactly what was written.
        with AuxiliaryFile.opens(files, 'r') as readers:
            for index, reader in enumerate(readers):
                expected = f'{index}th file'
                assert reader.read() == expected
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile
          ) -> AuxiliaryFile:
    """Shuffle the lines of ``corpus`` through randomly chosen buckets.

    Lines are read in runs of ``stride`` from shuffled seek positions,
    each line is written to a randomly selected bucket file, and the
    buckets are finally concatenated into one new auxiliary file.

    Args:
        afm: manager used to create the bucket files and the output file.
        corpus: the file whose lines are shuffled.

    Returns:
        A newly created auxiliary file containing the bucket contents
        concatenated in bucket order.
    """
    # The stride is the number of lines consumed per seek position.
    total_lines = self._total_lines_in_file(corpus)
    stride = max(1, total_lines // self.best_seek_cnt)

    # Use twice as many buckets as the stride, capped by `max_buckets`.
    num_buckets = min(stride * 2, self.max_buckets)
    buckets = [afm.create() for _ in range(num_buckets)]

    # NOTE(review): `_collect_seek_offsets` is not visible here; it
    # presumably returns one file position per `stride` lines -- confirm.
    offsets = self._collect_seek_offsets(corpus, stride)
    random.shuffle(offsets)

    with corpus.open('rb') as reader, \
            AuxiliaryFile.opens(buckets, 'wb') as writers:
        # Progress bar with a colorized description.
        progress = tqdm.tqdm(
            offsets,
            desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

        for position in progress:
            reader.seek(position)

            for _ in range(stride):
                line = reader.readline()
                if not line:
                    # End of file reached before the run was complete.
                    break

                # Guarantee a trailing line break so that this line is
                # not glued to whatever is written after it.
                if not line.endswith(b'\n'):
                    line += b'\n'

                # Picking the bucket at random is what mixes the lines.
                target = random.randint(0, len(writers) - 1)
                writers[target].write(line)

    # Concatenate every bucket into a single output file.
    merged = afm.create()
    with merged.open('wb') as sink, \
            AuxiliaryFile.opens(buckets, 'rb') as readers:
        for bucket_reader in readers:
            shutil.copyfileobj(bucket_reader, sink)

    return merged
def test_merging_files_without_loss_of_contents():
    """Merging ten files must keep every line, in file order."""
    # Ten groups of one hundred identical tokens: '0' * 100, '1' * 100, ...
    expected = []
    for index in range(10):
        expected.extend([str(index)] * 100)

    with tempfile.TemporaryDirectory() as tmp_dir, \
            AuxiliaryFileManager(f'{tmp_dir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]

        # Each file holds its own index repeated on one hundred lines.
        with AuxiliaryFile.opens(files, 'w') as writers:
            for index, writer in enumerate(writers):
                writer.write(f'{index}\n' * 100)

        merged = MergeFiles().build(afm, *files)
        with merged.open('r') as reader:
            tokens = reader.read().split()
            assert tokens == expected
def test_if_builder_adds_break_lines_automatically():
    """Files lacking a final line break must still merge line by line."""
    # Ten groups of one hundred identical tokens: '0' * 100, '1' * 100, ...
    expected = []
    for index in range(10):
        expected.extend([str(index)] * 100)

    with tempfile.TemporaryDirectory() as tmp_dir, \
            AuxiliaryFileManager(f'{tmp_dir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]

        # Joining with '\n' leaves the last line of every file without a
        # trailing line break.
        with AuxiliaryFile.opens(files, 'w') as writers:
            for index, writer in enumerate(writers):
                payload = '\n'.join([str(index)] * 100)
                writer.write(payload)

        merged = MergeFiles().build(afm, *files)
        with merged.open('r') as reader:
            tokens = reader.read().split()
            assert tokens == expected
def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> AuxiliaryFile:
    """Concatenate all input files, line by line, into one new file.

    Args:
        afm: manager used to create the output file.
        *inputs: the files to merge, in the order they are appended.

    Returns:
        A newly created auxiliary file holding every line of every input,
        each terminated by a line break.
    """
    merged = afm.create()

    banner = (f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
              f'into one')
    print(colorful.render(banner))

    with merged.open('wb') as sink, \
            AuxiliaryFile.opens(inputs, 'rb') as readers:
        for reader in readers:
            for raw_line in reader:
                # A line without a trailing break would otherwise be
                # fused with the first line of the next file.
                if not raw_line.endswith(b'\n'):
                    raw_line += b'\n'
                sink.write(raw_line)

    return merged
def build(self, afm, *inputs):
    """Assert that every input file contains its expected text.

    The i-th input is compared against the i-th entry of ``self.texts``;
    the number of inputs must equal the number of expected texts.
    """
    expected_count = len(self.texts)
    assert expected_count == len(inputs)

    with AuxiliaryFile.opens(inputs, 'r') as readers:
        for expected, reader in zip(self.texts, readers):
            content = reader.read()
            assert content == expected
def build(self, afm, *inputs):
    """Create one file per entry of ``self.texts`` and fill it.

    The ``inputs`` are accepted but not used.

    Returns:
        A tuple of the created files, in the same order as ``self.texts``.
    """
    created = [afm.create() for _ in self.texts]

    # Walk texts and open handles in lockstep; there is one handle per text.
    with AuxiliaryFile.opens(created, 'w') as writers:
        for text, writer in zip(self.texts, writers):
            writer.write(text)

    return tuple(created)