def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> Union[None, AuxiliaryFile, Tuple[AuxiliaryFile, ...]]:
    with afm.auxiliary_scope():
        outputs = inputs
        for builder in self.builders:
            outputs = builder.build(afm, *outputs)

            # Lock the output auxiliary files to protect them from being
            # deleted before they are passed to the next builder.
            if isinstance(outputs, AuxiliaryFile):
                outputs.lock()
                outputs = (outputs,)
            elif isinstance(outputs, tuple):
                for af in outputs:
                    if not isinstance(af, AuxiliaryFile):
                        raise TypeError(f'element {type(af)} is not an '
                                        f'auxiliary file.')
                    af.lock()
            elif outputs is None:
                outputs = tuple()
            else:
                # If the output of the builder is not one of the allowed
                # types (an auxiliary file, a tuple of auxiliary files, or
                # None), then raise an exception.
                raise TypeError(f'output type {type(outputs)} from '
                                f'builder is not allowed.')

            # Delete all unnecessary files except inputs and locked files.
            afm.clear()

    return outputs

def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> Tuple[AuxiliaryFile, ...]:
    outputs = tuple()
    for builder in self.builders:
        outputs += builder.build(afm, *inputs)
        afm.synchronize(outputs)

        # Lock input and stacked output auxiliary files.
        for af in inputs + outputs:
            af.lock()
        afm.clear()

    return outputs

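For reference, the chaining that these composite `build` methods automate can be sketched by hand using only the builders and the `AuxiliaryFileManager` API shown in this section. The snippet below is an illustration, not part of the library: the workspace path and corpus contents are hypothetical, and imports are omitted because this section does not show the module paths.

# Hand-rolled sketch of chaining builders (illustration only; the workspace
# path and corpus contents are made up for this example).
with AuxiliaryFileManager('/tmp/workspace') as afm:
    corpus = afm.create()
    with corpus.open('w') as fp:
        fp.write('\n'.join(str(i) for i in range(1000)) + '\n')

    # Each builder consumes and produces auxiliary files, so the output of
    # one `build` call can be passed straight into the next.
    shuffled = (ShuffleLines(best_seek_cnt=100, max_buckets=512)
                .build(afm, corpus))
    train, val = SplitValidation(val_ratio=0.1).build(afm, shuffled)

    with val.open('r') as fp:
        assert len(fp.readlines()) == 100
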
def test_shuffling_preserves_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('\n'.join([str(i) for i in range(1000)]) + '\n')

        # In this case, the size of each chunk is 1, which implies complete
        # random shuffling.
        with (ShuffleLines(best_seek_cnt=1000, max_buckets=512)
                .build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        # If `best_seek_cnt` is less than the total number of text lines,
        # then the shuffling is approximated by using chunks and their
        # buckets.
        with (ShuffleLines(best_seek_cnt=100, max_buckets=512)
                .build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        with (ShuffleLines(best_seek_cnt=10, max_buckets=512)
                .build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        # However, if `max_buckets` is less than the optimum bucket size
        # (twice the optimum stride size), then only `max_buckets` buckets
        # are used. Note that this reduces randomness.
        with (ShuffleLines(best_seek_cnt=10, max_buckets=64)
                .build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

def test_builder_splits_corpus_without_loss_of_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('\n'.join(str(i) for i in range(1000)))

        # Test the splitting builder with various ratios.
        tfile, vfile = SplitValidation(val_ratio=0.1).build(afm, corpus)
        with tfile.open('r') as tfp, vfile.open('r') as vfp:
            assert ([int(s.strip()) for s in vfp.readlines()]
                    == list(range(100)))
            assert ([int(s.strip()) for s in tfp.readlines()]
                    == list(range(100, 1000)))

        tfile, vfile = SplitValidation(val_ratio=0.27).build(afm, corpus)
        with tfile.open('r') as tfp, vfile.open('r') as vfp:
            assert ([int(s.strip()) for s in vfp.readlines()]
                    == list(range(270)))
            assert ([int(s.strip()) for s in tfp.readlines()]
                    == list(range(270, 1000)))

        tfile, vfile = SplitValidation(val_ratio=0.1387).build(afm, corpus)
        with tfile.open('r') as tfp, vfile.open('r') as vfp:
            assert ([int(s.strip()) for s in vfp.readlines()]
                    == list(range(139)))
            assert ([int(s.strip()) for s in tfp.readlines()]
                    == list(range(139, 1000)))

def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile
          ) -> AuxiliaryFile:
    subset = self._create_subset_file(afm, corpus)

    # Create WordPiece model with a normalizer and pre-tokenizer. Note that
    # BERT-specific normalizer and pre-tokenizer are used in this model.
    tokenizer = Tokenizer(WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Train tokenizer model with subset of corpus.
    trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=self.limit_alphabet,
                               special_tokens=([self.unk_token]
                                               + self.special_tokens),
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset.name])

    # Save trained vocabulary to an auxiliary output file.
    vocab = afm.create()
    tokenizer.model.save(os.path.dirname(vocab.name))
    os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
              vocab.name)

    return vocab

def run(self, parent: str):
    with AuxiliaryFileManager(parent) as afm:
        self.build(afm)

        # After running the build pipeline, delete all created dummy files
        # even though the remainders would be removed in `__exit__` of the
        # manager.
        afm.clear()

def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile
          ) -> AuxiliaryFile:
    # Calculate the optimum stride and bucket size.
    stride = max(1,
                 self._total_lines_in_file(corpus) // self.best_seek_cnt)
    buckets = [afm.create()
               for _ in range(min(stride * 2, self.max_buckets))]

    # Collect the corresponding seeking positions and shuffle them.
    offsets = self._collect_seek_offsets(corpus, stride)
    random.shuffle(offsets)

    with corpus.open('rb') as src, \
            AuxiliaryFile.opens(buckets, 'wb') as dsts:
        # Create a tqdm progress bar with a colorful description.
        tqdm_iter = tqdm.tqdm(
            offsets,
            desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

        for offset in tqdm_iter:
            src.seek(offset)

            for _ in range(stride):
                line = src.readline()
                if not line:
                    break

                # Add a break-line character to the end of the text to
                # prevent it from being merged with the next line.
                line += b'\n' if not line.endswith(b'\n') else b''

                # Write the decorated text line to a random bucket to
                # ensure randomness.
                dsts[random.randint(0, len(dsts) - 1)].write(line)

    # After splitting to the buckets, merge them into a single file.
    merged = afm.create()
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(buckets, 'rb') as srcs:
        for src in srcs:
            shutil.copyfileobj(src, dst)

    return merged

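A quick check of the stride and bucket arithmetic above, worked by hand for a hypothetical 1,000-line corpus (the numbers are chosen for illustration and mirror the shuffling tests below):

# Stride/bucket arithmetic from the builder above (hypothetical corpus size).
total_lines = 1000
stride = max(1, total_lines // 100)        # best_seek_cnt=100 -> stride 10
num_buckets = min(stride * 2, 512)         # max_buckets=512   -> 20 buckets

# A tighter `max_buckets` caps the bucket count, which reduces randomness.
stride = max(1, total_lines // 10)         # best_seek_cnt=10  -> stride 100
num_buckets = min(stride * 2, 64)          # max_buckets=64    -> 64 buckets
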
def test_builder_collects_seeking_positions_correctly():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        # Create an auxiliary file with 1000 dummy lines.
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 1000)

        builder = ShuffleLines()
        for s in range(1, 200):
            assert len(builder._collect_seek_offsets(corpus, s)) == 1000 // s

def test_opening_multiple_auxiliary_files_at_once():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]

        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(f'{i}th file')

        with AuxiliaryFile.opens(files, 'r') as fps:
            for i, fp in enumerate(fps):
                assert fp.read() == f'{i}th file'

def test_if_builder_adds_break_lines_automatically():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write('\n'.join([str(i) for _ in range(100)]))

        with MergeFiles().build(afm, *files).open('r') as fp:
            assert (fp.read().split()
                    == [str(i) for i in range(10) for _ in range(100)])

def test_merging_files_without_loss_of_contents():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        files = [afm.create() for _ in range(10)]
        with AuxiliaryFile.opens(files, 'w') as fps:
            for i, fp in enumerate(fps):
                fp.write(f'{i}\n' * 100)

        with MergeFiles().build(afm, *files).open('r') as fp:
            assert (fp.read().split()
                    == [str(i) for i in range(10) for _ in range(100)])

def test_afm_creates_files_correctly():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            tfile = afm.create()

            with tfile.open('w') as fp:
                fp.write('hello world!')
            with tfile.open('r') as fp:
                assert fp.read() == 'hello world!'

            assert os.path.exists(tfile.name)
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 1

        assert not os.path.exists(f'{tdir}/workspace')

def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile
          ) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
    train_dataset = afm.create()
    val_dataset = afm.create()

    total_lines = self._total_lines_in_file(corpus)
    print(colorful.render(
        f'<r>[*]</r> split validation corpus - '
        f'<m>{math.ceil(total_lines * self.val_ratio)}'
        f'</m> of <m>{total_lines}</m> lines'))

    with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
            val_dataset.open('wb') as vdst:
        # Write the first `val_ratio` of the lines to the validation
        # dataset file.
        for i, line in enumerate(src):
            vdst.write(line)
            if i + 1 >= total_lines * self.val_ratio:
                break

        # After writing the validation dataset, copy all remaining lines
        # to the train dataset.
        shutil.copyfileobj(src, tdst)

    return train_dataset, val_dataset

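The split sizes follow directly from `math.ceil(total_lines * self.val_ratio)`; the short check below uses the same numbers as the splitting test in this section:

# Worked example of the split arithmetic (same figures as the test above).
import math

total_lines, val_ratio = 1000, 0.1387
val_lines = math.ceil(total_lines * val_ratio)     # -> 139 validation lines
train_lines = total_lines - val_lines              # -> 861 training lines
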
def _create_subset_file(self, afm: AuxiliaryFileManager, af: AuxiliaryFile
                        ) -> AuxiliaryFile:
    subset = afm.create()
    with af.open('rb') as src, subset.open('wb') as dst:
        while True:
            line = src.readline()
            if not line:
                break

            dst.write(line)

            # If the total amount of copied data is more than `subset_size`
            # then stop copying data to the subset file.
            if src.tell() > self.subset_size:
                break
    return subset

def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> AuxiliaryFile:
    merged = afm.create()

    print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                          f'into one'))
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(inputs, 'rb') as srcs:
        for src in srcs:
            for line in src:
                # Add a break-line character to the end of the text to
                # prevent it from being merged with the next line.
                line += b'\n' if not line.endswith(b'\n') else b''
                dst.write(line)
    return merged

def test_shuffling_without_break_line_in_last():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            # Note that we do not add a break-line character to the end of
            # the content.
            fp.write('\n'.join([str(i) for i in range(1000)]))

        with (ShuffleLines(best_seek_cnt=1000, max_buckets=512)
                .build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

        with (ShuffleLines(best_seek_cnt=10, max_buckets=64)
                .build(afm, corpus).open('r')) as fp:
            assert {int(i) for i in fp.read().split()} == set(range(1000))

def test_subword_tokenization():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write(_dummy_corpus_content)

        # Train WordPiece vocabulary and tokenize sentences.
        vocab = (TrainTokenizer(vocab_size=128, limit_alphabet=64)
                 .build(afm, corpus))
        tokenized = (TokenizeSentences(unk_token='[UNK]')
                     .build(afm, corpus, vocab))

        # Test if the tokenization is correctly applied to the corpus. Note
        # that the tokenizer model will normalize the sentences.
        with tokenized.open('r') as fp:
            assert (fp.read().strip().replace('##', '').replace(' ', '')
                    == _dummy_corpus_content.lower().replace(' ', ''))

def test_subset_file_creation():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 100)

        # Each line 'hello world!\n' is 13 bytes, so copying stops on the
        # first line whose end offset exceeds `subset_size` (e.g. the 79th
        # line for 1024 bytes, since 13 * 79 = 1027 > 1024).
        with (TrainTokenizer(subset_size=1024)
                ._create_subset_file(afm, corpus).open('r')) as fp:
            assert len(fp.readlines()) == 79

        with (TrainTokenizer(subset_size=128)
                ._create_subset_file(afm, corpus).open('r')) as fp:
            assert len(fp.readlines()) == 10

        with (TrainTokenizer(subset_size=2000)
                ._create_subset_file(afm, corpus).open('r')) as fp:
            assert len(fp.readlines()) == 100

def test_counting_lines_in_file():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        builder = SplitValidation()
        corpus = afm.create()

        # Test for the case of 10 lines.
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 10)
        assert builder._total_lines_in_file(corpus) == 10

        # Test for the case of 100 lines.
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 100)
        assert builder._total_lines_in_file(corpus) == 100

        # Test for the case of 1548 lines.
        with corpus.open('w') as fp:
            fp.write('hello world!\n' * 1548)
        assert builder._total_lines_in_file(corpus) == 1548

def test_training_wordpiece_tokenizer():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write(_dummy_corpus_content)

        # Train WordPiece tokenizer and get the vocabulary file.
        vocab = (TrainTokenizer(vocab_size=128,
                                limit_alphabet=64,
                                unk_token='[UNK]')
                 .build(afm, corpus))

        # Read subwords from the vocabulary file.
        with vocab.open('r') as fp:
            words = fp.readlines()

        # Check that the total number of words equals the vocabulary size
        # and that the vocabulary contains the unknown token.
        assert len(words) == 128
        assert words[0].strip() == '[UNK]'

def test_afm_ignores_locked_files_in_clearing():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            for _ in range(10):
                afm.create()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 10

            for _ in range(5):
                afm.create().lock()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 15

            # All auxiliary files except the locked ones are removed. Note
            # that the remainders are unlocked at this point.
            afm.clear()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 5

            # As mentioned above, all unlocked files are removed at this
            # point.
            afm.clear()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 0

        assert not os.path.exists(f'{tdir}/workspace')

def run(self, parent: str): """Execute the builder. All builders can be executed directly and independently, without any input auxiliary files. We recommend to execute builders with miscellaneous ones (e.g. :class:`ImportFrom <langumo.building.miscellaneous.ImportFrom>` and :class:`ExportTo <langumo.building.miscellaneous.ExportTo>`) to pass build inputs correctly. Args: parent: parent workspace directory which will be used for containing all auxiliary files. """ with AuxiliaryFileManager(parent) as afm: self.build(afm) # After running the build pipeline, delete all created dummy files # even though the remainders would be removed in `__exit__` of the # manager. afm.clear()
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            src,
            desc=colorful.render('<r>[*]</r> tokenize sentences with '
                                 '<g>WordPiece</g> model'),
            total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainders and write them to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized

def test_afm_handles_files_separately_by_level():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace') as afm:
            # Create auxiliary files in level 0.
            for _ in range(10):
                afm.create()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 10

            with afm.auxiliary_scope():
                # Create auxiliary files in level 1.
                for _ in range(5):
                    afm.create()
                assert (_number_of_files_in_directory(f'{tdir}/workspace')
                        == 15)

                # Remove the auxiliary files with level 1.
                afm.clear()
                assert (_number_of_files_in_directory(f'{tdir}/workspace')
                        == 10)

                # Create auxiliary files and lock some of them.
                for _ in range(2):
                    afm.create()
                for _ in range(3):
                    afm.create().lock()
                assert (_number_of_files_in_directory(f'{tdir}/workspace')
                        == 15)

                with afm.auxiliary_scope():
                    # Create auxiliary files in level 2.
                    for _ in range(5):
                        afm.create()
                    assert (_number_of_files_in_directory(
                                f'{tdir}/workspace') == 20)

                # Not only non-locked files but also sub-level auxiliary
                # files would be removed.
                afm.clear()
                assert (_number_of_files_in_directory(f'{tdir}/workspace')
                        == 13)

            afm.clear()
            assert _number_of_files_in_directory(f'{tdir}/workspace') == 0

def build(self, afm: AuxiliaryFileManager, raw: AuxiliaryFile
          ) -> AuxiliaryFile:
    parsed = afm.create()
    self.parser.prepare(raw)

    # Create processes for parsing texts in parallel and a process for
    # collecting the parsed texts and saving them to the auxiliary file.
    from_queue, to_queue = Queue(), Queue()
    parsers = [Process(target=self._parse_worker,
                       args=(from_queue, to_queue),
                       daemon=True)
               for _ in range(self.num_workers)]
    collector = Process(target=self._collect_worker,
                        args=(parsed, to_queue),
                        daemon=True)

    # Start the processes.
    print(colorful.render(f'<r>[*]</r> parse raw-formatted corpus file '
                          f'with <g>{self.parser.__class__.__name__}</g>'))
    for p in parsers:
        p.start()
    collector.start()

    # Feed the extracted raw-formatted documents to each parser process.
    for document in self.parser.extract(raw):
        from_queue.put(document)
    for _ in range(self.num_workers):
        from_queue.put(None)

    # Wait for the processes to terminate.
    for p in parsers:
        p.join()
    collector.join()

    return parsed

def test_formatted_file_parsing():
    with tempfile.TemporaryDirectory() as tdir, \
            AuxiliaryFileManager(f'{tdir}/workspace') as afm:
        corpus = afm.create()
        with corpus.open('w') as fp:
            fp.write(_dummy_corpus_content)

        with (ParseRawFile(simple_parser(),
                           lang='en',
                           min_len=16,
                           max_len=512,
                           newline='[NEWLINE]',
                           num_workers=2)
                .build(afm, corpus).open('r')) as fp:
            assert ({s.strip() for s in fp} == {
                'Wikipedia is a multilingual online encyclopedia created '
                'and maintained as an open collaboration project by a '
                'community of volunteer editors using a wiki-based '
                'editing system. It is the largest and most popular '
                'general reference work on the World Wide Web. It is '
                'also one of the 15 most popular websites ranked by '
                'Alexa, as of August 2020. It features exclusively free '
                'content and no commercial ads. It is hosted by the '
                'Wikimedia Foundation, a non-profit organization funded '
                'primarily through donations. [NEWLINE] Wikipedia was '
                'launched on January 15, 2001, and was created by Jimmy '
                'Wales and Larry Sanger.',

                'Sanger coined its name as a portmanteau of the terms '
                '"wiki" and "encyclopedia". Initially an '
                'English-language encyclopedia, versions of Wikipedia in '
                'other languages were quickly developed. With 6.1 '
                'million articles, the English Wikipedia is the largest '
                'of the more than 300 Wikipedia encyclopedias. Overall, '
                'Wikipedia comprises more than 54 million articles '
                'attracting 1.5 billion unique visitors per month.',

                'In 2005, Nature published a peer review comparing 42 '
                'hard science articles from Encyclopædia Britannica and '
                'Wikipedia and found that Wikipedia\'s level of accuracy '
                'approached that of Britannica, although critics '
                'suggested that it might not have fared so well in a '
                'similar study of a random sampling of all articles or '
                'one focused on social science or contentious social '
                'issues. The following year, Time stated that the '
                'open-door policy of allowing anyone to edit had made '
                'Wikipedia the biggest and possibly the best '
                'encyclopedia in the world, and was a testament to the '
                'vision of Jimmy Wales.',

                'Wikipedia has been criticized for exhibiting systemic '
                'bias and for being subject to manipulation and spin in '
                'controversial topics; Edwin Black has criticized '
                'Wikipedia for presenting a mixture of "truth, half '
                'truth, and some falsehoods". Wikipedia has also been '
                'criticized for gender bias, particularly on its '
                'English-language version, where the dominant majority '
                'of editors are male. However, edit-a-thons have been '
                'held to encourage female editors and increase the '
                'coverage of women\'s topics. Facebook announced that by '
                '2017 it would help readers detect fake news by '
                'suggesting links to related Wikipedia articles.',

                'YouTube announced a similar plan in 2018.'})

def test_afm_context_manager():
    with tempfile.TemporaryDirectory() as tdir:
        with AuxiliaryFileManager(f'{tdir}/workspace'):
            assert os.path.exists(f'{tdir}/workspace')
        assert not os.path.exists(f'{tdir}/workspace')