def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> Tuple[AuxiliaryFile, ...]:
    # Note that imported files are wrapped with `AuxiliaryFile` directly.
    # Because the files are not created by `AuxiliaryFileManager` but simply
    # brought in from existing external files, they do not need to be
    # removed. Namely, the manager does not have ownership of them.
    files = []
    for path in self.paths:
        print(colorful.render(f'<r>[*]</r> import file from '
                              f'<b>{path}</b>'))
        files.append(AuxiliaryFile(path))
    return tuple(files)
def __init__(self, lang: str):
    self.lang = lang
    self._splitter = None

    # Prepare prerequisite resources for sentence tokenizers.
    if lang == 'en':
        import nltk

        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            print(colorful.render('<r>[*]</r> prepare resources for '
                                  '<b>splitting sentences</b>'))
            nltk.download('punkt')
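# The following is a minimal, self-contained sketch (an assumption, not the
# repository's own code) of the kind of sentence splitting the `lang == 'en'`
# branch prepares for: once the `punkt` resource has been downloaded, NLTK's
# `sent_tokenize` can break a paragraph into sentences. The function name is
# made up purely for illustration.
from nltk.tokenize import sent_tokenize

def split_english_sentences(text: str) -> list:
    # Relies on the `tokenizers/punkt` resource prepared in `__init__` above.
    return sent_tokenize(text)

# Example: split_english_sentences('Hello world. How are you?')
# -> ['Hello world.', 'How are you?']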
def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> AuxiliaryFile:
    merged = afm.create()

    print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                          f'into one'))
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(inputs, 'rb') as srcs:
        for src in srcs:
            for line in src:
                # Append a line-break character to the end of the text so
                # that it is not merged with the next line.
                line += b'\n' if not line.endswith(b'\n') else b''

                dst.write(line)
    return merged
def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
          ) -> Union[None, AuxiliaryFile, Tuple[AuxiliaryFile, ...]]:
    if len(inputs) != len(self.paths):
        raise ValueError('number of predefined exporting files does not '
                         'match the given auxiliary files.')

    # Save the auxiliary files to the exporting paths.
    for af, path in zip(inputs, self.paths):
        # Create the exporting directory if it does not exist.
        parent = os.path.dirname(path)
        if parent != '' and not os.path.exists(parent):
            os.makedirs(parent, exist_ok=True)

        print(colorful.render(f'<r>[*]</r> export the processed file to '
                              f'<b>{path}</b>'))
        shutil.copyfile(af.name, path)
    return inputs
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile
          ) -> AuxiliaryFile:
    # Calculate the optimum stride and bucket size.
    stride = max(1, self._total_lines_in_file(corpus) // self.best_seek_cnt)
    buckets = [afm.create()
               for _ in range(min(stride * 2, self.max_buckets))]

    # Collect the corresponding seeking positions and shuffle them.
    offsets = self._collect_seek_offsets(corpus, stride)
    random.shuffle(offsets)

    with corpus.open('rb') as src, \
            AuxiliaryFile.opens(buckets, 'wb') as dsts:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            offsets,
            desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

        for offset in tqdm_iter:
            src.seek(offset)

            for _ in range(stride):
                line = src.readline()
                if not line:
                    break

                # Append a line-break character to the end of the text so
                # that it is not merged with the next line.
                line += b'\n' if not line.endswith(b'\n') else b''

                # Write the line to a randomly chosen bucket to spread the
                # corpus across the buckets.
                dsts[random.randint(0, len(dsts) - 1)].write(line)

    # After splitting into the buckets, merge them into a single file.
    merged = afm.create()
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(buckets, 'rb') as srcs:
        for src in srcs:
            shutil.copyfileobj(src, dst)

    return merged
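# The two helpers used above are not shown. The following is a hedged sketch
# of what they plausibly do, based only on how `build` uses them: count the
# lines of a file, and record the byte offset of every `stride`-th line so the
# file can be re-read chunk by chunk in shuffled order. Names, signatures and
# the plain-path interface are assumptions made for illustration.
from typing import List

def total_lines_in_file(path: str) -> int:
    with open(path, 'rb') as fp:
        return sum(1 for _ in fp)

def collect_seek_offsets(path: str, stride: int) -> List[int]:
    offsets = []
    with open(path, 'rb') as fp:
        while True:
            offset = fp.tell()

            # Skip `stride` lines; an empty first read means end-of-file.
            lines = [fp.readline() for _ in range(stride)]
            if not lines[0]:
                break
            offsets.append(offset)
    return offsets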
def test_colorful_rendering():
    assert colorful.render('hello world') == f'hello world{R}'
    assert (colorful.render('hello <k>world!</k>')
            == f'hello {R}{F.BLACK}world!{R}{R}')
    assert (colorful.render('<r>hello</r> world!')
            == f'{R}{F.RED}hello{R} world!{R}')
    assert (colorful.render('<g>hello</g> <y>world!</y>')
            == f'{R}{F.GREEN}hello{R} {R}{F.YELLOW}world!{R}{R}')
    assert (colorful.render('<b>hello</b> <m>world!</m>')
            == f'{R}{F.BLUE}hello{R} {R}{F.MAGENTA}world!{R}{R}')
    assert (colorful.render('<c></c>hello<w> </w>world!')
            == f'{R}{F.CYAN}{R}hello{R}{F.WHITE} {R}world!{R}')
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            src,
            desc=colorful.render('<r>[*]</r> tokenize sentences with '
                                 '<g>WordPiece</g> model'),
            total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainder and write it to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized
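# A hedged, self-contained usage sketch of the `tokenizers` objects used above,
# with a toy in-memory vocabulary made up purely for illustration. Note that
# the code above passes a vocabulary *file path* to `WordPiece(...)`, which
# older `tokenizers` releases accepted; newer releases take a dict (as below)
# or `WordPiece.from_file(path)` instead.
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer

vocab = {'[UNK]': 0, 'hello': 1, 'world': 2, '##s': 3}
tokenizer = Tokenizer(WordPiece(vocab, unk_token='[UNK]'))
tokenizer.normalizer = BertNormalizer(strip_accents=False)
tokenizer.pre_tokenizer = BertPreTokenizer()

for encoding in tokenizer.encode_batch(['Hello worlds', 'foo bar']):
    # `encoding.tokens` is the token list that `build` joins with spaces.
    print(' '.join(encoding.tokens))
# Roughly: "hello world ##s" and "[UNK] [UNK]".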
def build(self, afm: AuxiliaryFileManager, raw: AuxiliaryFile
          ) -> AuxiliaryFile:
    parsed = afm.create()
    self.parser.prepare(raw)

    # Create processes for parsing texts in parallel and a process for
    # collecting the parsed texts and saving them to the auxiliary file.
    from_queue, to_queue = Queue(), Queue()
    parsers = [Process(target=self._parse_worker,
                       args=(from_queue, to_queue),
                       daemon=True)
               for _ in range(self.num_workers)]
    collector = Process(target=self._collect_worker,
                        args=(parsed, to_queue),
                        daemon=True)

    # Start the processes.
    print(colorful.render(f'<r>[*]</r> parse raw-formatted corpus file '
                          f'with <g>{self.parser.__class__.__name__}</g>'))
    for p in parsers:
        p.start()
    collector.start()

    # Feed the extracted raw-formatted documents to the parser processes and
    # enqueue one `None` sentinel per worker to signal the end of the input.
    for document in self.parser.extract(raw):
        from_queue.put(document)
    for _ in range(self.num_workers):
        from_queue.put(None)

    # Wait for the processes to terminate.
    for p in parsers:
        p.join()
    collector.join()

    return parsed
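# The worker functions are not shown above. Below is one plausible protocol,
# sketched as an assumption rather than the repository's actual code: each
# parser drains documents from `from_queue` until it sees the `None` sentinel
# enqueued by `build`, pushes parsed texts onto `to_queue`, and forwards the
# sentinel so the collector knows when every worker has finished. The parser
# is assumed to expose a `parse` method returning plain text.
def _parse_worker(self, from_queue: Queue, to_queue: Queue):
    while True:
        document = from_queue.get()
        if document is None:
            # Tell the collector that this worker is done.
            to_queue.put(None)
            break
        to_queue.put(self.parser.parse(document))

def _collect_worker(self, parsed: AuxiliaryFile, to_queue: Queue):
    finished = 0
    with parsed.open('w') as fp:
        # Stop once every parser worker has forwarded its sentinel.
        while finished < self.num_workers:
            text = to_queue.get()
            if text is None:
                finished += 1
                continue
            fp.write(text + '\n')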
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile
          ) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
    train_dataset = afm.create()
    val_dataset = afm.create()

    total_lines = self._total_lines_in_file(corpus)
    print(colorful.render(f'<r>[*]</r> split validation corpus - '
                          f'<m>{math.ceil(total_lines * self.val_ratio)}'
                          f'</m> of <m>{total_lines}</m> lines'))

    with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
            val_dataset.open('wb') as vdst:
        # Write the first `val_ratio` fraction of the lines to the
        # validation dataset file.
        for i, line in enumerate(src):
            vdst.write(line)
            if i + 1 >= total_lines * self.val_ratio:
                break

        # After writing the validation dataset, copy all remaining lines to
        # the train dataset.
        shutil.copyfileobj(src, tdst)

    return train_dataset, val_dataset
def test_rendering_wrong_colors():
    with pytest.raises(ValueError):
        colorful.render('<a>this is a text with wrong color.</a>')