def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
    with raw.open('r') as fp:
        for line in fp:
            if not line.strip():
                continue
            yield line
def _collect_worker(self, parsed: AuxiliaryFile, to_queue: Queue):
    terminated = 0
    with parsed.open('w') as fp:
        while terminated < self.num_workers:
            text = to_queue.get()

            # Each worker puts `None` into the queue when it finishes, so
            # stop collecting once every worker has sent its sentinel.
            if text is None:
                terminated += 1
                continue

            text += '\n' if not text.endswith('\n') else ''
            fp.write(text)
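# For context, a minimal standalone sketch of the same sentinel protocol
# using plain `multiprocessing` primitives; the `produce` function, worker
# count and output path below are hypothetical, not part of the builder
# classes above.
from multiprocessing import Process, Queue

NUM_WORKERS = 4  # hypothetical worker count


def produce(worker_id: int, to_queue: Queue):
    # Each worker pushes its text lines and then a single `None` sentinel.
    for i in range(3):
        to_queue.put(f'worker {worker_id} line {i}')
    to_queue.put(None)


def collect(to_queue: Queue, path: str):
    terminated = 0
    with open(path, 'w') as fp:
        # Keep collecting until every worker has sent its sentinel.
        while terminated < NUM_WORKERS:
            text = to_queue.get()
            if text is None:
                terminated += 1
                continue
            fp.write(text if text.endswith('\n') else text + '\n')


if __name__ == '__main__':
    queue = Queue()
    workers = [Process(target=produce, args=(i, queue))
               for i in range(NUM_WORKERS)]
    for w in workers:
        w.start()
    collect(queue, 'collected.txt')
    for w in workers:
        w.join()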
def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
    with raw.open('r') as fp:
        for prefix, event, value in ijson.parse(fp):
            if not prefix.endswith('.text'):
                continue

            # Skip the redirection pages.
            if value.lower().strip().startswith('#redirect'):
                continue

            yield value
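# A small self-contained sketch of how `ijson.parse` streams
# `(prefix, event, value)` triples, which is why matching a prefix that
# ends with '.text' picks out the article bodies; the sample document
# below is made up for illustration.
import io

import ijson

sample = io.BytesIO(b'{"page": {"title": "Example", "text": "#REDIRECT Foo"}}')

for prefix, event, value in ijson.parse(sample):
    # String values of a `text` field arrive with a prefix ending in
    # '.text', e.g. ('page.text', 'string', '#REDIRECT Foo').
    if prefix.endswith('.text') and event == 'string':
        if value.lower().strip().startswith('#redirect'):
            continue  # drop redirect stubs, as the extractor above does
        print(value)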
def _create_subset_file(self, afm: AuxiliaryFileManager,
                        af: AuxiliaryFile) -> AuxiliaryFile:
    subset = afm.create()
    with af.open('rb') as src, subset.open('wb') as dst:
        while True:
            line = src.readline()
            if not line:
                break

            dst.write(line)

            # Stop copying once the total amount of copied data exceeds
            # `subset_size`.
            if src.tell() > self.subset_size:
                break
    return subset
def _collect_seek_offsets(self, af: AuxiliaryFile, stride: int) -> List[int]:
    offsets = []
    with af.open('rb') as fp:
        while True:
            current = fp.tell()

            # Read `stride` lines to move to the end of the chunk. If the
            # last line in the chunk is empty, the current chunk is the
            # last one in the file.
            lines = [fp.readline() for _ in range(stride)]
            if not lines[-1]:
                break

            # Record the starting position of the chunk.
            offsets.append(current)
    return offsets
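# To make the role of the offsets concrete, a sketch of the read side on a
# plain text file: every recorded position marks the start of a
# `stride`-line chunk that can be re-read independently of the rest of the
# file. The path and stride are placeholders, and `offsets` is assumed to
# come from a collection pass like the one above.
stride = 1000        # placeholder, must match the value used when collecting
path = 'corpus.txt'  # placeholder corpus file


def read_chunk(offset: int):
    # Jump straight to the start of a chunk and read its `stride` lines.
    with open(path, 'rb') as fp:
        fp.seek(offset)
        return [fp.readline() for _ in range(stride)]


# Visiting the offsets in shuffled order yields the chunks in random order:
# random.shuffle(offsets)
# chunks = [read_chunk(offset) for offset in offsets]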
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    # Calculate the optimum stride and bucket size.
    stride = max(1, self._total_lines_in_file(corpus) // self.best_seek_cnt)
    buckets = [
        afm.create() for _ in range(min(stride * 2, self.max_buckets))
    ]

    # Collect the corresponding seeking positions and shuffle them.
    offsets = self._collect_seek_offsets(corpus, stride)
    random.shuffle(offsets)

    with corpus.open('rb') as src, \
            AuxiliaryFile.opens(buckets, 'wb') as dsts:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            offsets,
            desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

        for offset in tqdm_iter:
            src.seek(offset)

            for _ in range(stride):
                line = src.readline()
                if not line:
                    break

                # Add a line-break character to the end of the text so it
                # does not get merged with the next line.
                line += b'\n' if not line.endswith(b'\n') else b''

                # Write the line to a randomly chosen bucket to ensure
                # randomness within each chunk.
                dsts[random.randint(0, len(dsts) - 1)].write(line)

    # After scattering the lines into the buckets, merge them into a
    # single file.
    merged = afm.create()
    with merged.open('wb') as dst, \
            AuxiliaryFile.opens(buckets, 'rb') as srcs:
        for src in srcs:
            shutil.copyfileobj(src, dst)

    return merged
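# The second source of randomness above is the scatter into buckets: lines
# from the same chunk usually land in different buckets, so concatenating
# the buckets interleaves them with lines from other chunks. A minimal
# in-memory sketch of that effect (bucket count and input lines are
# illustrative):
import random


def bucket_shuffle(lines, num_buckets=4):
    # Scatter each line into a random bucket, then concatenate the buckets.
    buckets = [[] for _ in range(num_buckets)]
    for line in lines:
        buckets[random.randint(0, num_buckets - 1)].append(line)
    return [line for bucket in buckets for line in bucket]


print(bucket_shuffle([f'line {i}' for i in range(10)]))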
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            src,
            desc=colorful.render('<r>[*]</r> tokenize sentences with '
                                 '<g>WordPiece</g> model'),
            total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainders and write to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized
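# Note that passing a vocabulary file path positionally to `WordPiece(...)`
# matches older releases of the `tokenizers` library; newer releases load
# the vocabulary through a factory method instead. A rough equivalent,
# assuming a plain WordPiece-format `vocab.txt` (file name and unknown
# token are placeholders):
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

tokenizer = Tokenizer(WordPiece.from_file('vocab.txt', unk_token='[UNK]'))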
def build(self, afm: AuxiliaryFileManager,
          corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
    train_dataset = afm.create()
    val_dataset = afm.create()

    total_lines = self._total_lines_in_file(corpus)
    print(colorful.render(
        f'<r>[*]</r> split validation corpus - '
        f'<m>{math.ceil(total_lines * self.val_ratio)}</m> '
        f'of <m>{total_lines}</m> lines'))

    with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
            val_dataset.open('wb') as vdst:
        # Write the first `val_ratio` lines to the validation dataset file.
        for i, line in enumerate(src):
            vdst.write(line)
            if i + 1 >= total_lines * self.val_ratio:
                break

        # After writing the validation dataset, copy all remaining lines
        # to the train dataset.
        shutil.copyfileobj(src, tdst)

    return train_dataset, val_dataset
def _total_lines_in_file(self, af: AuxiliaryFile) -> int:
    total_lines = 0
    with af.open('rb') as fp:
        for _ in fp:
            total_lines += 1
    return total_lines
def extract(self, raw: AuxiliaryFile) -> Iterable[str]:
    with raw.open('r') as fp:
        articles = fp.read().split('\n\n')
        yield from articles