Example #1
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> Tuple[AuxiliaryFile, ...]:
        # Note that imported files are wrapped with `AuxiliaryFile` directly.
        # Because these files are not created by `AuxiliaryFileManager` but
        # simply brought in from existing external files, they must not be
        # removed afterwards; the manager does not take ownership of them.
        files = []
        for path in self.paths:
            print(colorful.render(f'<r>[*]</r> import file from '
                                  f'<b>{path}</b>'))
            files.append(AuxiliaryFile(path))
        return tuple(files)
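
The ownership rule described in the comment is what decides which files get cleaned up at the end of a run: files the manager creates are temporary, while imported ones must be left alone. A minimal sketch of that idea, using simplified stand-in classes rather than the library's real implementation:

import os
import tempfile


class AuxiliaryFile:
    # Thin wrapper around a path; deleting the file is someone else's job.
    def __init__(self, name: str):
        self.name = name


class AuxiliaryFileManager:
    # Owns only the files it creates and removes exactly those on close.
    def __init__(self, workdir: str = '.'):
        self.workdir = workdir
        self._owned = []

    def create(self) -> AuxiliaryFile:
        fd, path = tempfile.mkstemp(dir=self.workdir)
        os.close(fd)
        af = AuxiliaryFile(path)
        self._owned.append(af)   # created here, so the manager owns it
        return af

    def close(self):
        for af in self._owned:   # imported files never end up in this list
            os.remove(af.name)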
Example #2
    def __init__(self, lang: str):
        self.lang = lang
        self._splitter = None

        # Prepare prerequisite resources for sentence tokenizers.
        if lang == 'en':
            import nltk

            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                print(
                    colorful.render('<r>[*]</r> prepare resources for '
                                    '<b>splitting sentences</b>'))

                nltk.download('punkt')
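
After the `punkt` resource has been downloaded, the splitter itself (assigned to `self._splitter` later, not shown here) can build on NLTK's sentence tokenizer. A small illustration of what the English path enables, assuming plain `sent_tokenize` is sufficient:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt', quiet=True)

text = 'Hello world. This is the second sentence! Is this the third?'
print(sent_tokenize(text))
# ['Hello world.', 'This is the second sentence!', 'Is this the third?']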
Example #3
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> AuxiliaryFile:
        merged = afm.create()

        print(colorful.render(f'<r>[*]</r> merge <m>{len(inputs)}</m> files '
                              f'into one'))
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(inputs, 'rb') as srcs:
            for src in srcs:
                for line in src:
                    # Append a newline if the line lacks one, so the last
                    # line of one file does not get merged with the next.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    dst.write(line)

        return merged
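
`AuxiliaryFile.opens` lets a single `with` statement manage an arbitrary number of files. If the helper were not provided, the same behaviour could be sketched with `contextlib.ExitStack` (an illustration of what the call does, not the library's actual code):

import contextlib


@contextlib.contextmanager
def opens(files, mode):
    # Open every file and close all of them when the block exits.
    with contextlib.ExitStack() as stack:
        yield [stack.enter_context(open(f.name, mode)) for f in files]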
Example #4
    def build(self, afm: AuxiliaryFileManager, *inputs: AuxiliaryFile
              ) -> Union[None, AuxiliaryFile, Tuple[AuxiliaryFile, ...]]:
        if len(inputs) != len(self.paths):
            raise ValueError('the number of predefined export paths does not '
                             'match the given auxiliary files.')

        # Save the auxiliary files to the exporting paths.
        for af, path in zip(inputs, self.paths):
            # Create the export directory if it does not exist.
            parent = os.path.dirname(path)
            if parent != '' and not os.path.exists(parent):
                os.makedirs(parent, exist_ok=True)

            print(colorful.render(f'<r>[*]</r> export the processed file to '
                                  f'<b>{path}</b>'))
            shutil.copyfile(af.name, path)

        return inputs
Example #5
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        # Calculate the optimum stride and bucket size.
        stride = max(1,
                     self._total_lines_in_file(corpus) // self.best_seek_cnt)
        buckets = [
            afm.create() for _ in range(min(stride * 2, self.max_buckets))
        ]

        # Collect the corresponding seeking positions and shuffle them.
        offsets = self._collect_seek_offsets(corpus, stride)
        random.shuffle(offsets)

        with corpus.open('rb') as src, \
                AuxiliaryFile.opens(buckets, 'wb') as dsts:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(
                offsets,
                desc=colorful.render('<r>[*]</r> shuffle raw corpus file'))

            for offset in tqdm_iter:
                src.seek(offset)

                for _ in range(stride):
                    line = src.readline()
                    if not line:
                        break

                    # Append a newline if the line lacks one, so it does not
                    # get merged with the next line written to the bucket.
                    line += b'\n' if not line.endswith(b'\n') else b''

                    # Write the line to a randomly chosen bucket so that
                    # lines from the same chunk are spread apart.
                    dsts[random.randint(0, len(dsts) - 1)].write(line)

        # After splitting to the buckets, merge them into a single file.
        merged = afm.create()
        with merged.open('wb') as dst, \
                AuxiliaryFile.opens(buckets, 'rb') as srcs:
            for src in srcs:
                shutil.copyfileobj(src, dst)

        return merged
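
The shuffle never loads the corpus into memory: it records the byte offset of every `stride`-th line, visits those offsets in random order, scatters each chunk of lines across temporary bucket files, and finally concatenates the buckets. A condensed, stdlib-only sketch of the same two-stage idea (the function and parameter names below are illustrative, not the library's):

import random
import shutil
import tempfile


def collect_seek_offsets(path: str, stride: int) -> list:
    # Record the byte offset of every `stride`-th line.
    offsets, index = [], 0
    with open(path, 'rb') as f:
        while True:
            pos = f.tell()
            if not f.readline():
                break
            if index % stride == 0:
                offsets.append(pos)
            index += 1
    return offsets


def shuffle_corpus(src_path: str, dst_path: str,
                   stride: int = 100, num_buckets: int = 8):
    offsets = collect_seek_offsets(src_path, stride)
    random.shuffle(offsets)

    buckets = [tempfile.TemporaryFile() for _ in range(num_buckets)]
    with open(src_path, 'rb') as src:
        for offset in offsets:
            src.seek(offset)
            for _ in range(stride):
                line = src.readline()
                if not line:
                    break
                if not line.endswith(b'\n'):
                    line += b'\n'
                # Scatter the chunk's lines across random buckets.
                buckets[random.randrange(num_buckets)].write(line)

    # Concatenate the buckets into the shuffled output file.
    with open(dst_path, 'wb') as dst:
        for bucket in buckets:
            bucket.seek(0)
            shutil.copyfileobj(bucket, dst)
            bucket.close()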
Example #6
def test_colorful_rendering():
    assert colorful.render('hello world') == f'hello world{R}'

    assert (colorful.render('hello <k>world!</k>') ==
            f'hello {R}{F.BLACK}world!{R}{R}')
    assert (colorful.render('<r>hello</r> world!') ==
            f'{R}{F.RED}hello{R} world!{R}')

    assert (colorful.render('<g>hello</g> <y>world!</y>') ==
            f'{R}{F.GREEN}hello{R} {R}{F.YELLOW}world!{R}{R}')
    assert (colorful.render('<b>hello</b> <m>world!</m>') ==
            f'{R}{F.BLUE}hello{R} {R}{F.MAGENTA}world!{R}{R}')

    assert (colorful.render('<c></c>hello<w> </w>world!') ==
            f'{R}{F.CYAN}{R}hello{R}{F.WHITE} {R}world!{R}')
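
The assertions compare against raw ANSI escape sequences; `R` and `F` are not defined in this snippet. Presumably they are the reset code and the foreground palette from `colorama`, roughly as below (an assumption, since the test's imports are not shown):

from colorama import Fore as F
from colorama import Style

# Reset sequence appended after every rendered span.
R = Style.RESET_ALL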
Example #7
    def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
              vocab: AuxiliaryFile) -> AuxiliaryFile:
        total_lines = self._total_lines_in_file(corpus)

        # Create WordPiece model and add special tokens. Note that `unk_token`
        # is also a special token.
        tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
        tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

        # Use BERT-specific normalizer, pre-tokenizer and decoder.
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = WordPieceDecoder(prefix='##')

        tokenized = afm.create()
        with corpus.open('r') as src, tokenized.open('w') as dst:
            # Create tqdm progress bar with colorful description.
            tqdm_iter = tqdm.tqdm(src,
                                  desc=colorful.render(
                                      '<r>[*]</r> tokenize sentences with '
                                      '<g>WordPiece</g> model'),
                                  total=total_lines)

            batch_lines = []
            for line in tqdm_iter:
                batch_lines.append(line)

                # Encode the grouped batch sentences and write the tokenized
                # sentences to the auxiliary output file.
                if len(batch_lines) > self.batch_size:
                    for t in tokenizer.encode_batch(batch_lines):
                        dst.write(' '.join(t.tokens) + '\n')
                    batch_lines.clear()

            # Encode the remainders and write to the output file.
            if batch_lines:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')

        return tokenized
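
The inner loop is a manual batching pattern: buffer incoming lines, encode them in groups of roughly `batch_size`, then flush whatever remains at the end. The same pattern can be factored into a small generator, sketched here independently of the `tokenizers` API:

from typing import Iterable, Iterator, List


def batched(lines: Iterable[str], batch_size: int) -> Iterator[List[str]]:
    # Yield fixed-size batches, followed by a final partial batch.
    batch = []
    for line in lines:
        batch.append(line)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


# With it, the two encode/write blocks above collapse into one loop:
#   for batch in batched(src, self.batch_size):
#       for t in tokenizer.encode_batch(batch):
#           dst.write(' '.join(t.tokens) + '\n')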
Example #8
    def build(self, afm: AuxiliaryFileManager,
              raw: AuxiliaryFile) -> AuxiliaryFile:
        parsed = afm.create()
        self.parser.prepare(raw)

        # Create processes for parsing texts in parallel and a process for
        # collecting the parsed texts and saving them to the auxiliary file.
        from_queue, to_queue = Queue(), Queue()
        parsers = [
            Process(target=self._parse_worker,
                    args=(from_queue, to_queue),
                    daemon=True) for _ in range(self.num_workers)
        ]
        collector = Process(target=self._collect_worker,
                            args=(parsed, to_queue),
                            daemon=True)

        # Start the processes.
        print(
            colorful.render(f'<r>[*]</r> parse raw-formatted corpus file '
                            f'with <g>{self.parser.__class__.__name__}</g>'))

        for p in parsers:
            p.start()
        collector.start()

        # Feed the extracted raw-formatted document to each parser process.
        for document in self.parser.extract(raw):
            from_queue.put(document)
        for _ in range(self.num_workers):
            from_queue.put(None)

        # Wait for the processes to terminate.
        for p in parsers:
            p.join()
        collector.join()

        return parsed
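
The two worker functions are not shown. From the way the queues are wired, `_parse_worker` presumably drains documents from `from_queue` until it receives the `None` sentinel and pushes parsed text onto `to_queue`, while `_collect_worker` appends everything it receives to the output file. A hedged sketch of that pattern; the collector's stop condition here (counting per-worker sentinels) and the plain file path argument are assumptions, since the real collector only receives the auxiliary file and the queue:

from multiprocessing import Queue


def _parse_worker(from_queue: Queue, to_queue: Queue):
    while True:
        document = from_queue.get()
        if document is None:
            # Sentinel from the main process: no more documents.
            break
        # Hypothetical parsing step; the real parser is configured elsewhere.
        for line in document.splitlines():
            to_queue.put(line.strip() + '\n')
    # Signal the collector that this worker has finished.
    to_queue.put(None)


def _collect_worker(path: str, to_queue: Queue, num_workers: int):
    finished = 0
    with open(path, 'w') as dst:
        while finished < num_workers:
            item = to_queue.get()
            if item is None:
                finished += 1
                continue
            dst.write(item)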
Example #9
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> Tuple[AuxiliaryFile, AuxiliaryFile]:
        train_dataset = afm.create()
        val_dataset = afm.create()

        total_lines = self._total_lines_in_file(corpus)
        print(
            colorful.render(f'<r>[*]</r> split validation corpus - '
                            f'<m>{math.ceil(total_lines * self.val_ratio)}'
                            f'</m> of <m>{total_lines}</m> lines'))

        with corpus.open('rb') as src, train_dataset.open('wb') as tdst, \
                val_dataset.open('wb') as vdst:
            # Write the first `val_ratio` fraction of the lines to the
            # validation dataset file.
            for i, line in enumerate(src):
                vdst.write(line)
                if i + 1 >= total_lines * self.val_ratio:
                    break

            # After writing the validation dataset, copy all remaining lines
            # to the train dataset.
            shutil.copyfileobj(src, tdst)

        return train_dataset, val_dataset
Example #10
def test_rendering_wrong_colors():
    with pytest.raises(ValueError):
        colorful.render('<a>this is a text with wrong color.</a>')