Code example #1

A unit test of AlbertTokenizer's tokenize / id-conversion round trip against a sample SentencePiece vocabulary.

    def test_full_tokenizer(self):
        # keep_accents=True keeps the accent in 'falsé' through unicode
        # normalization, so SentencePiece sees the accented character.
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [48, 25, 21, 1289])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [
            "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this",
            "▁is", "▁fal", "s", "é", "."
        ])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        # 'é' is out of vocabulary (id 1 is <unk>), so the round trip
        # returns '<unk>' in its place rather than the original character.
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and",
                "▁this", "▁is", "▁fal", "s", "<unk>", "."
            ],
        )
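
The same round trip can be reproduced outside the test harness. Below is a minimal sketch, assuming the Hugging Face transformers package and the public albert-base-v2 checkpoint (both are assumptions; the ids come from that checkpoint's vocabulary, not from the SAMPLE_VOCAB fixture above).

    # Minimal sketch, not part of the original test: assumes
    # `pip install transformers` and the albert-base-v2 checkpoint.
    from transformers import AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2", keep_accents=True)

    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    back = tokenizer.convert_ids_to_tokens(ids)
    # Any piece missing from the checkpoint's vocabulary comes back as '<unk>'.
    print(tokens, ids, back, sep="\n")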
Code example #2

A data-preparation step: it makes sure a SentencePiece vocab/model pair exists, then tokenizes every corpus text file into whitespace-joined pieces for pretraining.

    def run(self):
        """
        CORPUS/xxx.txt, DATA/xxx.vocab, DATA/xxx.model -> DATA/xxx.txt

        Ensure the SentencePiece vocab/model exist, then write one
        tokenized output file per corpus text file.
        """
        vocab_file = os.path.join(ACE_ROOT,
                                  '%s.vocab' % self.config.model_prefix)
        model_file = os.path.join(ACE_ROOT,
                                  '%s.model' % self.config.model_prefix)

        self.__create_text()
        # Build the SentencePiece vocab/model only if they are not on disk yet.
        if not os.path.isfile(vocab_file) or not os.path.isfile(model_file):
            self.__create_vocab()

        tokenizer = AlbertTokenizer(vocab_file=vocab_file,
                                    model_file=model_file,
                                    do_lower_case=self.config.do_lower_case,
                                    remove_space=self.config.remove_space,
                                    keep_accents=self.config.keep_accents,
                                    bos_token=self.config.bos_token,
                                    eos_token=self.config.eos_token,
                                    unk_token=self.config.unk_token,
                                    sep_token=self.config.sep_token,
                                    pad_token=self.config.pad_token,
                                    cls_token=self.config.cls_token,
                                    mask_token=self.config.mask_token)
        for text_file in tqdm(FileUtil.file_list(self.config.corpus_dir),
                              desc='create pretraining data files'):
            if text_file.endswith('.txt'):
                data_file = os.path.join(self.config.data_dir,
                                         os.path.basename(text_file))
                with open(text_file, 'r', encoding='utf-8') as f, \
                        open(data_file, 'w', encoding='utf-8') as fw:
                    # Stream line by line rather than loading the whole
                    # file with f.read().splitlines().
                    for line in f:
                        tokens = tokenizer.tokenize(line.rstrip('\n'))
                        fw.write(' '.join(tokens) + '\n')
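
The __create_vocab step is not shown in this snippet. Under the usual ALBERT setup it would train a SentencePiece model over the corpus text; the sketch below is a hypothetical stand-in, assuming the sentencepiece package, with placeholder paths taken from the docstring ('xxx' stands for the configured model_prefix).

    import sentencepiece as spm

    # Hypothetical stand-in for self.__create_vocab(): trains a unigram
    # SentencePiece model and writes DATA/xxx.model and DATA/xxx.vocab.
    spm.SentencePieceTrainer.train(
        input='CORPUS/xxx.txt',   # assumed corpus text, one sentence per line
        model_prefix='DATA/xxx',  # prefix from self.config.model_prefix
        vocab_size=30000,         # the released ALBERT models use 30k pieces
        model_type='unigram',
    )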