Example #1
    def test_instantiate(self):
        bad = PreTokenizer.custom(TestCustomPreTokenizer.BadCustomPretok())
        good = PreTokenizer.custom(TestCustomPreTokenizer.GoodCustomPretok())

        assert isinstance(bad, PreTokenizer)
        assert isinstance(good, PreTokenizer)
        with pytest.raises(Exception, match="TypeError: pre_tokenize()"):
            bad.pre_tokenize_str("Hey there!")
        assert good.pre_tokenize_str("Hey there!") == [
            ("Hey there!", (0, 10)),
            ("Hey there!", (0, 10)),
        ]
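
The BadCustomPretok and GoodCustomPretok classes referenced by this test are not part of the excerpt. Judging from the assertions, they could look roughly like the hypothetical sketch below: the "good" one implements pre_tokenize(self, pretok) and delegates to pretok.split() with a callback returning a list of NormalizedString, while the "bad" one has a wrong pre_tokenize signature, which is what triggers the TypeError.
# Hypothetical reconstruction of the helper classes used by the test above.
from typing import List
from tokenizers import NormalizedString, PreTokenizedString


class BadCustomPretok:
    def pre_tokenize(self, pretok: PreTokenizedString, wrong):
        # Wrong signature: the extra positional argument makes the call
        # made by pre_tokenize_str() fail with a TypeError.
        pass


class GoodCustomPretok:
    def split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        # Returning the same NormalizedString twice matches the expected
        # [("Hey there!", (0, 10)), ("Hey there!", (0, 10))] output.
        return [normalized, normalized]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.split)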
Example #2
def load_custom_tokenizer(tokenizer_file: str) -> Tokenizer:
    """ Tokenizerのロード処理:tokenizer.json からTokenizerをロードし、custome PreTokenizerをセットする。
    """
    tokenizer = Tokenizer.from_file(tokenizer_file)
    # Overwrite the dummy-injected Rust-based PreTokenizer with the custom PreTokenizer.
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
    return tokenizer
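
MecabPreTokenizer itself is not included in this excerpt. A minimal sketch of what such a class could look like follows; it assumes the mecab-python3 package and uses a naive str.index scan to map MeCab's wakati output back to character spans, so the original implementation may well differ (e.g. by using a dedicated span-alignment helper).
from typing import List, Optional

import MeCab  # assumption: mecab-python3 is installed
from tokenizers import NormalizedString, PreTokenizedString


class MecabPreTokenizer:
    def __init__(self, mecab_dict_path: Optional[str] = None):
        option = f"-Owakati -d {mecab_dict_path}" if mecab_dict_path else "-Owakati"
        self.mecab = MeCab.Tagger(option)

    def tokenize(self, sequence: str) -> List[str]:
        return self.mecab.parse(sequence).strip().split(" ")

    def custom_split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        text = str(normalized)
        splits = []
        cursor = 0
        for token in self.tokenize(text):
            # Naive span recovery; assumes MeCab does not rewrite characters.
            start = text.index(token, cursor)
            end = start + len(token)
            splits.append(normalized[start:end])
            cursor = end
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.custom_split)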
Example #3
def train_custom_tokenizer(files: List[str], tokenizer_file: str,
                           **kwargs) -> BertWordPieceTokenizer:
    """ Tokenizerの学習・保存処理:custom PreTokenizer付きのTokenizerを学習・保存する。
    """

    tokenizer = BertWordPieceTokenizer(
        handle_chinese_chars=False,  # for Japanese
        strip_accents=False,  # for Japanese
    )
    tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(
        MecabPreTokenizer())

    # Learn the subword segmentation from the given set of corpus files
    tokenizer.train(files, **kwargs)

    # Save the tokenizer configuration JSON, which contains the preprocessing parameters in addition to the vocab
    # NOTE: a custom PreTokenizer written in Python cannot be serialized, so inject a dummy Rust-based PreTokenizer before serializing
    # The JSON records the dummy PreTokenizer, so the custom PreTokenizer must be re-attached at load time.
    tokenizer._tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.save(tokenizer_file)

    # (Optional) The .txt vocab file is saved as f"vocab-{filename}.txt" (in case an external process needs it)
    filename = "wordpiece"
    model_files = tokenizer._tokenizer.model.save(
        str(pathlib.Path(tokenizer_file).parent), filename)

    return tokenizer
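
A hypothetical end-to-end usage of the two functions above (the file names and training settings are placeholders): train with the custom pre-tokenizer, let the dummy BertPreTokenizer be serialized, then re-attach MecabPreTokenizer at load time.
# Hypothetical round trip; "corpus.txt" and the settings below are made up.
train_custom_tokenizer(
    ["corpus.txt"],
    "tokenizer.json",
    vocab_size=30000,
    min_frequency=1,
    limit_alphabet=1000,
)

# The saved JSON only records the dummy BertPreTokenizer, so the custom
# MecabPreTokenizer has to be injected again when loading.
tokenizer = load_custom_tokenizer("tokenizer.json")
print(tokenizer.encode("今日はいい天気です。").tokens)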
Example #4
def custom_tokenizer_from_pretrained(
        tokenizer_file_or_name: str,
        cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load BertWordPieceTokenizer from tokenizer.json.
    This is necessary for the following reasons:
    - BertWordPieceTokenizer cannot load tokenizer.json via its .from_file() method.
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPreTokenizer is not a valid native PreTokenizer.
    """

    if os.path.exists(tokenizer_file_or_name):
        if not os.path.isdir(tokenizer_file_or_name):
            tokenizer_dir = os.path.dirname(tokenizer_file_or_name)
            pt_tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_dir,
                cache_dir=cache_dir,
            )

            # This is necessary for pt_tokenizer.save_pretrained(save_path)
            _tokenizer = Tokenizer.from_file(tokenizer_file_or_name)
            _tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
            pt_tokenizer._tokenizer = _tokenizer
        else:
            pt_tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_file_or_name,
                cache_dir=cache_dir,
            )
    else:
        # trf>=4.0.0: PreTrainedTokenizerFast by default
        # NOTE: AutoTokenizer doesn't load PreTrainedTokenizerFast...
        pt_tokenizer = BertTokenizerFast.from_pretrained(
            tokenizer_file_or_name,
            cache_dir=cache_dir,
        )
    return pt_tokenizer
Example #5
def load_janome_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = Sequence([
        Whitespace(),
        PreTokenizer.custom(JanomePreTokenizer()),
    ])
    tokenizer.decoder = Decoder.custom(JanomeDecoder())
    return tokenizer
Example #6
    def test_camel_case(self):
        class CamelCasePretok:
            def get_state(self, c):
                if c.islower():
                    return "lower"
                elif c.isupper():
                    return "upper"
                elif c.isdigit():
                    return "digit"
                else:
                    return "rest"

            def split(self, n, normalized):
                i = 0
                # states = {"any", "lower", "upper", "digit", "rest"}
                state = "any"
                pieces = []
                for j, c in enumerate(normalized.normalized):
                    c_state = self.get_state(c)
                    if state == "any":
                        state = c_state
                    if state != "rest" and state == c_state:
                        pass
                    elif state == "upper" and c_state == "lower":
                        pass
                    else:
                        pieces.append(normalized[i:j])
                        i = j
                    state = c_state
                pieces.append(normalized[i:])
                return pieces

            def pre_tokenize(self, pretok):
                pretok.split(self.split)

        camel = PreTokenizer.custom(CamelCasePretok())

        assert camel.pre_tokenize_str("HeyThere!?-ThisIsLife") == [
            ("Hey", (0, 3)),
            ("There", (3, 8)),
            ("!", (8, 9)),
            ("?", (9, 10)),
            ("-", (10, 11)),
            ("This", (11, 15)),
            ("Is", (15, 17)),
            ("Life", (17, 21)),
        ]
Example #7
def build_hf_tokenizer(kmer_length: int,
                       kmer_stride: int,
                       alphabet: str,
                       unk_token: str = "?") -> Tokenizer:
    """Build a full huggingface tokenizer from the inputs.

    Note:
        Same arguments taken as KmerPreTokenizer.

    """
    kmer_pre = KmerPreTokenizer(kmer_length=kmer_length,
                                kmer_stride=kmer_stride,
                                alphabet=alphabet,
                                unk_token=unk_token)
    tokenizer = Tokenizer(
        models.WordLevel(vocab=kmer_pre.kmer_tokenzier.vocab.stoi,
                         unk_token=unk_token))
    tokenizer.pre_tokenizer = PreTokenizer.custom(kmer_pre)
    tokenizer.decoder = ByteLevel()

    return tokenizer
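
A hypothetical usage of build_hf_tokenizer for a DNA alphabet. It assumes that KmerPreTokenizer (not shown here) emits overlapping k-mers according to kmer_length and kmer_stride.
# Hypothetical call; the alphabet and sequence are placeholders.
tokenizer = build_hf_tokenizer(kmer_length=3, kmer_stride=1, alphabet="ACGT")
encoding = tokenizer.encode("ACGTAC")
# Under the assumptions above this would yield 3-mers such as
# ["ACG", "CGT", "GTA", "TAC"].
print(encoding.tokens)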
Example #8
def load_pretrained_tokenizer(
        tokenizer_file: str,
        cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load BertWordPieceTokenizer from tokenizer.json.
    This is necessary for the following reasons:
    - BertWordPieceTokenizer cannot load tokenizer.json via its .from_file() method.
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPreTokenizer is not a valid native PreTokenizer.
    """
    tokenizer = Tokenizer.from_file(tokenizer_file)
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())

    tokenizer_dir = os.path.dirname(tokenizer_file)
    pt_tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
        tokenizer_dir,
        cache_dir=cache_dir,
    )

    # This is necessary for pt_tokenizer.save_pretrained(save_path)
    pt_tokenizer._tokenizer = tokenizer  # ._tokenizer

    return pt_tokenizer
Example #9
class CustomNormalizer:
    def normalize(self, normalized: NormalizedString):
        # Most of these can be replaced by a `Sequence` combining some provided Normalizers,
        # (i.e. Sequence([NFKC(), Replace(Regex(r"\s+"), " "), Lowercase()]))
        # and that should be the preferred way. That being said, here is an example of the
        # kind of things that can be done here:
        normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        normalized.replace(Regex(r"\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input = "永和服装饰品有限公司"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input = "112233"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('1', (0, 1)), ('122', (1, 4)), ('3', (4, 5)), ('3', (5, 6))]

input = "1234 ℌ𝔢𝔩𝔩𝔬    𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣    𝕗𝕣𝕚𝕖𝕟𝕕!"
print("Normalize:", input)
print(tok.normalizer.normalize_str(input))
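
JiebaPreTokenizer, CustomDecoder, and the imports used above are not included in this excerpt. A sketch consistent with the printed outputs could look like the following: jieba_split maps jieba's (word, start, stop) tuples back onto the NormalizedString, and a second odd_number_split pass cuts before every odd digit, which is what produces the [('1', (0, 1)), ('122', (1, 4)), ...] result for the 112233 input.
# Sketch of the custom components referenced above (reconstructed, hedged).
from typing import List

import jieba
from tokenizers import NormalizedString, PreTokenizedString


class JiebaPreTokenizer:
    def jieba_split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        # jieba.tokenize yields (word, start, stop) offsets over the raw string.
        return [normalized[start:stop]
                for _, start, stop in jieba.tokenize(str(normalized))]

    def odd_number_split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        # Cut right before every odd digit; empty pieces are dropped by the
        # PreTokenizedString, which yields the "112233" output shown above.
        splits = []
        last = 0
        for idx, char in enumerate(str(normalized)):
            if char.isnumeric() and int(char) % 2 == 1:
                splits.append(normalized[last:idx])
                last = idx
        splits.append(normalized[last:])
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        # split() can be called several times to chain splitting passes.
        pretok.split(self.jieba_split)
        pretok.split(self.odd_number_split)


class CustomDecoder:
    def decode(self, tokens: List[str]) -> str:
        # NOTE: recent tokenizers versions expect `decode_chain(tokens) -> List[str]`
        # instead of `decode` for custom decoders.
        return "".join(tokens)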
Example #10
def load_jieba_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
    tokenizer.decoder = Decoder.custom(JiebaDecoder())
    return tokenizer
Example #11
    def __setstate__(self, d):
        self.__dict__ = d
        vocab = self.__dict__["_tokenizer"].get_vocab()
        self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(
            JiebaPreTokenizer(vocab))
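
This __setstate__ restores the unpicklable custom pre-tokenizer after unpickling. A plausible (assumed) counterpart is a __getstate__ that swaps in a serializable native pre-tokenizer before pickling, e.g. BertPreTokenizer from tokenizers.pre_tokenizers:
    def __getstate__(self):
        state = self.__dict__.copy()
        # A Python-side custom PreTokenizer cannot be pickled, so swap in a
        # serializable native one; __setstate__ above re-attaches the custom
        # JiebaPreTokenizer. NOTE: the copy is shallow, so this also touches
        # the live tokenizer object.
        state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
        return state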