Example #1
    def normalize_tokenizations(cls, tokenizer, space_tokenization,
                                target_tokenization):
        """See tokenization_normalization.py for details"""
        # Lower-case the whitespace tokenization to match the uncased target tokenizer.
        space_tokenization = [token.lower() for token in space_tokenization]
        # bow_tag_tokens and process_sentencepiece_tokens are defined in
        # tokenization_normalization.py (see the docstring above).
        modified_space_tokenization = bow_tag_tokens(space_tokenization)
        modified_target_tokenization = process_sentencepiece_tokens(
            target_tokenization)

        return modified_space_tokenization, modified_target_tokenization
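This variant appears to target SentencePiece tokenizers, where word-initial pieces carry a leading "▁" marker. The helpers bow_tag_tokens and process_sentencepiece_tokens live in tokenization_normalization.py and are not shown here; the sketch below uses hypothetical stand-ins (the "<w>" tag and the exact marker handling are assumptions) to illustrate how both tokenizations end up in a comparable form.

    # Hypothetical stand-ins with assumed behavior, for illustration only.
    def bow_tag_tokens(tokens, bow_tag="<w>"):
        """Prepend a beginning-of-word tag to every whitespace-delimited token."""
        return [bow_tag + t for t in tokens]

    def process_sentencepiece_tokens(tokens, bow_tag="<w>"):
        """Replace the SentencePiece word-start marker (U+2581) with the same tag."""
        return [bow_tag + t[1:] if t.startswith("▁") else t for t in tokens]

    space_tokenization = ["Good", "morning"]
    target_tokenization = ["▁good", "▁morn", "ing"]
    print(bow_tag_tokens([t.lower() for t in space_tokenization]))
    # ['<w>good', '<w>morning']
    print(process_sentencepiece_tokens(target_tokenization))
    # ['<w>good', '<w>morn', 'ing']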
Example #2
    def normalize_tokenizations(cls, tokenizer, space_tokenization,
                                target_tokenization):
        """See tokenization_normalization.py for details"""
        modified_space_tokenization = bow_tag_tokens(space_tokenization)
        # The first target token never carries the byte-level BPE whitespace
        # marker, so prepend "Ġ" before normalizing the target tokenization.
        modified_target_tokenization = (["Ġ" + target_tokenization[0]]
                                        + target_tokenization[1:])
        modified_target_tokenization = process_bytebpe_tokens(
            modified_target_tokenization)

        return modified_space_tokenization, modified_target_tokenization
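This variant looks like the byte-level BPE case (GPT-2/RoBERTa-style vocabularies), where a leading "Ġ" marks tokens preceded by whitespace. process_bytebpe_tokens is again defined in tokenization_normalization.py; the stand-in below assumes it rewrites that marker into the same beginning-of-word tag used for the whitespace tokenization.

    # Hypothetical stand-in with assumed behavior, for illustration only.
    def process_bytebpe_tokens(tokens, bow_tag="<w>"):
        """Replace the byte-level BPE whitespace marker (U+0120) with a tag."""
        return [bow_tag + t[1:] if t.startswith("Ġ") else t for t in tokens]

    target_tokenization = ["Good", "Ġmorning", "Ġeveryone"]
    # Tag the first token so every word start is marked consistently.
    padded = ["Ġ" + target_tokenization[0]] + target_tokenization[1:]
    print(process_bytebpe_tokens(padded))
    # ['<w>Good', '<w>morning', '<w>everyone']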
Example #3
    def normalize_tokenizations(cls, tokenizer, space_tokenization,
                                target_tokenization):
        """See tokenization_normalization.py for details"""
        if tokenizer.init_kwargs.get("do_lower_case", False):
            # Only lower-case the whitespace tokens when the tokenizer is uncased.
            space_tokenization = [
                token.lower() for token in space_tokenization
            ]
        modified_space_tokenization = bow_tag_tokens(space_tokenization)
        modified_target_tokenization = process_wordpiece_tokens(
            target_tokenization)

        return modified_space_tokenization, modified_target_tokenization
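This variant matches WordPiece tokenizers (BERT-style), where continuation pieces are prefixed with "##" and lowercasing depends on the tokenizer's do_lower_case flag. A hypothetical stand-in for process_wordpiece_tokens, with assumed behavior, could look like:

    # Hypothetical stand-in with assumed behavior, for illustration only.
    def process_wordpiece_tokens(tokens, bow_tag="<w>"):
        """Strip the '##' continuation prefix and tag word-initial pieces."""
        return [t[2:] if t.startswith("##") else bow_tag + t for t in tokens]

    target_tokenization = ["good", "morn", "##ing"]
    print(process_wordpiece_tokens(target_tokenization))
    # ['<w>good', '<w>morn', 'ing']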