Example No. 1
def train_huggingface_bpetokenizers(
    data_params: DatasetParams, query_files: List[Path], lang_files: Dict[str, Path]
) -> Tuple[TokenizerRecordable, TokenizerRecordable]:
    logger.info(
        f"Building Query BPETokenizer from query_files {query_files} with do_lowercase:{data_params.do_lowercase} special_tokens:{data_params.special_tokens}"
    )
    query_tokenizer = BPETokenizer()
    query_tokenizer.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=data_params.do_lowercase)
    query_tokenizer.train(
        files=list(map(str, query_files)),
        vocab_size=data_params.vocab_size,
        special_tokens=data_params.special_tokens,
    )

    code_tokenizer = BPETokenizer()
    code_tokenizer.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=data_params.do_lowercase)
    code_tokenizer.train(
        files=list(map(str, lang_files.values())),
        vocab_size=data_params.vocab_size,
        special_tokens=data_params.special_tokens,
    )

    return (
        HuggingfaceBPETokenizerRecordable(query_tokenizer),
        HuggingfaceBPETokenizerRecordable(code_tokenizer),
    )
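
A minimal call sketch, assuming `train_huggingface_bpetokenizers` and a `DatasetParams` object exposing the `do_lowercase`, `vocab_size` and `special_tokens` fields read above are importable from the surrounding project; all paths and values below are illustrative only:

from pathlib import Path

# Illustrative only: DatasetParams is assumed to provide the three fields the
# trainer reads (do_lowercase, vocab_size, special_tokens); file paths are made up.
data_params = DatasetParams(do_lowercase=True, vocab_size=10000,
                            special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
query_files = [Path("data/queries.txt")]
lang_files = {"python": Path("data/python.txt"), "java": Path("data/java.txt")}

query_tok, code_tok = train_huggingface_bpetokenizers(data_params, query_files, lang_files)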
Example No. 2
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 add_special_tokens: bool = True,
                 unk_token: str = "[UNK]",
                 sep_token: str = "[SEP]",
                 cls_token: str = "[CLS]",
                 clean_text: bool = True,
                 handle_chinese_chars: bool = True,
                 strip_accents: bool = True,
                 lowercase: bool = True,
                 wordpieces_prefix: str = "##"):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece.from_files(vocab_file, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(WordPiece.empty())

        tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
        tokenizer.normalizer = BertNormalizer.new(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase)
        tokenizer.pre_tokenizer = BertPreTokenizer.new()

        if add_special_tokens and vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing.new(
                (sep_token, sep_token_id), (cls_token, cls_token_id))
        # The Tokenizer attribute is `decoder` (singular); the WordPiece decoder
        # re-joins "##"-prefixed sub-words when decoding.
        tokenizer.decoder = decoders.WordPiece.new(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "add_special_tokens": add_special_tokens,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
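
A hedged usage sketch, assuming the `__init__` above belongs to a `BertWordPieceTokenizer`-style class whose base class exposes `encode`, and that a BERT vocabulary file is available locally (the class name, file name and printed tokens are assumptions, not taken from this excerpt):

# Illustrative only: class name, vocab path and expected output are assumptions.
tokenizer = BertWordPieceTokenizer(vocab_file="bert-base-uncased-vocab.txt",
                                   lowercase=True)
encoding = tokenizer.encode("Hello, world!")
print(encoding.tokens)  # roughly: ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']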
Example No. 3
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel.new()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
    tok_r.decoder = decoders.WordPiece.new()
    tok_r.post_processor = BertProcessing.new(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    )
else:
    raise Exception(f"Unknown type {args.type}")


def tokenize_r():
    return tok_r.encode_batch(text)
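
A small timing sketch for the function above, assuming `text` is the list of input lines loaded earlier in the script (that loading code is elided from this excerpt):

import timeit

# Illustrative only: times one encode_batch pass of the Rust tokenizer over `text`.
elapsed = timeit.timeit(tokenize_r, number=1)
print(f"encode_batch over {len(text)} lines took {elapsed:.3f}s")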