from pathlib import Path
from typing import Dict, List, Tuple

# DatasetParams, TokenizerRecordable, HuggingfaceBPETokenizerRecordable,
# logger, BPETokenizer and BertNormalizer are defined/imported elsewhere
# in this module.


def train_huggingface_bpetokenizers(
    data_params: DatasetParams, query_files: List[Path], lang_files: Dict[str, Path]
) -> Tuple[TokenizerRecordable, TokenizerRecordable]:
    logger.info(
        f"Building Query BPETokenizer from query_files {query_files} "
        f"with do_lowercase:{data_params.do_lowercase} special_tokens:{data_params.special_tokens}"
    )
    # Train a BPE tokenizer on the natural-language query corpus.
    query_tokenizer = BPETokenizer()
    query_tokenizer.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=data_params.do_lowercase,
    )
    query_tokenizer.train(
        files=list(map(str, query_files)),
        vocab_size=data_params.vocab_size,
        special_tokens=data_params.special_tokens,
    )

    logger.info(f"Building Code BPETokenizer from lang_files {lang_files}")
    # Train a second, independent BPE tokenizer on the per-language code corpora.
    code_tokenizer = BPETokenizer()
    code_tokenizer.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=data_params.do_lowercase,
    )
    code_tokenizer.train(
        files=list(map(str, lang_files.values())),
        vocab_size=data_params.vocab_size,
        special_tokens=data_params.special_tokens,
    )

    return (
        HuggingfaceBPETokenizerRecordable(query_tokenizer),
        HuggingfaceBPETokenizerRecordable(code_tokenizer),
    )
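# --- Hedged usage sketch (illustrative, not from the repo) -----------------
# One way the trainer above might be invoked. The DatasetParams field values
# and the file paths are assumptions; the real dataclass may require
# additional fields beyond the three the function body reads.
params = DatasetParams(
    do_lowercase=True,
    vocab_size=10000,
    special_tokens=["<unk>", "<pad>"],
)
query_tok, code_tok = train_huggingface_bpetokenizers(
    data_params=params,
    query_files=[Path("data/queries.txt")],
    lang_files={"python": Path("data/python_functions.txt")},
)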
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        add_special_tokens: bool = True,
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        # Load an existing WordPiece vocabulary, or start from an empty model.
        if vocab_file is not None:
            tokenizer = Tokenizer(WordPiece.from_files(vocab_file, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(WordPiece.empty())

        tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
        tokenizer.normalizer = BertNormalizer.new(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer.new()

        # The BERT post-processor needs the ids of [SEP] and [CLS], so it can
        # only be attached when a vocabulary was provided.
        if add_special_tokens and vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing.new(
                (sep_token, sep_token_id), (cls_token, cls_token_id)
            )
        # Attach the WordPiece decoder so encoded ids round-trip back to text
        # (the attribute on Tokenizer is `decoder`, singular).
        tokenizer.decoder = decoders.WordPiece.new(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "add_special_tokens": add_special_tokens,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }
        super().__init__(tokenizer, parameters)
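# --- Hedged usage sketch (illustrative, not from the library) --------------
# Assumes the __init__ above belongs to a BertWordPieceTokenizer-style class
# whose BaseTokenizer parent exposes encode(); the vocab path is a
# placeholder.
bert_tokenizer = BertWordPieceTokenizer(vocab_file="bert-base-uncased-vocab.txt")
encoding = bert_tokenizer.encode("Hello, world!")
print(encoding.tokens)  # expected shape: ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']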
if args.type == "gpt2":
    # Byte-level BPE (vocab + merges), as used by GPT-2.
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel.new()
elif args.type == "bert":
    print("Running Bert tokenizer")
    # Python baseline tokenizer (transformers) to compare against.
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)
    )
    tok_r.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
    tok_r.decoder = decoders.WordPiece.new()
    tok_r.post_processor = BertProcessing.new(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    )
else:
    raise Exception(f"Unknown type {args.type}")


def tokenize_r():
    return tok_r.encode_batch(text)
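# --- Hedged benchmark sketch (illustrative) ---------------------------------
# tokenize_r() above batch-encodes `text` with the Rust tokenizer; a Python
# counterpart (a hypothetical tokenize_p wrapping tok_p) is typically timed
# against it along these lines:
import time

start = time.time()
encodings = tokenize_r()
print(f"Rust tokenizer: {time.time() - start:.3f}s for {len(encodings)} sequences")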