def align_bpe(text: Text, bpe_tokenizer: Tokenizer) -> Tuple[TokenAligner, List[Text]]:
    """Alignment fn for BPE tokenizer, used in GPT and XLM."""
    # The BPE vocabulary here is lowercased, so lowercase the source tokens
    # for better matching.
    eow_tokens = space_tokenize_with_eow(text.lower())
    bpe_tokens = bpe_tokenizer.tokenize(text)
    ta = TokenAligner(eow_tokens, bpe_tokens)
    return ta, bpe_tokens
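# For reference, a minimal sketch of the space-tokenization helpers assumed
# above (the names come from this module; the exact implementations may
# differ). GPT/XLM-style BPE marks word *ends* with </w>, so the source-side
# tokens carry the same marker; byte-level BPE, SentencePiece, and WordPiece
# mark word *starts*, so those aligners use a <w> prefix instead.
def space_tokenize_with_eow(sentence: Text) -> List[Text]:
    """Whitespace-tokenize, appending an end-of-word marker to each token."""
    return [t + "</w>" for t in sentence.split()]


def space_tokenize_with_bow(sentence: Text) -> List[Text]:
    """Whitespace-tokenize, prepending a beginning-of-word marker to each token."""
    return ["<w>" + t for t in sentence.split()]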
def align_bytebpe(
    text: Text, bytebpe_tokenizer: Tokenizer
) -> Tuple[TokenAligner, List[Text]]:
    """Alignment fn for byte-level BPE tokenizer, used in GPT-2 and RoBERTa."""
    bow_tokens = space_tokenize_with_bow(text)
    bytebpe_tokens = bytebpe_tokenizer.tokenize(text)
    # Align on <w>-marked tokens for stability w.r.t. word boundaries, but
    # return the unmodified tokens.
    modified_bytebpe_tokens = list(map(process_bytebpe_for_alignment, bytebpe_tokens))
    ta = TokenAligner(bow_tokens, modified_bytebpe_tokens)
    return ta, bytebpe_tokens
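# A minimal sketch of the byte-level BPE normalizer used above, under the
# assumption that it follows the GPT-2 convention: word-initial pieces are
# prefixed with "Ġ" (the byte-level encoding of a leading space). Rewriting
# that prefix to the same <w> marker used on the source side lets TokenAligner
# line up word boundaries; the real helper may handle additional cases.
def process_bytebpe_for_alignment(t: Text) -> Text:
    """Rewrite a byte-level BPE token so word starts carry a <w> marker."""
    if t.startswith("Ġ"):
        return "<w>" + t[len("Ġ"):]
    return t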
def align_sentencepiece(
    text: Text, sentencepiece_tokenizer: Tokenizer
) -> Tuple[TokenAligner, List[Text]]:
    """Alignment fn for SentencePiece tokenizer, used in XLNet."""
    bow_tokens = space_tokenize_with_bow(text)
    sentencepiece_tokens = sentencepiece_tokenizer.tokenize(text)
    # As with byte-level BPE, align on <w>-marked tokens but return the originals.
    modified_sentencepiece_tokens = list(
        map(process_sentencepiece_for_alignment, sentencepiece_tokens)
    )
    ta = TokenAligner(bow_tokens, modified_sentencepiece_tokens)
    return ta, sentencepiece_tokens
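# The analogous sketch for SentencePiece, assuming the standard convention:
# word-initial pieces are marked with "▁" (U+2581), which stands in for a
# leading space. As above, the prefix is rewritten to <w>; assumed behavior only.
def process_sentencepiece_for_alignment(t: Text) -> Text:
    """Rewrite a SentencePiece token so word starts carry a <w> marker."""
    if t.startswith("▁"):
        return "<w>" + t[len("▁"):]
    return t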
def align_wpm(
    text: Text, wpm_tokenizer: Tokenizer, do_lower_case: bool
) -> Tuple[TokenAligner, List[Text]]:
    """Alignment fn for WPM tokenizer, used in BERT."""
    # If using lowercase, do this for the source tokens for better matching.
    bow_tokens = space_tokenize_with_bow(text.lower() if do_lower_case else text)
    wpm_tokens = wpm_tokenizer.tokenize(text)
    # Align using <w> markers for stability w.r.t. word boundaries.
    modified_wpm_tokens = list(map(process_wordpiece_for_alignment, wpm_tokens))
    ta = TokenAligner(bow_tokens, modified_wpm_tokens)
    return ta, wpm_tokens
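# Sketch of the WordPiece normalizer assumed above. WordPiece inverts the
# marking scheme: *continuation* pieces carry "##" and word-initial pieces are
# unmarked, so we strip "##" from continuations and prepend <w> to the rest.
# Assumed behavior, matching the standard BERT convention.
def process_wordpiece_for_alignment(t: Text) -> Text:
    """Rewrite a WordPiece token so word starts carry a <w> marker."""
    if t.startswith("##"):
        return t[len("##"):]
    return "<w>" + t

# Usage sketch, assuming a HuggingFace-style tokenizer object exposing
# .tokenize() (e.g. transformers.BertTokenizer):
#   ta, wpm_tokens = align_wpm("Hello world", bert_tokenizer, do_lower_case=True)
# The returned TokenAligner can then map source-token indices/spans onto the
# corresponding wordpiece indices/spans.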