# NOTE: retokenize, utils, and openai_utils are repo-internal modules,
# assumed to be imported alongside the helpers used below.
from typing import List, Text, Tuple


def align_bert(text: Text, model_name: str) -> Tuple[retokenize.TokenAligner, List[Text]]:
    # If using lowercase, do this for the source tokens for better matching.
    do_lower_case = model_name.endswith('uncased')
    bow_tokens = space_tokenize_with_bow(text.lower() if do_lower_case else text)
    bert_tokenizer = _get_bert_tokenizer(model_name, do_lower_case)
    wpm_tokens = bert_tokenizer.tokenize(text)
    # Align using <w> markers for stability w.r.t. word boundaries.
    modified_wpm_tokens = list(map(process_bert_wordpiece_for_alignment, wpm_tokens))
    ta = retokenize.TokenAligner(bow_tokens, modified_wpm_tokens)
    return ta, wpm_tokens
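

# Hedged sketch of the two helpers assumed by align_bert but not defined in
# this section: space_tokenize_with_bow marks each whitespace token with a
# <w> begin-of-word marker, and process_bert_wordpiece_for_alignment rewrites
# BERT wordpieces into the same convention, so TokenAligner sees matching
# word boundaries on both sides. Exact behavior is an assumption based on
# how they are called above.
def space_tokenize_with_bow(sentence: Text) -> List[Text]:
    """Add <w> markers to whitespace tokens for word-boundary alignment."""
    return ["<w>" + t for t in sentence.split()]


def process_bert_wordpiece_for_alignment(t: Text) -> Text:
    """Map a BERT wordpiece into the <w>-marker convention."""
    # Continuation pieces ('##xx') drop the prefix; word-initial pieces
    # gain an explicit <w> marker.
    if t.startswith("##"):
        return t[len("##"):]
    return "<w>" + t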


def retokenize_record(record):
    """Retokenize edge probing examples. Modifies in-place.

    This can be slow, so recommended to use as a pre-processing step.
    See retokenize_edge_data.py.
    """
    text = record['text']
    moses_tokens = utils.TOKENIZER.tokenize(text)
    cleaned_moses_tokens = utils.unescape_moses(moses_tokens)
    ta = retokenize.TokenAligner(text, cleaned_moses_tokens)
    record['text'] = " ".join(moses_tokens)
    for target in record['targets']:
        if 'span1' in target:
            target['span1'] = list(map(int, ta.project_span(*target['span1'])))
        if 'span2' in target:
            target['span2'] = list(map(int, ta.project_span(*target['span2'])))
    return record
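

# Minimal usage sketch (hypothetical record): edge-probing records carry a
# raw 'text' field plus 'targets' whose 'span1'/'span2' are [start, end)
# indices over whitespace tokens of 'text'. After the call, both the text
# and the spans refer to Moses tokens instead.
def _demo_retokenize_record():
    record = {
        'text': "Don't tell anyone.",
        'targets': [{'span1': [0, 1], 'label': 'X'}],
    }
    retokenize_record(record)  # modifies record in place
    print(record['text'])                 # Moses-tokenized text, space-joined
    print(record['targets'][0]['span1'])  # span projected onto Moses tokens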


# OpenAI BPE variant of retokenize_record (same name as the Moses version
# above; presumably from a separate preprocessing script in the repo).
def retokenize_record(record):
    """Retokenize edge probing examples. Modifies in-place.

    This can be slow, so recommended to use as a pre-processing step.
    See retokenize_edge_data.py.
    """
    text = record['text']
    eow_tokens = space_tokenize_with_eow(text)
    bpe_tokens = openai_utils.tokenize(text)
    ta = retokenize.TokenAligner(eow_tokens, bpe_tokens)
    record['text'] = " ".join(bpe_tokens)
    for target in record['targets']:
        if 'span1' in target:
            target['span1'] = list(map(int, ta.project_span(*target['span1'])))
        if 'span2' in target:
            target['span2'] = list(map(int, ta.project_span(*target['span2'])))
    return record
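

# Hedged sketch of space_tokenize_with_eow, which the BPE variant relies on
# but which is not defined in this section: the OpenAI GPT BPE vocabulary
# marks word endings with '</w>', so appending the same marker to each
# whitespace token gives TokenAligner matching word boundaries. The exact
# implementation is an assumption based on the call sites above and below.
def space_tokenize_with_eow(sentence: Text) -> List[Text]:
    """Add </w> markers to whitespace tokens for word-boundary alignment."""
    return [t + "</w>" for t in sentence.split()]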


def align_openai(text: Text) -> Tuple[retokenize.TokenAligner, List[Text]]:
    eow_tokens = space_tokenize_with_eow(text)
    bpe_tokens = openai_utils.tokenize(text)
    ta = retokenize.TokenAligner(eow_tokens, bpe_tokens)
    return ta, bpe_tokens
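

# Usage sketch for align_openai: project a whitespace-token span onto the
# BPE tokens. The example sentence and span are illustrative only.
def _demo_align_openai():
    ta, bpe_tokens = align_openai("A sample sentence for alignment.")
    start, end = ta.project_span(1, 2)  # the token "sample"
    print(bpe_tokens[start:end])        # BPE pieces covering "sample"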


def align_moses(text: Text) -> Tuple[retokenize.TokenAligner, List[Text]]:
    # utils.TOKENIZER is assumed to be a MosesTokenizer instance, as in
    # retokenize_record above.
    moses_tokens = utils.TOKENIZER.tokenize(text)
    cleaned_moses_tokens = utils.unescape_moses(moses_tokens)
    ta = retokenize.TokenAligner(text, cleaned_moses_tokens)
    return ta, moses_tokens
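

# Usage sketch for align_moses: here TokenAligner is built from the raw text
# (which it is assumed to whitespace-tokenize internally) against the
# unescaped Moses tokens, so spans over whitespace tokens project onto Moses
# token indices. The printed output is illustrative.
def _demo_align_moses():
    ta, moses_tokens = align_moses("Don't tell anyone.")
    start, end = ta.project_span(0, 1)  # the token "Don't"
    print(moses_tokens[start:end])      # e.g. ['Don', '&apos;t']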