import sacrebleu  # assumes sacrebleu 1.x, which exposes tokenizers at module level


def tokenize_python_string(s):
    """Tokenize a Python string literal, preserving spaces and escape sequences."""
    # Encode every literal "a" as "aa" and every space as " ab " so the
    # international tokenizer cannot conflate real content with the markers,
    # then collapse the doubled spaces this introduces.
    s = s.replace("a", "aa")
    s = s.replace(" ", " ab ")
    s = s.replace("  ", " ")
    s = sacrebleu.tokenize_v14_international(s)  # "ab" -> " ", "aa" -> "a", "\ x" -> "\x"
    pseudo_tokens = s.split(" ")
    tokens = []
    escaped = False
    for t in pseudo_tokens:
        if t == "":
            continue
        # Decode the markers: "ab" is a protected space, "aa" is a literal "a".
        # `SpecialToken` is assumed to come from the surrounding module.
        real_t = SpecialToken.STR_SPACE if t == "ab" else t.replace("aa", "a")
        if escaped:
            # Re-attach a backslash that the tokenizer split off its escapee.
            tokens.append("\\" + real_t)
            escaped = False
        elif real_t == "\\":
            escaped = True
        else:
            tokens.append(real_t)
            escaped = False
    return tokens
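
# A minimal usage sketch, assuming sacrebleu 1.x is installed. `SpecialToken`
# normally ships with the surrounding module, so a hypothetical stand-in with
# a placeholder value is defined here purely for illustration.
class SpecialToken:
    STR_SPACE = "<STR_SPACE>"  # hypothetical placeholder value


print(tokenize_python_string("a\\nb c"))
# Expect something like: ['a', '\\n', 'b', '<STR_SPACE>', 'c']
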
import sacrebleu   # assumes sacrebleu 1.x (module-level tokenizer functions)
import sacremoses


def normalize(sentence, lowercase: bool = True, tokenizer: str = '13a',
              return_str: bool = True):
    """Optionally lowercase a sentence and tokenize it with the chosen tokenizer."""
    if lowercase:
        sentence = sentence.lower()
    if tokenizer == "13a":
        normalized_sent = sacrebleu.tokenize_13a(sentence)
    elif tokenizer == "intl":
        normalized_sent = sacrebleu.tokenize_v14_international(sentence)
    elif tokenizer == "moses":
        normalized_sent = sacremoses.MosesTokenizer().tokenize(
            sentence, return_str=True, escape=False)
    elif tokenizer == "penn":
        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(
            sentence, return_str=True)
    else:
        # Unknown tokenizer names fall through untokenized.
        normalized_sent = sentence
    if not return_str:
        normalized_sent = normalized_sent.split()
    return normalized_sent
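
# A minimal usage sketch (assumes sacrebleu 1.x and sacremoses are installed).
print(normalize("Hello, World!"))
# Expect something like: "hello , world !"
print(normalize("Hello, World!", tokenizer="intl", return_str=False))
# Expect a list of lowercased tokens instead of a string.
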
from sacrebleu import tokenize_13a, tokenize_v14_international, tokenize_zh
# (sacrebleu 1.x exposes these at module level; `VizSeqTokenization` is an enum
# and `_tokenize_by_char` a helper from the surrounding vizseq module. The
# `cls` parameter suggests this was lifted from a classmethod on a scorer.)


def tokenize_line(cls, line: str, tokenization: VizSeqTokenization) -> str:
    if tokenization == VizSeqTokenization.none:
        return line
    elif tokenization == VizSeqTokenization.mteval_13a:
        return tokenize_13a(line)
    elif tokenization == VizSeqTokenization.mteval_v14_international:
        return tokenize_v14_international(line)
    elif tokenization == VizSeqTokenization.zh:
        return tokenize_zh(line)
    elif tokenization == VizSeqTokenization.char:
        return _tokenize_by_char(line)
    else:
        raise ValueError(f'Unknown tokenization {tokenization.name}')
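
# Dispatch sketch with a hypothetical stand-in enum; the real VizSeqTokenization
# ships with vizseq. Member names mirror those referenced above, the values are
# illustrative only. `cls` is unused, so None suffices for a direct call.
from enum import Enum


class VizSeqTokenization(Enum):
    none = "none"
    mteval_13a = "13a"
    mteval_v14_international = "intl"
    zh = "zh"
    char = "char"


print(tokenize_line(None, "Hello, world!", VizSeqTokenization.mteval_13a))
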
import re

from sacrebleu import tokenize_v14_international  # sacrebleu 1.x; the
# surrounding repo may ship its own copy of this tokenizer instead


def process_string(tok, char2tok, tok2char, is_comment):
    """Normalize a string/comment token for BLEU-style code tokenization."""
    if is_comment:
        tok = re.sub(' +', ' ', tok)
        # Cap runs of five or more repeated characters at exactly five.
        tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok)
        if len(re.sub(r'\W', '', tok)) < 2:
            # Drop comments with fewer than two word characters.
            return ''
    # Protect spaces and special characters with sentinel tokens.
    tok = tok.replace(' ', ' SPACETOKEN ')
    for char, special_token in char2tok.items():
        tok = tok.replace(char, special_token)
    if tok.startswith(' STOKEN0'):
        if tok.endswith('\n'):
            tok = tok[:-1]
        tok += ' ENDCOM'
    tok = tok.replace('\n', ' STRNEWLINE ')
    tok = tok.replace('\t', ' TABSYMBOL ')
    tok = re.sub(' +', ' ', tok)
    tok = tokenize_v14_international(tok)
    tok = re.sub(' +', ' ', tok)
    # Restore the protected characters.
    for special_token, char in tok2char.items():
        tok = tok.replace(special_token, char)
    tok = tok.replace('\r', '')
    return tok
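
# Illustrative maps; the real char2tok/tok2char pairs live in the surrounding
# preprocessing module, so these sentinel assignments are hypothetical.
char2tok = {"'''": " STOKEN0 ", '"""': " STOKEN1 "}
tok2char = {v.strip(): k for k, v in char2tok.items()}

print(process_string("''' A short comment '''", char2tok, tok2char, is_comment=True))
# Expect the quotes restored and the spaces still held by SPACETOKEN sentinels,
# with ENDCOM appended, e.g. "''' SPACETOKEN A ... SPACETOKEN ''' ENDCOM".
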
import re

from sacrebleu import tokenize_v14_international  # sacrebleu 1.x, as above


def process_string(tok, char2tok, tok2char, is_comment, do_whole_processing=True):
    """Variant with a fast path that only escapes newlines when no processing is wanted."""
    if not (do_whole_processing or is_comment):
        return tok.replace("\n", "\\n").replace("\r", "")
    if is_comment:
        tok = re.sub(" +", " ", tok)
        # Cap runs of five or more repeated characters at exactly five.
        tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok)
        if len(re.sub(r"\W", "", tok)) < 2:
            return ""
    # `replace_general_string_tok` and `replace_tokens` come from the
    # surrounding module (see the sketches below).
    tok = replace_general_string_tok(tok)
    tok = replace_tokens(tok, char2tok)
    if tok.strip().startswith("STOKEN00"):
        if " STRNEWLINE " in tok:
            tok = tok.replace(" STRNEWLINE ", " ENDCOM", 1)
        else:
            tok += " ENDCOM"
    if not do_whole_processing:
        # Undo the sentinels without running the full tokenizer.
        tok = replace_tokens(
            tok, {f" {key} ": value for key, value in tok2char.items()})
        tok = (tok.replace(" ▁ ", " ")
                  .replace(" TABSYMBOL ", "\t")
                  .replace("\\r", "")
                  .replace(" STRNEWLINE ", "\\n"))
        return tok
    tok = re.sub(" +", " ", tok)
    tok = tokenize_v14_international(tok)
    tok = re.sub(" +", " ", tok)
    tok = tok.replace("\r", "")
    for special_token, char in tok2char.items():
        tok = tok.replace(special_token, char)
    if tok[0].isalpha():
        # For prefixed string literals (e.g. the wide string L"s"), remove the
        # space the tokenizer inserted after the prefix character.
        tok = tok.replace(f"{tok[0]} ", tok[0])
    return tok
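
# Hedged sketches of the two helpers this variant assumes; the real definitions
# live in the surrounding repo and may differ. They only mirror how the
# helpers are used above.
def replace_tokens(tok, mapping):
    # Apply each source -> replacement pair in order.
    for src, tgt in mapping.items():
        tok = tok.replace(src, tgt)
    return tok


def replace_general_string_tok(tok):
    # Protect whitespace with the sentinels this variant later undoes.
    return (tok.replace(" ", " ▁ ")
               .replace("\n", " STRNEWLINE ")
               .replace("\t", " TABSYMBOL "))
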
import re

from sacrebleu import tokenize_v14_international  # sacrebleu 1.x, as above


def process_string(tok, char2tok, tok2char, is_comment):
    """Like the first variant, but marks spaces with the "▁" sentinel instead of SPACETOKEN."""
    if is_comment:
        tok = re.sub(" +", " ", tok)
        tok = re.sub(r"(.)\1\1\1\1+", r"\1\1\1\1\1", tok)
        if len(re.sub(r"\W", "", tok)) < 2:
            return ""
    tok = tok.replace(" ", " ▁ ")
    for char, special_token in char2tok.items():
        tok = tok.replace(char, special_token)
    if tok.startswith(" STOKEN0"):
        if tok.endswith("\n"):
            tok = tok[:-1]
        tok += " ENDCOM"
    tok = tok.replace("\n", " STRNEWLINE ")
    tok = tok.replace("\t", " TABSYMBOL ")
    tok = re.sub(" +", " ", tok)
    tok = tokenize_v14_international(tok)
    tok = re.sub(" +", " ", tok)
    for special_token, char in tok2char.items():
        tok = tok.replace(special_token, char)
    tok = tok.replace("\r", "")
    return tok
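
# Same kind of illustrative maps as before (values hypothetical); a trailing
# newline on a docstring-style token exercises the ENDCOM branch above.
char2tok = {"'''": " STOKEN0 "}
tok2char = {"STOKEN0": "'''"}

print(process_string("''' Totally a comment '''\n", char2tok, tok2char, is_comment=True))
# Expect the trailing newline stripped, ENDCOM appended, and spaces held by "▁".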